1 //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run 10 // both before and after the DAG is legalized. 11 // 12 // This pass is not a substitute for the LLVM IR instcombine pass. This pass is 13 // primarily intended to handle simplification opportunities that are implicit 14 // in the LLVM IR and exposed by the various codegen lowering phases. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/ADT/APFloat.h" 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/DenseMap.h" 22 #include "llvm/ADT/IntervalMap.h" 23 #include "llvm/ADT/None.h" 24 #include "llvm/ADT/Optional.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SetVector.h" 27 #include "llvm/ADT/SmallPtrSet.h" 28 #include "llvm/ADT/SmallSet.h" 29 #include "llvm/ADT/SmallVector.h" 30 #include "llvm/ADT/Statistic.h" 31 #include "llvm/Analysis/AliasAnalysis.h" 32 #include "llvm/Analysis/MemoryLocation.h" 33 #include "llvm/Analysis/VectorUtils.h" 34 #include "llvm/CodeGen/DAGCombine.h" 35 #include "llvm/CodeGen/ISDOpcodes.h" 36 #include "llvm/CodeGen/MachineFrameInfo.h" 37 #include "llvm/CodeGen/MachineFunction.h" 38 #include "llvm/CodeGen/MachineMemOperand.h" 39 #include "llvm/CodeGen/RuntimeLibcalls.h" 40 #include "llvm/CodeGen/SelectionDAG.h" 41 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" 42 #include "llvm/CodeGen/SelectionDAGNodes.h" 43 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 44 #include "llvm/CodeGen/TargetLowering.h" 45 #include "llvm/CodeGen/TargetRegisterInfo.h" 46 #include 
"llvm/CodeGen/TargetSubtargetInfo.h" 47 #include "llvm/CodeGen/ValueTypes.h" 48 #include "llvm/IR/Attributes.h" 49 #include "llvm/IR/Constant.h" 50 #include "llvm/IR/DataLayout.h" 51 #include "llvm/IR/DerivedTypes.h" 52 #include "llvm/IR/Function.h" 53 #include "llvm/IR/LLVMContext.h" 54 #include "llvm/IR/Metadata.h" 55 #include "llvm/Support/Casting.h" 56 #include "llvm/Support/CodeGen.h" 57 #include "llvm/Support/CommandLine.h" 58 #include "llvm/Support/Compiler.h" 59 #include "llvm/Support/Debug.h" 60 #include "llvm/Support/ErrorHandling.h" 61 #include "llvm/Support/KnownBits.h" 62 #include "llvm/Support/MachineValueType.h" 63 #include "llvm/Support/MathExtras.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Target/TargetMachine.h" 66 #include "llvm/Target/TargetOptions.h" 67 #include <algorithm> 68 #include <cassert> 69 #include <cstdint> 70 #include <functional> 71 #include <iterator> 72 #include <string> 73 #include <tuple> 74 #include <utility> 75 76 using namespace llvm; 77 78 #define DEBUG_TYPE "dagcombine" 79 80 STATISTIC(NodesCombined , "Number of dag nodes combined"); 81 STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); 82 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); 83 STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); 84 STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); 85 STATISTIC(SlicedLoads, "Number of load sliced"); 86 STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops"); 87 88 static cl::opt<bool> 89 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, 90 cl::desc("Enable DAG combiner's use of IR alias analysis")); 91 92 static cl::opt<bool> 93 UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), 94 cl::desc("Enable DAG combiner's use of TBAA")); 95 96 #ifndef NDEBUG 97 static cl::opt<std::string> 98 CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, 99 cl::desc("Only use DAG-combiner alias analysis in this" 100 " 
function")); 101 #endif 102 103 /// Hidden option to stress test load slicing, i.e., when this option 104 /// is enabled, load slicing bypasses most of its profitability guards. 105 static cl::opt<bool> 106 StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, 107 cl::desc("Bypass the profitability model of load slicing"), 108 cl::init(false)); 109 110 static cl::opt<bool> 111 MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), 112 cl::desc("DAG combiner may split indexing from loads")); 113 114 static cl::opt<bool> 115 EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true), 116 cl::desc("DAG combiner enable merging multiple stores " 117 "into a wider store")); 118 119 static cl::opt<unsigned> TokenFactorInlineLimit( 120 "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048), 121 cl::desc("Limit the number of operands to inline for Token Factors")); 122 123 static cl::opt<unsigned> StoreMergeDependenceLimit( 124 "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10), 125 cl::desc("Limit the number of times for the same StoreNode and RootNode " 126 "to bail out in store merging dependence check")); 127 128 static cl::opt<bool> EnableReduceLoadOpStoreWidth( 129 "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true), 130 cl::desc("DAG cominber enable reducing the width of load/op/store " 131 "sequence")); 132 133 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore( 134 "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true), 135 cl::desc("DAG cominber enable load/<replace bytes>/store with " 136 "a narrower store")); 137 138 namespace { 139 140 class DAGCombiner { 141 SelectionDAG &DAG; 142 const TargetLowering &TLI; 143 const SelectionDAGTargetInfo *STI; 144 CombineLevel Level; 145 CodeGenOpt::Level OptLevel; 146 bool LegalDAG = false; 147 bool LegalOperations = false; 148 bool LegalTypes = false; 149 bool ForCodeSize; 150 bool DisableGenericCombines; 151 152 /// 
Worklist of all of the nodes that need to be simplified. 153 /// 154 /// This must behave as a stack -- new nodes to process are pushed onto the 155 /// back and when processing we pop off of the back. 156 /// 157 /// The worklist will not contain duplicates but may contain null entries 158 /// due to nodes being deleted from the underlying DAG. 159 SmallVector<SDNode *, 64> Worklist; 160 161 /// Mapping from an SDNode to its position on the worklist. 162 /// 163 /// This is used to find and remove nodes from the worklist (by nulling 164 /// them) when they are deleted from the underlying DAG. It relies on 165 /// stable indices of nodes within the worklist. 166 DenseMap<SDNode *, unsigned> WorklistMap; 167 /// This records all nodes attempted to add to the worklist since we 168 /// considered a new worklist entry. As we keep do not add duplicate nodes 169 /// in the worklist, this is different from the tail of the worklist. 170 SmallSetVector<SDNode *, 32> PruningList; 171 172 /// Set of nodes which have been combined (at least once). 173 /// 174 /// This is used to allow us to reliably add any operands of a DAG node 175 /// which have not yet been combined to the worklist. 176 SmallPtrSet<SDNode *, 32> CombinedNodes; 177 178 /// Map from candidate StoreNode to the pair of RootNode and count. 179 /// The count is used to track how many times we have seen the StoreNode 180 /// with the same RootNode bail out in dependence check. If we have seen 181 /// the bail out for the same pair many times over a limit, we won't 182 /// consider the StoreNode with the same RootNode as store merging 183 /// candidate again. 184 DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap; 185 186 // AA - Used for DAG load/store alias analysis. 187 AliasAnalysis *AA; 188 189 /// When an instruction is simplified, add all users of the instruction to 190 /// the work lists because they might get more simplified now. 
191 void AddUsersToWorklist(SDNode *N) { 192 for (SDNode *Node : N->uses()) 193 AddToWorklist(Node); 194 } 195 196 /// Convenient shorthand to add a node and all of its user to the worklist. 197 void AddToWorklistWithUsers(SDNode *N) { 198 AddUsersToWorklist(N); 199 AddToWorklist(N); 200 } 201 202 // Prune potentially dangling nodes. This is called after 203 // any visit to a node, but should also be called during a visit after any 204 // failed combine which may have created a DAG node. 205 void clearAddedDanglingWorklistEntries() { 206 // Check any nodes added to the worklist to see if they are prunable. 207 while (!PruningList.empty()) { 208 auto *N = PruningList.pop_back_val(); 209 if (N->use_empty()) 210 recursivelyDeleteUnusedNodes(N); 211 } 212 } 213 214 SDNode *getNextWorklistEntry() { 215 // Before we do any work, remove nodes that are not in use. 216 clearAddedDanglingWorklistEntries(); 217 SDNode *N = nullptr; 218 // The Worklist holds the SDNodes in order, but it may contain null 219 // entries. 220 while (!N && !Worklist.empty()) { 221 N = Worklist.pop_back_val(); 222 } 223 224 if (N) { 225 bool GoodWorklistEntry = WorklistMap.erase(N); 226 (void)GoodWorklistEntry; 227 assert(GoodWorklistEntry && 228 "Found a worklist entry without a corresponding map entry!"); 229 } 230 return N; 231 } 232 233 /// Call the node-specific routine that folds each particular type of node. 234 SDValue visit(SDNode *N); 235 236 public: 237 DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) 238 : DAG(D), TLI(D.getTargetLoweringInfo()), 239 STI(D.getSubtarget().getSelectionDAGInfo()), 240 Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { 241 ForCodeSize = DAG.shouldOptForSize(); 242 DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel); 243 244 MaximumLegalStoreInBits = 0; 245 // We use the minimum store size here, since that's all we can guarantee 246 // for the scalable vector types. 
247 for (MVT VT : MVT::all_valuetypes()) 248 if (EVT(VT).isSimple() && VT != MVT::Other && 249 TLI.isTypeLegal(EVT(VT)) && 250 VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits) 251 MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize(); 252 } 253 254 void ConsiderForPruning(SDNode *N) { 255 // Mark this for potential pruning. 256 PruningList.insert(N); 257 } 258 259 /// Add to the worklist making sure its instance is at the back (next to be 260 /// processed.) 261 void AddToWorklist(SDNode *N) { 262 assert(N->getOpcode() != ISD::DELETED_NODE && 263 "Deleted Node added to Worklist"); 264 265 // Skip handle nodes as they can't usefully be combined and confuse the 266 // zero-use deletion strategy. 267 if (N->getOpcode() == ISD::HANDLENODE) 268 return; 269 270 ConsiderForPruning(N); 271 272 if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) 273 Worklist.push_back(N); 274 } 275 276 /// Remove all instances of N from the worklist. 277 void removeFromWorklist(SDNode *N) { 278 CombinedNodes.erase(N); 279 PruningList.remove(N); 280 StoreRootCountMap.erase(N); 281 282 auto It = WorklistMap.find(N); 283 if (It == WorklistMap.end()) 284 return; // Not in the worklist. 285 286 // Null out the entry rather than erasing it to avoid a linear operation. 287 Worklist[It->second] = nullptr; 288 WorklistMap.erase(It); 289 } 290 291 void deleteAndRecombine(SDNode *N); 292 bool recursivelyDeleteUnusedNodes(SDNode *N); 293 294 /// Replaces all uses of the results of one DAG node with new values. 295 SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, 296 bool AddTo = true); 297 298 /// Replaces all uses of the results of one DAG node with new values. 299 SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { 300 return CombineTo(N, &Res, 1, AddTo); 301 } 302 303 /// Replaces all uses of the results of one DAG node with new values. 
304 SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, 305 bool AddTo = true) { 306 SDValue To[] = { Res0, Res1 }; 307 return CombineTo(N, To, 2, AddTo); 308 } 309 310 void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); 311 312 private: 313 unsigned MaximumLegalStoreInBits; 314 315 /// Check the specified integer node value to see if it can be simplified or 316 /// if things it uses can be simplified by bit propagation. 317 /// If so, return true. 318 bool SimplifyDemandedBits(SDValue Op) { 319 unsigned BitWidth = Op.getScalarValueSizeInBits(); 320 APInt DemandedBits = APInt::getAllOnesValue(BitWidth); 321 return SimplifyDemandedBits(Op, DemandedBits); 322 } 323 324 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) { 325 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); 326 KnownBits Known; 327 if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false)) 328 return false; 329 330 // Revisit the node. 331 AddToWorklist(Op.getNode()); 332 333 CommitTargetLoweringOpt(TLO); 334 return true; 335 } 336 337 /// Check the specified vector node value to see if it can be simplified or 338 /// if things it uses can be simplified as it only uses some of the 339 /// elements. If so, return true. 340 bool SimplifyDemandedVectorElts(SDValue Op) { 341 // TODO: For now just pretend it cannot be simplified. 
342 if (Op.getValueType().isScalableVector()) 343 return false; 344 345 unsigned NumElts = Op.getValueType().getVectorNumElements(); 346 APInt DemandedElts = APInt::getAllOnesValue(NumElts); 347 return SimplifyDemandedVectorElts(Op, DemandedElts); 348 } 349 350 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, 351 const APInt &DemandedElts, 352 bool AssumeSingleUse = false); 353 bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, 354 bool AssumeSingleUse = false); 355 356 bool CombineToPreIndexedLoadStore(SDNode *N); 357 bool CombineToPostIndexedLoadStore(SDNode *N); 358 SDValue SplitIndexingFromLoad(LoadSDNode *LD); 359 bool SliceUpLoad(SDNode *N); 360 361 // Scalars have size 0 to distinguish from singleton vectors. 362 SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); 363 bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); 364 bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val); 365 366 /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed 367 /// load. 368 /// 369 /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. 370 /// \param InVecVT type of the input vector to EVE with bitcasts resolved. 371 /// \param EltNo index of the vector element to load. 372 /// \param OriginalLoad load that EVE came from to be replaced. 373 /// \returns EVE on success SDValue() on failure. 374 SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, 375 SDValue EltNo, 376 LoadSDNode *OriginalLoad); 377 void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); 378 SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); 379 SDValue SExtPromoteOperand(SDValue Op, EVT PVT); 380 SDValue ZExtPromoteOperand(SDValue Op, EVT PVT); 381 SDValue PromoteIntBinOp(SDValue Op); 382 SDValue PromoteIntShiftOp(SDValue Op); 383 SDValue PromoteExtend(SDValue Op); 384 bool PromoteLoad(SDValue Op); 385 386 /// Call the node-specific routine that knows how to fold each 387 /// particular type of node. 
If that doesn't do anything, try the 388 /// target-specific DAG combines. 389 SDValue combine(SDNode *N); 390 391 // Visitation implementation - Implement dag node combining for different 392 // node types. The semantics are as follows: 393 // Return Value: 394 // SDValue.getNode() == 0 - No change was made 395 // SDValue.getNode() == N - N was replaced, is dead and has been handled. 396 // otherwise - N should be replaced by the returned Operand. 397 // 398 SDValue visitTokenFactor(SDNode *N); 399 SDValue visitMERGE_VALUES(SDNode *N); 400 SDValue visitADD(SDNode *N); 401 SDValue visitADDLike(SDNode *N); 402 SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference); 403 SDValue visitSUB(SDNode *N); 404 SDValue visitADDSAT(SDNode *N); 405 SDValue visitSUBSAT(SDNode *N); 406 SDValue visitADDC(SDNode *N); 407 SDValue visitADDO(SDNode *N); 408 SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); 409 SDValue visitSUBC(SDNode *N); 410 SDValue visitSUBO(SDNode *N); 411 SDValue visitADDE(SDNode *N); 412 SDValue visitADDCARRY(SDNode *N); 413 SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); 414 SDValue visitSUBE(SDNode *N); 415 SDValue visitSUBCARRY(SDNode *N); 416 SDValue visitMUL(SDNode *N); 417 SDValue visitMULFIX(SDNode *N); 418 SDValue useDivRem(SDNode *N); 419 SDValue visitSDIV(SDNode *N); 420 SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N); 421 SDValue visitUDIV(SDNode *N); 422 SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N); 423 SDValue visitREM(SDNode *N); 424 SDValue visitMULHU(SDNode *N); 425 SDValue visitMULHS(SDNode *N); 426 SDValue visitSMUL_LOHI(SDNode *N); 427 SDValue visitUMUL_LOHI(SDNode *N); 428 SDValue visitMULO(SDNode *N); 429 SDValue visitIMINMAX(SDNode *N); 430 SDValue visitAND(SDNode *N); 431 SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N); 432 SDValue visitOR(SDNode *N); 433 SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N); 434 SDValue visitXOR(SDNode *N); 435 
SDValue SimplifyVBinOp(SDNode *N); 436 SDValue visitSHL(SDNode *N); 437 SDValue visitSRA(SDNode *N); 438 SDValue visitSRL(SDNode *N); 439 SDValue visitFunnelShift(SDNode *N); 440 SDValue visitRotate(SDNode *N); 441 SDValue visitABS(SDNode *N); 442 SDValue visitBSWAP(SDNode *N); 443 SDValue visitBITREVERSE(SDNode *N); 444 SDValue visitCTLZ(SDNode *N); 445 SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); 446 SDValue visitCTTZ(SDNode *N); 447 SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); 448 SDValue visitCTPOP(SDNode *N); 449 SDValue visitSELECT(SDNode *N); 450 SDValue visitVSELECT(SDNode *N); 451 SDValue visitSELECT_CC(SDNode *N); 452 SDValue visitSETCC(SDNode *N); 453 SDValue visitSETCCCARRY(SDNode *N); 454 SDValue visitSIGN_EXTEND(SDNode *N); 455 SDValue visitZERO_EXTEND(SDNode *N); 456 SDValue visitANY_EXTEND(SDNode *N); 457 SDValue visitAssertExt(SDNode *N); 458 SDValue visitAssertAlign(SDNode *N); 459 SDValue visitSIGN_EXTEND_INREG(SDNode *N); 460 SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); 461 SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); 462 SDValue visitTRUNCATE(SDNode *N); 463 SDValue visitBITCAST(SDNode *N); 464 SDValue visitFREEZE(SDNode *N); 465 SDValue visitBUILD_PAIR(SDNode *N); 466 SDValue visitFADD(SDNode *N); 467 SDValue visitFSUB(SDNode *N); 468 SDValue visitFMUL(SDNode *N); 469 SDValue visitFMA(SDNode *N); 470 SDValue visitFDIV(SDNode *N); 471 SDValue visitFREM(SDNode *N); 472 SDValue visitFSQRT(SDNode *N); 473 SDValue visitFCOPYSIGN(SDNode *N); 474 SDValue visitFPOW(SDNode *N); 475 SDValue visitSINT_TO_FP(SDNode *N); 476 SDValue visitUINT_TO_FP(SDNode *N); 477 SDValue visitFP_TO_SINT(SDNode *N); 478 SDValue visitFP_TO_UINT(SDNode *N); 479 SDValue visitFP_ROUND(SDNode *N); 480 SDValue visitFP_EXTEND(SDNode *N); 481 SDValue visitFNEG(SDNode *N); 482 SDValue visitFABS(SDNode *N); 483 SDValue visitFCEIL(SDNode *N); 484 SDValue visitFTRUNC(SDNode *N); 485 SDValue visitFFLOOR(SDNode *N); 486 SDValue visitFMINNUM(SDNode *N); 487 SDValue visitFMAXNUM(SDNode 
*N); 488 SDValue visitFMINIMUM(SDNode *N); 489 SDValue visitFMAXIMUM(SDNode *N); 490 SDValue visitBRCOND(SDNode *N); 491 SDValue visitBR_CC(SDNode *N); 492 SDValue visitLOAD(SDNode *N); 493 494 SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); 495 SDValue replaceStoreOfFPConstant(StoreSDNode *ST); 496 497 SDValue visitSTORE(SDNode *N); 498 SDValue visitLIFETIME_END(SDNode *N); 499 SDValue visitINSERT_VECTOR_ELT(SDNode *N); 500 SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); 501 SDValue visitBUILD_VECTOR(SDNode *N); 502 SDValue visitCONCAT_VECTORS(SDNode *N); 503 SDValue visitEXTRACT_SUBVECTOR(SDNode *N); 504 SDValue visitVECTOR_SHUFFLE(SDNode *N); 505 SDValue visitSCALAR_TO_VECTOR(SDNode *N); 506 SDValue visitINSERT_SUBVECTOR(SDNode *N); 507 SDValue visitMLOAD(SDNode *N); 508 SDValue visitMSTORE(SDNode *N); 509 SDValue visitMGATHER(SDNode *N); 510 SDValue visitMSCATTER(SDNode *N); 511 SDValue visitFP_TO_FP16(SDNode *N); 512 SDValue visitFP16_TO_FP(SDNode *N); 513 SDValue visitVECREDUCE(SDNode *N); 514 515 SDValue visitFADDForFMACombine(SDNode *N); 516 SDValue visitFSUBForFMACombine(SDNode *N); 517 SDValue visitFMULForFMADistributiveCombine(SDNode *N); 518 519 SDValue XformToShuffleWithZero(SDNode *N); 520 bool reassociationCanBreakAddressingModePattern(unsigned Opc, 521 const SDLoc &DL, SDValue N0, 522 SDValue N1); 523 SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, 524 SDValue N1); 525 SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, 526 SDValue N1, SDNodeFlags Flags); 527 528 SDValue visitShiftByConstant(SDNode *N); 529 530 SDValue foldSelectOfConstants(SDNode *N); 531 SDValue foldVSelectOfConstants(SDNode *N); 532 SDValue foldBinOpIntoSelect(SDNode *BO); 533 bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); 534 SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N); 535 SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); 536 SDValue SimplifySelectCC(const SDLoc &DL, 
SDValue N0, SDValue N1, 537 SDValue N2, SDValue N3, ISD::CondCode CC, 538 bool NotExtCompare = false); 539 SDValue convertSelectOfFPConstantsToLoadOffset( 540 const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, 541 ISD::CondCode CC); 542 SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, 543 SDValue N2, SDValue N3, ISD::CondCode CC); 544 SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, 545 const SDLoc &DL); 546 SDValue unfoldMaskedMerge(SDNode *N); 547 SDValue unfoldExtremeBitClearingToShifts(SDNode *N); 548 SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, 549 const SDLoc &DL, bool foldBooleans); 550 SDValue rebuildSetCC(SDValue N); 551 552 bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, 553 SDValue &CC, bool MatchStrict = false) const; 554 bool isOneUseSetCC(SDValue N) const; 555 556 SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, 557 unsigned HiOp); 558 SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); 559 SDValue CombineExtLoad(SDNode *N); 560 SDValue CombineZExtLogicopShiftLoad(SDNode *N); 561 SDValue combineRepeatedFPDivisors(SDNode *N); 562 SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); 563 SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); 564 SDValue BuildSDIV(SDNode *N); 565 SDValue BuildSDIVPow2(SDNode *N); 566 SDValue BuildUDIV(SDNode *N); 567 SDValue BuildLogBase2(SDValue V, const SDLoc &DL); 568 SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags); 569 SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); 570 SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); 571 SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); 572 SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, 573 SDNodeFlags Flags, bool Reciprocal); 574 SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, 575 SDNodeFlags Flags, bool Reciprocal); 576 SDValue 
MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, 577 bool DemandHighBits = true); 578 SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); 579 SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, 580 SDValue InnerPos, SDValue InnerNeg, 581 unsigned PosOpcode, unsigned NegOpcode, 582 const SDLoc &DL); 583 SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, 584 SDValue InnerPos, SDValue InnerNeg, 585 unsigned PosOpcode, unsigned NegOpcode, 586 const SDLoc &DL); 587 SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); 588 SDValue MatchLoadCombine(SDNode *N); 589 SDValue MatchStoreCombine(StoreSDNode *N); 590 SDValue ReduceLoadWidth(SDNode *N); 591 SDValue ReduceLoadOpStoreWidth(SDNode *N); 592 SDValue splitMergedValStore(StoreSDNode *ST); 593 SDValue TransformFPLoadStorePair(SDNode *N); 594 SDValue convertBuildVecZextToZext(SDNode *N); 595 SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); 596 SDValue reduceBuildVecTruncToBitCast(SDNode *N); 597 SDValue reduceBuildVecToShuffle(SDNode *N); 598 SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, 599 ArrayRef<int> VectorMask, SDValue VecIn1, 600 SDValue VecIn2, unsigned LeftIdx, 601 bool DidSplitVec); 602 SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); 603 604 /// Walk up chain skipping non-aliasing memory nodes, 605 /// looking for aliasing nodes and adding them to the Aliases vector. 606 void GatherAllAliases(SDNode *N, SDValue OriginalChain, 607 SmallVectorImpl<SDValue> &Aliases); 608 609 /// Return true if there is any possibility that the two addresses overlap. 610 bool isAlias(SDNode *Op0, SDNode *Op1) const; 611 612 /// Walk up chain skipping non-aliasing memory nodes, looking for a better 613 /// chain (aliasing node.) 614 SDValue FindBetterChain(SDNode *N, SDValue Chain); 615 616 /// Try to replace a store and any possibly adjacent stores on 617 /// consecutive chains with better chains. Return true only if St is 618 /// replaced. 
619 /// 620 /// Notice that other chains may still be replaced even if the function 621 /// returns false. 622 bool findBetterNeighborChains(StoreSDNode *St); 623 624 // Helper for findBetterNeighborChains. Walk up store chain add additional 625 // chained stores that do not overlap and can be parallelized. 626 bool parallelizeChainedStores(StoreSDNode *St); 627 628 /// Holds a pointer to an LSBaseSDNode as well as information on where it 629 /// is located in a sequence of memory operations connected by a chain. 630 struct MemOpLink { 631 // Ptr to the mem node. 632 LSBaseSDNode *MemNode; 633 634 // Offset from the base ptr. 635 int64_t OffsetFromBase; 636 637 MemOpLink(LSBaseSDNode *N, int64_t Offset) 638 : MemNode(N), OffsetFromBase(Offset) {} 639 }; 640 641 // Classify the origin of a stored value. 642 enum class StoreSource { Unknown, Constant, Extract, Load }; 643 StoreSource getStoreSource(SDValue StoreVal) { 644 if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal)) 645 return StoreSource::Constant; 646 if (StoreVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || 647 StoreVal.getOpcode() == ISD::EXTRACT_SUBVECTOR) 648 return StoreSource::Extract; 649 if (isa<LoadSDNode>(StoreVal)) 650 return StoreSource::Load; 651 return StoreSource::Unknown; 652 } 653 654 /// This is a helper function for visitMUL to check the profitability 655 /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). 656 /// MulNode is the original multiply, AddNode is (add x, c1), 657 /// and ConstNode is c2. 658 bool isMulAddWithConstProfitable(SDNode *MulNode, 659 SDValue &AddNode, 660 SDValue &ConstNode); 661 662 /// This is a helper function for visitAND and visitZERO_EXTEND. Returns 663 /// true if the (and (load x) c) pattern matches an extload. ExtVT returns 664 /// the type of the loaded value to be extended. 
665 bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, 666 EVT LoadResultTy, EVT &ExtVT); 667 668 /// Helper function to calculate whether the given Load/Store can have its 669 /// width reduced to ExtVT. 670 bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType, 671 EVT &MemVT, unsigned ShAmt = 0); 672 673 /// Used by BackwardsPropagateMask to find suitable loads. 674 bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads, 675 SmallPtrSetImpl<SDNode*> &NodesWithConsts, 676 ConstantSDNode *Mask, SDNode *&NodeToMask); 677 /// Attempt to propagate a given AND node back to load leaves so that they 678 /// can be combined into narrow loads. 679 bool BackwardsPropagateMask(SDNode *N); 680 681 /// Helper function for mergeConsecutiveStores which merges the component 682 /// store chains. 683 SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, 684 unsigned NumStores); 685 686 /// This is a helper function for mergeConsecutiveStores. When the source 687 /// elements of the consecutive stores are all constants or all extracted 688 /// vector elements, try to merge them into one larger store introducing 689 /// bitcasts if necessary. \return True if a merged store was created. 690 bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, 691 EVT MemVT, unsigned NumStores, 692 bool IsConstantSrc, bool UseVector, 693 bool UseTrunc); 694 695 /// This is a helper function for mergeConsecutiveStores. Stores that 696 /// potentially may be merged with St are placed in StoreNodes. RootNode is 697 /// a chain predecessor to all store candidates. 698 void getStoreMergeCandidates(StoreSDNode *St, 699 SmallVectorImpl<MemOpLink> &StoreNodes, 700 SDNode *&Root); 701 702 /// Helper function for mergeConsecutiveStores. Checks if candidate stores 703 /// have indirect dependency through their operands. 
RootNode is the 704 /// predecessor to all stores calculated by getStoreMergeCandidates and is 705 /// used to prune the dependency check. \return True if safe to merge. 706 bool checkMergeStoreCandidatesForDependencies( 707 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores, 708 SDNode *RootNode); 709 710 /// This is a helper function for mergeConsecutiveStores. Given a list of 711 /// store candidates, find the first N that are consecutive in memory. 712 /// Returns 0 if there are not at least 2 consecutive stores to try merging. 713 unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes, 714 int64_t ElementSizeBytes) const; 715 716 /// This is a helper function for mergeConsecutiveStores. It is used for 717 /// store chains that are composed entirely of constant values. 718 bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes, 719 unsigned NumConsecutiveStores, 720 EVT MemVT, SDNode *Root, bool AllowVectors); 721 722 /// This is a helper function for mergeConsecutiveStores. It is used for 723 /// store chains that are composed entirely of extracted vector elements. 724 /// When extracting multiple vector elements, try to store them in one 725 /// vector store rather than a sequence of scalar stores. 726 bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes, 727 unsigned NumConsecutiveStores, EVT MemVT, 728 SDNode *Root); 729 730 /// This is a helper function for mergeConsecutiveStores. It is used for 731 /// store chains that are composed entirely of loaded values. 732 bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, 733 unsigned NumConsecutiveStores, EVT MemVT, 734 SDNode *Root, bool AllowVectors, 735 bool IsNonTemporalStore, bool IsNonTemporalLoad); 736 737 /// Merge consecutive store operations into a wide store. 738 /// This optimization uses wide integers or vectors when possible. 739 /// \return true if stores were merged. 
740 bool mergeConsecutiveStores(StoreSDNode *St); 741 742 /// Try to transform a truncation where C is a constant: 743 /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) 744 /// 745 /// \p N needs to be a truncation and its first operand an AND. Other 746 /// requirements are checked by the function (e.g. that trunc is 747 /// single-use) and if missed an empty SDValue is returned. 748 SDValue distributeTruncateThroughAnd(SDNode *N); 749 750 /// Helper function to determine whether the target supports operation 751 /// given by \p Opcode for type \p VT, that is, whether the operation 752 /// is legal or custom before legalizing operations, and whether is 753 /// legal (but not custom) after legalization. 754 bool hasOperation(unsigned Opcode, EVT VT) { 755 if (LegalOperations) 756 return TLI.isOperationLegal(Opcode, VT); 757 return TLI.isOperationLegalOrCustom(Opcode, VT); 758 } 759 760 public: 761 /// Runs the dag combiner on all nodes in the work list 762 void Run(CombineLevel AtLevel); 763 764 SelectionDAG &getDAG() const { return DAG; } 765 766 /// Returns a type large enough to hold any valid shift amount - before type 767 /// legalization these can be huge. 768 EVT getShiftAmountTy(EVT LHSTy) { 769 assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); 770 return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes); 771 } 772 773 /// This method returns true if we are running before type legalization or 774 /// if the specified VT is legal. 
bool isTypeLegal(const EVT &VT) {
    if (!LegalTypes) return true;
    return TLI.isTypeLegal(VT);
  }

  /// Convenience wrapper around TargetLowering::getSetCCResultType
  EVT getSetCCResultType(EVT VT) const {
    return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  }

  void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
                       SDValue OrigLoad, SDValue ExtLoad,
                       ISD::NodeType ExtType);
};

/// This class is a DAGUpdateListener that removes any deleted
/// nodes from the worklist.
class WorklistRemover : public SelectionDAG::DAGUpdateListener {
  DAGCombiner &DC;

public:
  explicit WorklistRemover(DAGCombiner &dc)
      : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

  // Keep the combiner worklist free of dangling pointers: drop any node the
  // DAG deletes while this listener is in scope.
  void NodeDeleted(SDNode *N, SDNode *E) override {
    DC.removeFromWorklist(N);
  }
};

/// Listener that tells the combiner about nodes inserted into the DAG so they
/// can be considered for worklist pruning.
class WorklistInserter : public SelectionDAG::DAGUpdateListener {
  DAGCombiner &DC;

public:
  explicit WorklistInserter(DAGCombiner &dc)
      : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

  // FIXME: Ideally we could add N to the worklist, but this causes exponential
  // compile time costs in large DAGs, e.g. Halide.
  void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
//  TargetLowering::DAGCombinerInfo implementation
//===----------------------------------------------------------------------===//

// These thin wrappers forward target-lowering requests to the (anonymous
// namespace) DAGCombiner instance hidden behind the opaque DC pointer.

void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
  ((DAGCombiner*)DC)->AddToWorklist(N);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
}

bool TargetLowering::DAGCombinerInfo::
recursivelyDeleteUnusedNodes(SDNode *N) {
  return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
}

void TargetLowering::DAGCombinerInfo::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
  return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

/// Remove \p N from the worklist, delete it from the DAG, and queue its
/// possibly-dead operands for revisiting.
void DAGCombiner::deleteAndRecombine(SDNode *N) {
  removeFromWorklist(N);

  // If the operands of this node are only used by the node, they will now be
  // dead. Make sure to re-visit them and recursively delete dead nodes.
  for (const SDValue &Op : N->ops())
    // For an operand generating multiple values, one of the values may
    // become dead allowing further simplification (e.g.
split index 863 // arithmetic from an indexed load). 864 if (Op->hasOneUse() || Op->getNumValues() > 1) 865 AddToWorklist(Op.getNode()); 866 867 DAG.DeleteNode(N); 868 } 869 870 // APInts must be the same size for most operations, this helper 871 // function zero extends the shorter of the pair so that they match. 872 // We provide an Offset so that we can create bitwidths that won't overflow. 873 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) { 874 unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth()); 875 LHS = LHS.zextOrSelf(Bits); 876 RHS = RHS.zextOrSelf(Bits); 877 } 878 879 // Return true if this node is a setcc, or is a select_cc 880 // that selects between the target values used for true and false, making it 881 // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to 882 // the appropriate nodes based on the type of node we are checking. This 883 // simplifies life a bit for the callers. 884 bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, 885 SDValue &CC, bool MatchStrict) const { 886 if (N.getOpcode() == ISD::SETCC) { 887 LHS = N.getOperand(0); 888 RHS = N.getOperand(1); 889 CC = N.getOperand(2); 890 return true; 891 } 892 893 if (MatchStrict && 894 (N.getOpcode() == ISD::STRICT_FSETCC || 895 N.getOpcode() == ISD::STRICT_FSETCCS)) { 896 LHS = N.getOperand(1); 897 RHS = N.getOperand(2); 898 CC = N.getOperand(3); 899 return true; 900 } 901 902 if (N.getOpcode() != ISD::SELECT_CC || 903 !TLI.isConstTrueVal(N.getOperand(2).getNode()) || 904 !TLI.isConstFalseVal(N.getOperand(3).getNode())) 905 return false; 906 907 if (TLI.getBooleanContents(N.getValueType()) == 908 TargetLowering::UndefinedBooleanContent) 909 return false; 910 911 LHS = N.getOperand(0); 912 RHS = N.getOperand(1); 913 CC = N.getOperand(4); 914 return true; 915 } 916 917 /// Return true if this is a SetCC-equivalent operation with only one use. 
918 /// If this is true, it allows the users to invert the operation for free when 919 /// it is profitable to do so. 920 bool DAGCombiner::isOneUseSetCC(SDValue N) const { 921 SDValue N0, N1, N2; 922 if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) 923 return true; 924 return false; 925 } 926 927 // Returns the SDNode if it is a constant float BuildVector 928 // or constant float. 929 static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { 930 if (isa<ConstantFPSDNode>(N)) 931 return N.getNode(); 932 if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) 933 return N.getNode(); 934 return nullptr; 935 } 936 937 // Determines if it is a constant integer or a build vector of constant 938 // integers (and undefs). 939 // Do not permit build vector implicit truncation. 940 static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { 941 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N)) 942 return !(Const->isOpaque() && NoOpaques); 943 if (N.getOpcode() != ISD::BUILD_VECTOR) 944 return false; 945 unsigned BitWidth = N.getScalarValueSizeInBits(); 946 for (const SDValue &Op : N->op_values()) { 947 if (Op.isUndef()) 948 continue; 949 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op); 950 if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth || 951 (Const->isOpaque() && NoOpaques)) 952 return false; 953 } 954 return true; 955 } 956 957 // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with 958 // undef's. 959 static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) { 960 if (V.getOpcode() != ISD::BUILD_VECTOR) 961 return false; 962 return isConstantOrConstantVector(V, NoOpaques) || 963 ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()); 964 } 965 966 // Determine if this an indexed load with an opaque target constant index. 
static bool canSplitIdx(LoadSDNode *LD) {
  // Splitting is only allowed when enabled and when the index is not an
  // opaque target constant (those must be preserved as-is).
  return MaySplitLoadIndex &&
         (LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
          !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
}

bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                             const SDLoc &DL,
                                                             SDValue N0,
                                                             SDValue N1) {
  // Currently this only tries to ensure we don't undo the GEP splits done by
  // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
  // we check if the following transformation would be problematic:
  // (load/store (add, (add, x, offset1), offset2)) ->
  // (load/store (add, x, offset1+offset2)).

  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
    return false;

  // If the inner add has a single use, reassociating it cannot break any
  // other user's addressing pattern.
  if (N0.hasOneUse())
    return false;

  auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(N1);
  if (!C1 || !C2)
    return false;

  const APInt &C1APIntVal = C1->getAPIntValue();
  const APInt &C2APIntVal = C2->getAPIntValue();
  // Offsets wider than 64 bits can't be represented in AddrMode::BaseOffs.
  if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
    return false;

  const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
  if (CombinedValueIntVal.getBitWidth() > 64)
    return false;
  const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();

  for (SDNode *Node : N0->uses()) {
    auto LoadStore = dyn_cast<MemSDNode>(Node);
    if (LoadStore) {
      // Is x[offset2] already not a legal addressing mode? If so then
      // reassociating the constants breaks nothing (we test offset2 because
      // that's the one we hope to fold into the load or store).
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = C2APIntVal.getSExtValue();
      EVT VT = LoadStore->getMemoryVT();
      unsigned AS = LoadStore->getAddressSpace();
      Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
        continue;

      // Would x[offset1+offset2] still be a legal addressing mode?
      AM.BaseOffs = CombinedValue;
      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
        return true;
    }
  }

  return false;
}

// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
                                               SDValue N0, SDValue N1) {
  EVT VT = N0.getValueType();

  if (N0.getOpcode() != Opc)
    return SDValue();

  if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
    if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
      if (SDValue OpNode =
              DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))
        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
      // Constant folding failed (e.g. opaque constants); bail out rather
      // than creating a non-folded constant-only subtree.
      return SDValue();
    }
    if (N0.hasOneUse()) {
      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
      //              iff (op x, c1) has one use
      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
      if (!OpNode.getNode())
        return SDValue();
      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
    }
  }
  return SDValue();
}

// Try to reassociate commutative binops.
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                    SDValue N1, SDNodeFlags Flags) {
  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");

  // Floating-point reassociation is not allowed without loose FP math.
  if (N0.getValueType().isFloatingPoint() ||
      N1.getValueType().isFloatingPoint())
    if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
      return SDValue();

  // Try both operand orders; the operation is commutative.
  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
    return Combined;
  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
    return Combined;
  return SDValue();
}

/// Replace all of \p N's values with the \p NumTo values in \p To, updating
/// the worklist, and delete \p N if it becomes dead. Returns SDValue(N, 0)
/// as the conventional "something changed" marker.
SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                               bool AddTo) {
  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
             To[0].getNode()->dump(&DAG);
             dbgs() << " and " << NumTo - 1 << " other values\n");
  for (unsigned i = 0, e = NumTo; i != e; ++i)
    assert((!To[i].getNode() ||
            N->getValueType(i) == To[i].getValueType()) &&
           "Cannot combine value to value of different type!");

  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesWith(N, To);
  if (AddTo) {
    // Push the new nodes and any users onto the worklist
    for (unsigned i = 0, e = NumTo; i != e; ++i) {
      if (To[i].getNode()) {
        AddToWorklist(To[i].getNode());
        AddUsersToWorklist(To[i].getNode());
      }
    }
  }

  // Finally, if the node is now dead, remove it from the graph.  The node
  // may not be dead if the replacement process recursively simplified to
  // something else needing this node.
  if (N->use_empty())
    deleteAndRecombine(N);
  return SDValue(N, 0);
}

void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
  // Replace the old value with the new one.
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
             dbgs() << '\n');

  // Replace all uses.  If any nodes become isomorphic to other nodes and
  // are deleted, make sure to remove them from our worklist.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);

  // Push the new node and any (possibly new) users onto the worklist.
  AddToWorklistWithUsers(TLO.New.getNode());

  // Finally, if the node is now dead, remove it from the graph.  The node
  // may not be dead if the replacement process recursively simplified to
  // something else needing this node.
  if (TLO.Old.getNode()->use_empty())
    deleteAndRecombine(TLO.Old.getNode());
}

/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                       const APInt &DemandedElts,
                                       bool AssumeSingleUse) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  KnownBits Known;
  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
                                AssumeSingleUse))
    return false;

  // Revisit the node.
  AddToWorklist(Op.getNode());

  // Apply the replacement recorded by TLI into the DAG and the worklist.
  CommitTargetLoweringOpt(TLO);
  return true;
}

/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the elements.
/// If so, return true.
1152 bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, 1153 const APInt &DemandedElts, 1154 bool AssumeSingleUse) { 1155 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); 1156 APInt KnownUndef, KnownZero; 1157 if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, 1158 TLO, 0, AssumeSingleUse)) 1159 return false; 1160 1161 // Revisit the node. 1162 AddToWorklist(Op.getNode()); 1163 1164 CommitTargetLoweringOpt(TLO); 1165 return true; 1166 } 1167 1168 void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { 1169 SDLoc DL(Load); 1170 EVT VT = Load->getValueType(0); 1171 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); 1172 1173 LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; 1174 Trunc.getNode()->dump(&DAG); dbgs() << '\n'); 1175 WorklistRemover DeadNodes(*this); 1176 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); 1177 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); 1178 deleteAndRecombine(Load); 1179 AddToWorklist(Trunc.getNode()); 1180 } 1181 1182 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { 1183 Replace = false; 1184 SDLoc DL(Op); 1185 if (ISD::isUNINDEXEDLoad(Op.getNode())) { 1186 LoadSDNode *LD = cast<LoadSDNode>(Op); 1187 EVT MemVT = LD->getMemoryVT(); 1188 ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? 
ISD::EXTLOAD 1189 : LD->getExtensionType(); 1190 Replace = true; 1191 return DAG.getExtLoad(ExtType, DL, PVT, 1192 LD->getChain(), LD->getBasePtr(), 1193 MemVT, LD->getMemOperand()); 1194 } 1195 1196 unsigned Opc = Op.getOpcode(); 1197 switch (Opc) { 1198 default: break; 1199 case ISD::AssertSext: 1200 if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT)) 1201 return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1)); 1202 break; 1203 case ISD::AssertZext: 1204 if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT)) 1205 return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1)); 1206 break; 1207 case ISD::Constant: { 1208 unsigned ExtOpc = 1209 Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1210 return DAG.getNode(ExtOpc, DL, PVT, Op); 1211 } 1212 } 1213 1214 if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT)) 1215 return SDValue(); 1216 return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op); 1217 } 1218 1219 SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { 1220 if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT)) 1221 return SDValue(); 1222 EVT OldVT = Op.getValueType(); 1223 SDLoc DL(Op); 1224 bool Replace = false; 1225 SDValue NewOp = PromoteOperand(Op, PVT, Replace); 1226 if (!NewOp.getNode()) 1227 return SDValue(); 1228 AddToWorklist(NewOp.getNode()); 1229 1230 if (Replace) 1231 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); 1232 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp, 1233 DAG.getValueType(OldVT)); 1234 } 1235 1236 SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { 1237 EVT OldVT = Op.getValueType(); 1238 SDLoc DL(Op); 1239 bool Replace = false; 1240 SDValue NewOp = PromoteOperand(Op, PVT, Replace); 1241 if (!NewOp.getNode()) 1242 return SDValue(); 1243 AddToWorklist(NewOp.getNode()); 1244 1245 if (Replace) 1246 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); 1247 return DAG.getZeroExtendInReg(NewOp, DL, OldVT); 1248 } 1249 
/// Promote the specified integer binary operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

    bool Replace0 = false;
    SDValue N0 = Op.getOperand(0);
    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);

    bool Replace1 = false;
    SDValue N1 = Op.getOperand(1);
    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
    SDLoc DL(Op);

    // Perform the operation in the wide type, then truncate back.
    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));

    // We are always replacing N0/N1's use in N and only need additional
    // replacements if there are additional uses.
    // Note: We are checking uses of the *nodes* (SDNode) rather than values
    //       (SDValue) here because the node may reference multiple values
    //       (for example, the chain value of a load node).
    Replace0 &= !N0->hasOneUse();
    Replace1 &= (N0 != N1) && !N1->hasOneUse();

    // Combine Op here so it is preserved past replacements.
    CombineTo(Op.getNode(), RV);

    // If operands have a use ordering, make sure we deal with
    // predecessor first.
    if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
      std::swap(N0, N1);
      std::swap(NN0, NN1);
    }

    if (Replace0) {
      AddToWorklist(NN0.getNode());
      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
    }
    if (Replace1) {
      AddToWorklist(NN1.getNode());
      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
    }
    return Op;
  }
  return SDValue();
}

/// Promote the specified integer shift operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

    bool Replace = false;
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    // Only the value being shifted is promoted; the extension used must
    // match the shift's semantics (sign bits for SRA, zero bits for SRL).
    if (Opc == ISD::SRA)
      N0 = SExtPromoteOperand(N0, PVT);
    else if (Opc == ISD::SRL)
      N0 = ZExtPromoteOperand(N0, PVT);
    else
      N0 = PromoteOperand(N0, PVT, Replace);

    if (!N0.getNode())
      return SDValue();

    SDLoc DL(Op);
    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));

    if (Replace)
      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());

    // Deal with Op being deleted.
    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
      return RV;
  }
  return SDValue();
}

/// Promote the specified integer extension if the target considers the
/// extension's type undesirable; the promoted extend subsumes the original.
SDValue DAGCombiner::PromoteExtend(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");
    // fold (aext (aext x)) -> (aext x)
    // fold (aext (zext x)) -> (zext x)
    // fold (aext (sext x)) -> (sext x)
    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
    return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
  }
  return SDValue();
}

/// Promote an unindexed integer load to a wider type when the target
/// considers the current type undesirable. Returns true if a replacement
/// was made.
bool DAGCombiner::PromoteLoad(SDValue Op) {
  if (!LegalOperations)
    return false;

  if (!ISD::isUNINDEXEDLoad(Op.getNode()))
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return false;

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return false;

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    SDLoc DL(Op);
    SDNode *N = Op.getNode();
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT MemVT = LD->getMemoryVT();
    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ?
ISD::EXTLOAD
                                                      : LD->getExtensionType();
    // Re-issue the load at the wider type and truncate the result back.
    SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
                                   LD->getChain(), LD->getBasePtr(),
                                   MemVT, LD->getMemOperand());
    SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);

    LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
               Result.getNode()->dump(&DAG); dbgs() << '\n');
    WorklistRemover DeadNodes(*this);
    // Value 0 is the loaded data; value 1 is the chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
    deleteAndRecombine(N);
    AddToWorklist(Result.getNode());
    return true;
  }
  return false;
}

/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes who have had a user deleted to the worklist as they
/// may now have only one use and subject to other combines.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
  if (!N->use_empty())
    return false;

  // Iterative worklist-based deletion to avoid deep recursion.
  SmallSetVector<SDNode *, 16> Nodes;
  Nodes.insert(N);
  do {
    N = Nodes.pop_back_val();
    if (!N)
      continue;

    if (N->use_empty()) {
      // Queue the operands: they may become dead once N is gone.
      for (const SDValue &ChildN : N->op_values())
        Nodes.insert(ChildN.getNode());

      removeFromWorklist(N);
      DAG.DeleteNode(N);
    } else {
      // Still-live node lost a user; give it another combine opportunity.
      AddToWorklist(N);
    }
  } while (!Nodes.empty());
  return true;
}

//===----------------------------------------------------------------------===//
//  Main DAG Combiner implementation
//===----------------------------------------------------------------------===//

void DAGCombiner::Run(CombineLevel AtLevel) {
  // set the instance variables, so that the various visit routines may use it.
  Level = AtLevel;
  LegalDAG = Level >= AfterLegalizeDAG;
  LegalOperations = Level >= AfterLegalizeVectorOps;
  LegalTypes = Level >= AfterLegalizeTypes;

  WorklistInserter AddNodes(*this);

  // Add all the dag nodes to the worklist.
  for (SDNode &Node : DAG.allnodes())
    AddToWorklist(&Node);

  // Create a dummy node (which is not added to allnodes), that adds a reference
  // to the root node, preventing it from being deleted, and tracking any
  // changes of the root.
  HandleSDNode Dummy(DAG.getRoot());

  // While we have a valid worklist entry node, try to combine it.
  while (SDNode *N = getNextWorklistEntry()) {
    // If N has no uses, it is dead.  Make sure to revisit all N's operands once
    // N is deleted from the DAG, since they too may now be dead or may have a
    // reduced number of uses, allowing other xforms.
    if (recursivelyDeleteUnusedNodes(N))
      continue;

    WorklistRemover DeadNodes(*this);

    // If this combine is running after legalizing the DAG, re-legalize any
    // nodes pulled off the worklist.
    if (LegalDAG) {
      SmallSetVector<SDNode *, 16> UpdatedNodes;
      bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);

      for (SDNode *LN : UpdatedNodes)
        AddToWorklistWithUsers(LN);

      // Legalization replaced N entirely; nothing left to combine here.
      if (!NIsValid)
        continue;
    }

    LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));

    // Add any operands of the new node which have not yet been combined to the
    // worklist as well. Because the worklist uniques things already, this
    // won't repeatedly process the same operand.
CombinedNodes.insert(N);
    for (const SDValue &ChildN : N->op_values())
      if (!CombinedNodes.count(ChildN.getNode()))
        AddToWorklist(ChildN.getNode());

    SDValue RV = combine(N);

    if (!RV.getNode())
      continue;

    ++NodesCombined;

    // If we get back the same node we passed in, rather than a new node or
    // zero, we know that the node must have defined multiple values and
    // CombineTo was used.  Since CombineTo takes care of the worklist
    // mechanics for us, we have no work to do in this case.
    if (RV.getNode() == N)
      continue;

    assert(N->getOpcode() != ISD::DELETED_NODE &&
           RV.getOpcode() != ISD::DELETED_NODE &&
           "Node was deleted but visit returned new node!");

    LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));

    if (N->getNumValues() == RV.getNode()->getNumValues())
      DAG.ReplaceAllUsesWith(N, RV.getNode());
    else {
      // Value-count mismatch is only tolerated for single-result nodes of
      // identical type; replace via the SDValue overload.
      assert(N->getValueType(0) == RV.getValueType() &&
             N->getNumValues() == 1 && "Type mismatch");
      DAG.ReplaceAllUsesWith(N, &RV);
    }

    // Push the new node and any users onto the worklist
    AddToWorklist(RV.getNode());
    AddUsersToWorklist(RV.getNode());

    // Finally, if the node is now dead, remove it from the graph.  The node
    // may not be dead if the replacement process recursively simplified to
    // something else needing this node.  This will also take care of adding any
    // operands which have lost a user to the worklist.
    recursivelyDeleteUnusedNodes(N);
  }

  // If the root changed (e.g. it was a dead load), update the root.
  DAG.setRoot(Dummy.getValue());
  DAG.RemoveDeadNodes();
}

/// Dispatch \p N to the opcode-specific visit routine. Returns a null
/// SDValue when no generic combine applies to this opcode.
SDValue DAGCombiner::visit(SDNode *N) {
  switch (N->getOpcode()) {
  default: break;
  case ISD::TokenFactor:        return visitTokenFactor(N);
  case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
  case ISD::ADD:                return visitADD(N);
  case ISD::SUB:                return visitSUB(N);
  case ISD::SADDSAT:
  case ISD::UADDSAT:            return visitADDSAT(N);
  case ISD::SSUBSAT:
  case ISD::USUBSAT:            return visitSUBSAT(N);
  case ISD::ADDC:               return visitADDC(N);
  case ISD::SADDO:
  case ISD::UADDO:              return visitADDO(N);
  case ISD::SUBC:               return visitSUBC(N);
  case ISD::SSUBO:
  case ISD::USUBO:              return visitSUBO(N);
  case ISD::ADDE:               return visitADDE(N);
  case ISD::ADDCARRY:           return visitADDCARRY(N);
  case ISD::SUBE:               return visitSUBE(N);
  case ISD::SUBCARRY:           return visitSUBCARRY(N);
  case ISD::SMULFIX:
  case ISD::SMULFIXSAT:
  case ISD::UMULFIX:
  case ISD::UMULFIXSAT:         return visitMULFIX(N);
  case ISD::MUL:                return visitMUL(N);
  case ISD::SDIV:               return visitSDIV(N);
  case ISD::UDIV:               return visitUDIV(N);
  case ISD::SREM:
  case ISD::UREM:               return visitREM(N);
  case ISD::MULHU:              return visitMULHU(N);
  case ISD::MULHS:              return visitMULHS(N);
  case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
  case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
  case ISD::SMULO:
  case ISD::UMULO:              return visitMULO(N);
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:               return visitIMINMAX(N);
  case ISD::AND:                return visitAND(N);
  case ISD::OR:                 return visitOR(N);
  case ISD::XOR:                return visitXOR(N);
  case ISD::SHL:                return visitSHL(N);
  case ISD::SRA:                return visitSRA(N);
  case ISD::SRL:                return visitSRL(N);
  case ISD::ROTR:
  case ISD::ROTL:               return visitRotate(N);
  case ISD::FSHL:
  case ISD::FSHR:               return visitFunnelShift(N);
  case ISD::ABS:                return visitABS(N);
  case ISD::BSWAP:              return visitBSWAP(N);
  case ISD::BITREVERSE:         return visitBITREVERSE(N);
  case ISD::CTLZ:               return visitCTLZ(N);
  case ISD::CTLZ_ZERO_UNDEF:    return visitCTLZ_ZERO_UNDEF(N);
  case ISD::CTTZ:               return visitCTTZ(N);
  case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
  case ISD::CTPOP:              return visitCTPOP(N);
  case ISD::SELECT:             return visitSELECT(N);
  case ISD::VSELECT:            return visitVSELECT(N);
  case ISD::SELECT_CC:          return visitSELECT_CC(N);
  case ISD::SETCC:              return visitSETCC(N);
  case ISD::SETCCCARRY:         return visitSETCCCARRY(N);
  case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
  case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
  case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
  case ISD::AssertSext:
  case ISD::AssertZext:         return visitAssertExt(N);
  case ISD::AssertAlign:        return visitAssertAlign(N);
  case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
  case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
  case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
  case ISD::TRUNCATE:           return visitTRUNCATE(N);
  case ISD::BITCAST:            return visitBITCAST(N);
  case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
  case ISD::FADD:               return visitFADD(N);
  case ISD::FSUB:               return visitFSUB(N);
  case ISD::FMUL:               return visitFMUL(N);
  case ISD::FMA:                return visitFMA(N);
  case ISD::FDIV:               return visitFDIV(N);
  case ISD::FREM:               return visitFREM(N);
  case ISD::FSQRT:              return visitFSQRT(N);
  case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
  case ISD::FPOW:               return visitFPOW(N);
  case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
  case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
  case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
  case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
  case ISD::FP_ROUND:           return visitFP_ROUND(N);
  case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
  case ISD::FNEG:               return visitFNEG(N);
  case ISD::FABS:               return visitFABS(N);
  case ISD::FFLOOR:             return visitFFLOOR(N);
  case ISD::FMINNUM:            return visitFMINNUM(N);
  case ISD::FMAXNUM:            return visitFMAXNUM(N);
  case ISD::FMINIMUM:           return visitFMINIMUM(N);
  case ISD::FMAXIMUM:           return visitFMAXIMUM(N);
  case ISD::FCEIL:              return visitFCEIL(N);
  case ISD::FTRUNC:             return visitFTRUNC(N);
  case ISD::BRCOND:             return visitBRCOND(N);
  case ISD::BR_CC:              return visitBR_CC(N);
  case ISD::LOAD:               return visitLOAD(N);
  case ISD::STORE:              return visitSTORE(N);
  case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
  case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
  case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
  case ISD::CONCAT_VECTORS:     return visitCONCAT_VECTORS(N);
  case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
  case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
  case ISD::SCALAR_TO_VECTOR:   return visitSCALAR_TO_VECTOR(N);
  case ISD::INSERT_SUBVECTOR:   return visitINSERT_SUBVECTOR(N);
  case ISD::MGATHER:            return visitMGATHER(N);
  case ISD::MLOAD:              return visitMLOAD(N);
  case ISD::MSCATTER:           return visitMSCATTER(N);
  case ISD::MSTORE:             return visitMSTORE(N);
  case ISD::LIFETIME_END:       return visitLIFETIME_END(N);
  case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
  case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N);
  case ISD::FREEZE:             return visitFREEZE(N);
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMUL:
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_MUL:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:     return visitVECREDUCE(N);
  }
  return SDValue();
}

SDValue
DAGCombiner::combine(SDNode *N) {
  // Try, in order: the generic per-opcode combines (visit), target-specific
  // combines, integer-promotion combines, and finally CSE against the
  // commuted form of a commutative binop. Returns the replacement value, or
  // a null SDValue if no combine applied.
  SDValue RV;
  if (!DisableGenericCombines)
    RV = visit(N);

  // If nothing happened, try a target-specific DAG combine.
  if (!RV.getNode()) {
    assert(N->getOpcode() != ISD::DELETED_NODE &&
           "Node was deleted but visit returned NULL!");

    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {

      // Expose the DAG combiner to the target combiner impls.
      TargetLowering::DAGCombinerInfo
        DagCombineInfo(DAG, Level, false, this);

      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
    }
  }

  // If nothing happened still, try promoting the operation.
  if (!RV.getNode()) {
    switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
      RV = PromoteIntBinOp(SDValue(N, 0));
      break;
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      RV = PromoteIntShiftOp(SDValue(N, 0));
      break;
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      RV = PromoteExtend(SDValue(N, 0));
      break;
    case ISD::LOAD:
      // PromoteLoad replaces N in place, so the "result" is N itself.
      if (PromoteLoad(SDValue(N, 0)))
        RV = SDValue(N, 0);
      break;
    }
  }

  // If N is a commutative binary node, try to eliminate it if the commuted
  // version is already present in the DAG.
  if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
      N->getNumValues() == 1) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    // Constant operands are canonicalized to RHS, so only look for the
    // swapped form when swapping would not break that canonicalization.
    if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
      SDValue Ops[] = {N1, N0};
      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
                                            N->getFlags());
      if (CSENode)
        return SDValue(CSENode, 0);
    }
  }

  return RV;
}

/// Given a node, return its input chain if it has one, otherwise return a null
/// sd operand.
static SDValue getInputChainForNode(SDNode *N) {
  if (unsigned NumOps = N->getNumOperands()) {
    // Chains are conventionally operand 0 or the last operand; check those
    // first before scanning the middle operands.
    if (N->getOperand(0).getValueType() == MVT::Other)
      return N->getOperand(0);
    if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
      return N->getOperand(NumOps-1);
    for (unsigned i = 1; i < NumOps-1; ++i)
      if (N->getOperand(i).getValueType() == MVT::Other)
        return N->getOperand(i);
  }
  return SDValue();
}

/// Combine a TokenFactor: drop redundant operands (entry tokens, duplicates),
/// inline single-use TokenFactor operands, and prune operands that are already
/// reachable through another operand's chain.
SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
  // If N has two operands, where one has an input chain equal to the other,
  // the 'other' chain is redundant.
  if (N->getNumOperands() == 2) {
    if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
      return N->getOperand(0);
    if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
      return N->getOperand(1);
  }

  // Don't simplify token factors if optnone.
  if (OptLevel == CodeGenOpt::None)
    return SDValue();

  // If the sole user is a token factor, we should make sure we have a
  // chance to merge them together. This prevents TF chains from inhibiting
  // optimizations.
  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
    AddToWorklist(*(N->use_begin()));

  SmallVector<SDNode *, 8> TFs;   // List of token factors to visit.
  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
  SmallPtrSet<SDNode*, 16> SeenOps;
  bool Changed = false;           // If we should replace this token factor.

  // Start out with this token factor.
  TFs.push_back(N);

  // Iterate through token factors. The TFs grows when new token factors are
  // encountered.
  for (unsigned i = 0; i < TFs.size(); ++i) {
    // Limit number of nodes to inline, to avoid quadratic compile times.
    // We have to add the outstanding Token Factors to Ops, otherwise we might
    // drop Ops from the resulting Token Factors.
    if (Ops.size() > TokenFactorInlineLimit) {
      for (unsigned j = i; j < TFs.size(); j++)
        Ops.emplace_back(TFs[j], 0);
      // Drop unprocessed Token Factors from TFs, so we do not add them to the
      // combiner worklist later.
      TFs.resize(i);
      break;
    }

    SDNode *TF = TFs[i];
    // Check each of the operands.
    for (const SDValue &Op : TF->op_values()) {
      switch (Op.getOpcode()) {
      case ISD::EntryToken:
        // Entry tokens don't need to be added to the list. They are
        // redundant.
        Changed = true;
        break;

      case ISD::TokenFactor:
        if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
          // Queue up for processing.
          TFs.push_back(Op.getNode());
          Changed = true;
          break;
        }
        LLVM_FALLTHROUGH;

      default:
        // Only add if it isn't already in the list.
        if (SeenOps.insert(Op.getNode()).second)
          Ops.push_back(Op);
        else
          Changed = true;
        break;
      }
    }
  }

  // Re-visit inlined Token Factors, to clean them up in case they have been
  // removed. Skip the first Token Factor, as this is the current node.
  for (unsigned i = 1, e = TFs.size(); i < e; i++)
    AddToWorklist(TFs[i]);

  // Remove Nodes that are chained to another node in the list. Do so
  // by walking up chains breadth-first stopping when we've seen
  // another operand. In general we must climb to the EntryNode, but we can
  // exit early if we find all remaining work is associated with just one
  // operand as no further pruning is possible.

  // List of nodes to search through and original Ops from which they originate.
  SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
  SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
  SmallPtrSet<SDNode *, 16> SeenChains;
  bool DidPruneOps = false;

  unsigned NumLeftToConsider = 0;
  for (const SDValue &Op : Ops) {
    Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
    OpWorkCount.push_back(1);
  }

  auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
    // If this is an Op, we can remove the op from the list. Remark any
    // search associated with it as from the current OpNumber.
    if (SeenOps.count(Op) != 0) {
      Changed = true;
      DidPruneOps = true;
      unsigned OrigOpNumber = 0;
      while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
        OrigOpNumber++;
      assert((OrigOpNumber != Ops.size()) &&
             "expected to find TokenFactor Operand");
      // Re-mark worklist from OrigOpNumber to OpNumber
      for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
        if (Worklist[i].second == OrigOpNumber) {
          Worklist[i].second = OpNumber;
        }
      }
      OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
      OpWorkCount[OrigOpNumber] = 0;
      NumLeftToConsider--;
    }
    // Add if it's a new chain
    if (SeenChains.insert(Op).second) {
      OpWorkCount[OpNumber]++;
      Worklist.push_back(std::make_pair(Op, OpNumber));
    }
  };

  // Bound the walk at 1024 visited nodes to keep compile time in check.
  for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
    // We need to consider at least 2 Ops to prune.
    if (NumLeftToConsider <= 1)
      break;
    auto CurNode = Worklist[i].first;
    auto CurOpNumber = Worklist[i].second;
    assert((OpWorkCount[CurOpNumber] > 0) &&
           "Node should not appear in worklist");
    switch (CurNode->getOpcode()) {
    case ISD::EntryToken:
      // Hitting EntryToken is the only way for the search to terminate
      // without hitting another operand's search. Prevent us from marking
      // this operand considered.
      NumLeftToConsider++;
      break;
    case ISD::TokenFactor:
      for (const SDValue &Op : CurNode->op_values())
        AddToWorklist(i, Op.getNode(), CurOpNumber);
      break;
    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END:
    case ISD::CopyFromReg:
    case ISD::CopyToReg:
      // These chain-carrying nodes keep their chain in operand 0.
      AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
      break;
    default:
      if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
        AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
      break;
    }
    OpWorkCount[CurOpNumber]--;
    if (OpWorkCount[CurOpNumber] == 0)
      NumLeftToConsider--;
  }

  // If we've changed things around then replace token factor.
  if (Changed) {
    SDValue Result;
    if (Ops.empty()) {
      // The entry token is the only possible outcome.
      Result = DAG.getEntryNode();
    } else {
      if (DidPruneOps) {
        // Keep only operands whose chains were never reached from another
        // operand during the walk above.
        SmallVector<SDValue, 8> PrunedOps;
        for (const SDValue &Op : Ops) {
          if (SeenChains.count(Op.getNode()) == 0)
            PrunedOps.push_back(Op);
        }
        Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
      } else {
        Result = DAG.getTokenFactor(SDLoc(N), Ops);
      }
    }
    return Result;
  }
  return SDValue();
}

/// MERGE_VALUES can always be eliminated.
SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
  WorklistRemover DeadNodes(*this);
  // Replacing results may cause a different MERGE_VALUES to suddenly
  // be CSE'd with N, and carry its uses with it. Iterate until no
  // uses remain, to ensure that the node can be safely deleted.
  // First add the users of this node to the work list so that they
  // can be tried again once they have new operands.
  AddUsersToWorklist(N);
  do {
    // Do as a single replacement to avoid rewalking use lists.
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
      Ops.push_back(N->getOperand(i));
    DAG.ReplaceAllUsesWith(N, Ops.data());
  } while (!N->use_empty());
  deleteAndRecombine(N);
  return SDValue(N, 0);   // Return N so it doesn't get rechecked!
}

/// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a
/// ConstantSDNode pointer else nullptr.
static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
  return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
}

/// Fold a binop with a select-of-constants operand by pushing the binop into
/// both arms of the select, eliminating the binop entirely.
SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
  assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
         "Unexpected binary operator");

  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  // TODO: Handle ISD::SELECT_CC.
  unsigned SelOpNo = 0;
  SDValue Sel = BO->getOperand(0);
  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
    SelOpNo = 1;
    Sel = BO->getOperand(1);
  }

  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
    return SDValue();

  SDValue CT = Sel.getOperand(1);
  if (!isConstantOrConstantVector(CT, true) &&
      !isConstantFPBuildVectorOrConstantFP(CT))
    return SDValue();

  SDValue CF = Sel.getOperand(2);
  if (!isConstantOrConstantVector(CF, true) &&
      !isConstantFPBuildVectorOrConstantFP(CF))
    return SDValue();

  // Bail out if any constants are opaque because we can't constant fold those.
  // The exception is "and" and "or" with either 0 or -1 in which case we can
  // propagate non constant operands into select. I.e.:
  //   and (select Cond, 0, -1), X --> select Cond, 0, X
  //   or X, (select Cond, -1, 0) --> select Cond, -1, X
  auto BinOpcode = BO->getOpcode();
  bool CanFoldNonConst =
      (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
      (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
      (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));

  SDValue CBO = BO->getOperand(SelOpNo ^ 1);
  if (!CanFoldNonConst &&
      !isConstantOrConstantVector(CBO, true) &&
      !isConstantFPBuildVectorOrConstantFP(CBO))
    return SDValue();

  EVT VT = Sel.getValueType();

  // In case of shift value and shift amount may have different VT. For instance
  // on x86 shift amount is i8 regardless of LHS type. Bail out if we have
  // swapped operands and value types do not match. NB: x86 is fine if operands
  // are not swapped with shift amount VT being not bigger than shifted value.
  // TODO: that is possible to check for a shift operation, correct VTs and
  // still perform optimization on x86 if needed.
  if (SelOpNo && VT != CBO.getValueType())
    return SDValue();

  // We have a select-of-constants followed by a binary operator with a
  // constant. Eliminate the binop by pulling the constant math into the select.
  // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
  SDLoc DL(Sel);
  SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
                          : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
  if (!CanFoldNonConst && !NewCT.isUndef() &&
      !isConstantOrConstantVector(NewCT, true) &&
      !isConstantFPBuildVectorOrConstantFP(NewCT))
    return SDValue();

  SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
                          : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
  if (!CanFoldNonConst && !NewCF.isUndef() &&
      !isConstantOrConstantVector(NewCF, true) &&
      !isConstantFPBuildVectorOrConstantFP(NewCF))
    return SDValue();

  SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
  SelectOp->setFlags(BO->getFlags());
  return SelectOp;
}

/// Fold add/sub of a zero-extended inverted low bit (zext (seteq (X & 1), 0))
/// into sub/add of the low bit itself with an adjusted constant.
static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Expecting add or sub");

  // Match a constant operand and a zext operand for the math instruction:
  //   add Z, C
  //   sub C, Z
  bool IsAdd = N->getOpcode() == ISD::ADD;
  SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
  SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
  auto *CN = dyn_cast<ConstantSDNode>(C);
  if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // Match the zext operand as a setcc of a boolean.
  if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
      Z.getOperand(0).getValueType() != MVT::i1)
    return SDValue();

  // Match the compare as: setcc (X & 1), 0, eq.
  SDValue SetCC = Z.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
      SetCC.getOperand(0).getOpcode() != ISD::AND ||
      !isOneConstant(SetCC.getOperand(0).getOperand(1)))
    return SDValue();

  // We are adding/subtracting a constant and an inverted low bit. Turn that
  // into a subtract/add of the low bit with incremented/decremented constant:
  //   add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
  //   sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
  EVT VT = C.getValueType();
  SDLoc DL(N);
  SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
  SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
                       DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
  return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
}

/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Expecting add or sub");

  // We need a constant operand for the add/sub, and the other operand is a
  // logical shift right: add (srl), C or sub C, (srl).
  bool IsAdd = N->getOpcode() == ISD::ADD;
  SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
  SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
  if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
      ShiftOp.getOpcode() != ISD::SRL)
    return SDValue();

  // The shift must be of a 'not' value.
  SDValue Not = ShiftOp.getOperand(0);
  if (!Not.hasOneUse() || !isBitwiseNot(Not))
    return SDValue();

  // The shift must be moving the sign bit to the least-significant-bit.
  EVT VT = ShiftOp.getValueType();
  SDValue ShAmt = ShiftOp.getOperand(1);
  ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
  if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
    return SDValue();

  // Eliminate the 'not' by adjusting the shift and add/sub constant:
  //   add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
  //   sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
  SDLoc DL(N);
  auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
  SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
  if (SDValue NewC =
          DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
                                     {ConstantOp, DAG.getConstant(1, DL, VT)}))
    return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
  return SDValue();
}

/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
/// are no common bits set in the operands).
2165 SDValue DAGCombiner::visitADDLike(SDNode *N) { 2166 SDValue N0 = N->getOperand(0); 2167 SDValue N1 = N->getOperand(1); 2168 EVT VT = N0.getValueType(); 2169 SDLoc DL(N); 2170 2171 // fold vector ops 2172 if (VT.isVector()) { 2173 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 2174 return FoldedVOp; 2175 2176 // fold (add x, 0) -> x, vector edition 2177 if (ISD::isBuildVectorAllZeros(N1.getNode())) 2178 return N0; 2179 if (ISD::isBuildVectorAllZeros(N0.getNode())) 2180 return N1; 2181 } 2182 2183 // fold (add x, undef) -> undef 2184 if (N0.isUndef()) 2185 return N0; 2186 2187 if (N1.isUndef()) 2188 return N1; 2189 2190 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 2191 // canonicalize constant to RHS 2192 if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2193 return DAG.getNode(ISD::ADD, DL, VT, N1, N0); 2194 // fold (add c1, c2) -> c1+c2 2195 return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}); 2196 } 2197 2198 // fold (add x, 0) -> x 2199 if (isNullConstant(N1)) 2200 return N0; 2201 2202 if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { 2203 // fold ((A-c1)+c2) -> (A+(c2-c1)) 2204 if (N0.getOpcode() == ISD::SUB && 2205 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) { 2206 SDValue Sub = 2207 DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)}); 2208 assert(Sub && "Constant folding failed"); 2209 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub); 2210 } 2211 2212 // fold ((c1-A)+c2) -> (c1+c2)-A 2213 if (N0.getOpcode() == ISD::SUB && 2214 isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { 2215 SDValue Add = 2216 DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)}); 2217 assert(Add && "Constant folding failed"); 2218 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1)); 2219 } 2220 2221 // add (sext i1 X), 1 -> zext (not i1 X) 2222 // We don't transform this pattern: 2223 // add (zext i1 X), -1 -> sext (not i1 X) 2224 // because most (?) 
targets generate better code for the zext form. 2225 if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && 2226 isOneOrOneSplat(N1)) { 2227 SDValue X = N0.getOperand(0); 2228 if ((!LegalOperations || 2229 (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && 2230 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && 2231 X.getScalarValueSizeInBits() == 1) { 2232 SDValue Not = DAG.getNOT(DL, X, X.getValueType()); 2233 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); 2234 } 2235 } 2236 2237 // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is 2238 // equivalent to (add x, c0). 2239 if (N0.getOpcode() == ISD::OR && 2240 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) && 2241 DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { 2242 if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, 2243 {N1, N0.getOperand(1)})) 2244 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); 2245 } 2246 } 2247 2248 if (SDValue NewSel = foldBinOpIntoSelect(N)) 2249 return NewSel; 2250 2251 // reassociate add 2252 if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) { 2253 if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) 2254 return RADD; 2255 } 2256 // fold ((0-A) + B) -> B-A 2257 if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0))) 2258 return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); 2259 2260 // fold (A + (0-B)) -> A-B 2261 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0))) 2262 return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1)); 2263 2264 // fold (A+(B-A)) -> B 2265 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1)) 2266 return N1.getOperand(0); 2267 2268 // fold ((B-A)+A) -> B 2269 if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1)) 2270 return N0.getOperand(0); 2271 2272 // fold ((A-B)+(C-A)) -> (C-B) 2273 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && 2274 N0.getOperand(0) == N1.getOperand(1)) 
2275 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2276 N0.getOperand(1)); 2277 2278 // fold ((A-B)+(B-C)) -> (A-C) 2279 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB && 2280 N0.getOperand(1) == N1.getOperand(0)) 2281 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), 2282 N1.getOperand(1)); 2283 2284 // fold (A+(B-(A+C))) to (B-C) 2285 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && 2286 N0 == N1.getOperand(1).getOperand(0)) 2287 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2288 N1.getOperand(1).getOperand(1)); 2289 2290 // fold (A+(B-(C+A))) to (B-C) 2291 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && 2292 N0 == N1.getOperand(1).getOperand(1)) 2293 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), 2294 N1.getOperand(1).getOperand(0)); 2295 2296 // fold (A+((B-A)+or-C)) to (B+or-C) 2297 if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) && 2298 N1.getOperand(0).getOpcode() == ISD::SUB && 2299 N0 == N1.getOperand(0).getOperand(1)) 2300 return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0), 2301 N1.getOperand(1)); 2302 2303 // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant 2304 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { 2305 SDValue N00 = N0.getOperand(0); 2306 SDValue N01 = N0.getOperand(1); 2307 SDValue N10 = N1.getOperand(0); 2308 SDValue N11 = N1.getOperand(1); 2309 2310 if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10)) 2311 return DAG.getNode(ISD::SUB, DL, VT, 2312 DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), 2313 DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); 2314 } 2315 2316 // fold (add (umax X, C), -C) --> (usubsat X, C) 2317 if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) { 2318 auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { 2319 return (!Max && !Op) || 2320 (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue())); 2321 }; 
2322 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT, 2323 /*AllowUndefs*/ true)) 2324 return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), 2325 N0.getOperand(1)); 2326 } 2327 2328 if (SimplifyDemandedBits(SDValue(N, 0))) 2329 return SDValue(N, 0); 2330 2331 if (isOneOrOneSplat(N1)) { 2332 // fold (add (xor a, -1), 1) -> (sub 0, a) 2333 if (isBitwiseNot(N0)) 2334 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 2335 N0.getOperand(0)); 2336 2337 // fold (add (add (xor a, -1), b), 1) -> (sub b, a) 2338 if (N0.getOpcode() == ISD::ADD || 2339 N0.getOpcode() == ISD::UADDO || 2340 N0.getOpcode() == ISD::SADDO) { 2341 SDValue A, Xor; 2342 2343 if (isBitwiseNot(N0.getOperand(0))) { 2344 A = N0.getOperand(1); 2345 Xor = N0.getOperand(0); 2346 } else if (isBitwiseNot(N0.getOperand(1))) { 2347 A = N0.getOperand(0); 2348 Xor = N0.getOperand(1); 2349 } 2350 2351 if (Xor) 2352 return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0)); 2353 } 2354 2355 // Look for: 2356 // add (add x, y), 1 2357 // And if the target does not like this form then turn into: 2358 // sub y, (xor x, -1) 2359 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && 2360 N0.getOpcode() == ISD::ADD) { 2361 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), 2362 DAG.getAllOnesConstant(DL, VT)); 2363 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not); 2364 } 2365 } 2366 2367 // (x - y) + -1 -> add (xor y, -1), x 2368 if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && 2369 isAllOnesOrAllOnesSplat(N1)) { 2370 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1); 2371 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); 2372 } 2373 2374 if (SDValue Combined = visitADDLikeCommutative(N0, N1, N)) 2375 return Combined; 2376 2377 if (SDValue Combined = visitADDLikeCommutative(N1, N0, N)) 2378 return Combined; 2379 2380 return SDValue(); 2381 } 2382 2383 SDValue DAGCombiner::visitADD(SDNode *N) { 2384 SDValue N0 = N->getOperand(0); 
2385 SDValue N1 = N->getOperand(1); 2386 EVT VT = N0.getValueType(); 2387 SDLoc DL(N); 2388 2389 if (SDValue Combined = visitADDLike(N)) 2390 return Combined; 2391 2392 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) 2393 return V; 2394 2395 if (SDValue V = foldAddSubOfSignBit(N, DAG)) 2396 return V; 2397 2398 // fold (a+b) -> (a|b) iff a and b share no bits. 2399 if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && 2400 DAG.haveNoCommonBitsSet(N0, N1)) 2401 return DAG.getNode(ISD::OR, DL, VT, N0, N1); 2402 2403 // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). 2404 if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) { 2405 APInt C0 = N0->getConstantOperandAPInt(0); 2406 APInt C1 = N1->getConstantOperandAPInt(0); 2407 return DAG.getVScale(DL, VT, C0 + C1); 2408 } 2409 2410 // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2) 2411 if ((N0.getOpcode() == ISD::ADD) && 2412 (N0.getOperand(1).getOpcode() == ISD::VSCALE) && 2413 (N1.getOpcode() == ISD::VSCALE)) { 2414 auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0); 2415 auto VS1 = N1->getConstantOperandAPInt(0); 2416 auto VS = DAG.getVScale(DL, VT, VS0 + VS1); 2417 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS); 2418 } 2419 2420 return SDValue(); 2421 } 2422 2423 SDValue DAGCombiner::visitADDSAT(SDNode *N) { 2424 unsigned Opcode = N->getOpcode(); 2425 SDValue N0 = N->getOperand(0); 2426 SDValue N1 = N->getOperand(1); 2427 EVT VT = N0.getValueType(); 2428 SDLoc DL(N); 2429 2430 // fold vector ops 2431 if (VT.isVector()) { 2432 // TODO SimplifyVBinOp 2433 2434 // fold (add_sat x, 0) -> x, vector edition 2435 if (ISD::isBuildVectorAllZeros(N1.getNode())) 2436 return N0; 2437 if (ISD::isBuildVectorAllZeros(N0.getNode())) 2438 return N1; 2439 } 2440 2441 // fold (add_sat x, undef) -> -1 2442 if (N0.isUndef() || N1.isUndef()) 2443 return DAG.getAllOnesConstant(DL, VT); 2444 2445 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { 2446 // canonicalize constant 
to RHS 2447 if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) 2448 return DAG.getNode(Opcode, DL, VT, N1, N0); 2449 // fold (add_sat c1, c2) -> c3 2450 return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}); 2451 } 2452 2453 // fold (add_sat x, 0) -> x 2454 if (isNullConstant(N1)) 2455 return N0; 2456 2457 // If it cannot overflow, transform into an add. 2458 if (Opcode == ISD::UADDSAT) 2459 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) 2460 return DAG.getNode(ISD::ADD, DL, VT, N0, N1); 2461 2462 return SDValue(); 2463 } 2464 2465 static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { 2466 bool Masked = false; 2467 2468 // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. 2469 while (true) { 2470 if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { 2471 V = V.getOperand(0); 2472 continue; 2473 } 2474 2475 if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { 2476 Masked = true; 2477 V = V.getOperand(0); 2478 continue; 2479 } 2480 2481 break; 2482 } 2483 2484 // If this is not a carry, return. 2485 if (V.getResNo() != 1) 2486 return SDValue(); 2487 2488 if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && 2489 V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) 2490 return SDValue(); 2491 2492 EVT VT = V.getNode()->getValueType(0); 2493 if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT)) 2494 return SDValue(); 2495 2496 // If the result is masked, then no matter what kind of bool it is we can 2497 // return. If it isn't, then we need to make sure the bool type is either 0 or 2498 // 1 and not other values. 2499 if (Masked || 2500 TLI.getBooleanContents(V.getValueType()) == 2501 TargetLoweringBase::ZeroOrOneBooleanContent) 2502 return V; 2503 2504 return SDValue(); 2505 } 2506 2507 /// Given the operands of an add/sub operation, see if the 2nd operand is a 2508 /// masked 0/1 whose source operand is actually known to be 0/-1. 
If so, invert 2509 /// the opcode and bypass the mask operation. 2510 static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1, 2511 SelectionDAG &DAG, const SDLoc &DL) { 2512 if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1))) 2513 return SDValue(); 2514 2515 EVT VT = N0.getValueType(); 2516 if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits()) 2517 return SDValue(); 2518 2519 // add N0, (and (AssertSext X, i1), 1) --> sub N0, X 2520 // sub N0, (and (AssertSext X, i1), 1) --> add N0, X 2521 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0)); 2522 } 2523 2524 /// Helper for doing combines based on N0 and N1 being added to each other. 2525 SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1, 2526 SDNode *LocReference) { 2527 EVT VT = N0.getValueType(); 2528 SDLoc DL(LocReference); 2529 2530 // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) 2531 if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && 2532 isNullOrNullSplat(N1.getOperand(0).getOperand(0))) 2533 return DAG.getNode(ISD::SUB, DL, VT, N0, 2534 DAG.getNode(ISD::SHL, DL, VT, 2535 N1.getOperand(0).getOperand(1), 2536 N1.getOperand(1))); 2537 2538 if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL)) 2539 return V; 2540 2541 // Look for: 2542 // add (add x, 1), y 2543 // And if the target does not like this form then turn into: 2544 // sub y, (xor x, -1) 2545 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() && 2546 N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) { 2547 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), 2548 DAG.getAllOnesConstant(DL, VT)); 2549 return DAG.getNode(ISD::SUB, DL, VT, N1, Not); 2550 } 2551 2552 // Hoist one-use subtraction by non-opaque constant: 2553 // (x - C) + y -> (x + y) - C 2554 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. 
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
  }
  // Hoist one-use subtraction from non-opaque constant:
  //   (C - x) + y  ->  (y - x) + C
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
  }

  // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
  // rather than 'add 0/-1' (the zext should get folded).
  // add (sext i1 Y), X --> sub X, (zext i1 Y)
  if (N0.getOpcode() == ISD::SIGN_EXTEND &&
      N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
      TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
  }

  // add X, (sextinreg Y i1) -> sub X, (and Y 1)
  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
    if (TN->getVT() == MVT::i1) {
      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                 DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
    }
  }

  // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
      N1.getResNo() == 0)
    return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
                       N0, N1.getOperand(0), N1.getOperand(2));

  // (add X, Carry) -> (addcarry X, 0, Carry)
  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
    if (SDValue Carry = getAsCarry(TLI, N1))
      return
DAG.getNode(ISD::ADDCARRY, DL,
                         DAG.getVTList(VT, Carry.getValueType()), N0,
                         DAG.getConstant(0, DL, VT), Carry);

  return SDValue();
}

/// Combine an ISD::ADDC node: drop a dead carry-out, canonicalize a constant
/// operand to the RHS, and fold cases where the carry-out is known false.
SDValue DAGCombiner::visitADDC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an ADD.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // canonicalize constant to RHS.
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);

  // fold (addc x, 0) -> x + no carry out
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
                                        DL, MVT::Glue));

  // If it cannot overflow, transform into an add.
  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}

/// Unconditionally invert the boolean value \p V by XOR'ing it with the
/// target's "true" constant for its type (1 or all-ones, depending on
/// getBooleanContents).
static SDValue flipBoolean(SDValue V, const SDLoc &DL,
                           SelectionDAG &DAG, const TargetLowering &TLI) {
  EVT VT = V.getValueType();

  SDValue Cst;
  switch (TLI.getBooleanContents(VT)) {
  case TargetLowering::ZeroOrOneBooleanContent:
  case TargetLowering::UndefinedBooleanContent:
    Cst = DAG.getConstant(1, DL, VT);
    break;
  case TargetLowering::ZeroOrNegativeOneBooleanContent:
    Cst = DAG.getAllOnesConstant(DL, VT);
    break;
  }

  return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
}

/**
 * Flips a boolean if it is cheaper to compute. If the Force parameter is set,
 * then the flip also occurs if computing the inverse is the same cost.
 * This function returns an empty SDValue in case it cannot flip the boolean
 * without increasing the cost of the computation. If you want to flip a boolean
 * no matter what, use flipBoolean.
 */
static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
                                  const TargetLowering &TLI,
                                  bool Force) {
  // A constant can always be inverted for free.
  if (Force && isa<ConstantSDNode>(V))
    return flipBoolean(V, SDLoc(V), DAG, TLI);

  if (V.getOpcode() != ISD::XOR)
    return SDValue();

  ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
  if (!Const)
    return SDValue();

  EVT VT = V.getValueType();

  // Decide whether (xor V0, Const) already computes NOT of a boolean, in
  // which case the flip is simply V0.
  bool IsFlip = false;
  switch(TLI.getBooleanContents(VT)) {
  case TargetLowering::ZeroOrOneBooleanContent:
    IsFlip = Const->isOne();
    break;
  case TargetLowering::ZeroOrNegativeOneBooleanContent:
    IsFlip = Const->isAllOnesValue();
    break;
  case TargetLowering::UndefinedBooleanContent:
    IsFlip = (Const->getAPIntValue() & 0x01) == 1;
    break;
  }

  if (IsFlip)
    return V.getOperand(0);
  if (Force)
    return flipBoolean(V, SDLoc(V), DAG, TLI);
  return SDValue();
}

/// Combine an ISD::SADDO or ISD::UADDO node.
SDValue DAGCombiner::visitADDO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SADDO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // If the flag result is dead, turn this into an ADD.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getUNDEF(CarryVT));

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);

  // fold (addo x, 0) -> x + no carry out
  if (isNullOrNullSplat(N1))
    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

  if (!IsSigned) {
    // If it cannot overflow, transform into an add.
    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
      return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                       DAG.getConstant(0, DL, CarryVT));

    // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
    if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
      SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
      return CombineTo(N, Sub,
                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
    }

    // Try the commuted-operand variants of the UADDO-specific combines.
    if (SDValue Combined = visitUADDOLike(N0, N1, N))
      return Combined;

    if (SDValue Combined = visitUADDOLike(N1, N0, N))
      return Combined;
  }

  return SDValue();
}

/// Combines for ISD::UADDO where \p N0 / \p N1 may be either operand order of
/// the original node \p N.
SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
  // If Y + 1 cannot overflow.
  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
    SDValue Y = N1.getOperand(0);
    SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
    if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
                         N1.getOperand(2));
  }

  // (uaddo X, Carry) -> (addcarry X, 0, Carry)
  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
    if (SDValue Carry = getAsCarry(TLI, N1))
      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
                         DAG.getConstant(0, SDLoc(N), VT), Carry);

  return SDValue();
}

/// Combine an ISD::ADDE node.
SDValue DAGCombiner::visitADDE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // canonicalize constant to RHS
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
                       N1, N0, CarryIn);

  // fold (adde x, y, false) -> (addc x, y)
  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);

  return SDValue();
}

/// Combine an ISD::ADDCARRY node.
SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);
  SDLoc DL(N);

  // canonicalize constant to RHS
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);

  // fold (addcarry x, y, false) -> (uaddo x, y)
  if (isNullConstant(CarryIn)) {
    if (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
      return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
  }

  // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
  if (isNullConstant(N0) && isNullConstant(N1)) {
    EVT VT = N0.getValueType();
    EVT CarryVT = CarryIn.getValueType();
    SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
    AddToWorklist(CarryExt.getNode());
    return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
                                    DAG.getConstant(1, DL, VT)),
                     DAG.getConstant(0, DL, CarryVT));
  }

  if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
    return Combined;

  if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
    return Combined;

  return SDValue();
}

/**
 * If we are facing some sort of diamond carry propagation pattern try to
 * break it up to generate something like:
 *   (addcarry X, 0, (addcarry A, B, Z):Carry)
 *
 * The end result is usually an increase in operations required, but because
 * the carry is now linearized, other transforms can kick in and optimize the
 * DAG.
 *
 * Patterns typically look something like
 *            (uaddo A, B)
 *             /       \
 *          Carry      Sum
 *            |          \
 *            |    (addcarry *, 0, Z)
 *            |       /
 *             \   Carry
 *              |  /
 *   (addcarry X, *, *)
 *
 * But numerous variations exist. Our goal is to identify A, B, X and Z and
 * produce a combine with a single path for carry propagation.
 */
static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
                                      SDValue X, SDValue Carry0, SDValue Carry1,
                                      SDNode *N) {
  // Both inputs must be the flag (carry) result of their defining nodes.
  if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
    return SDValue();
  if (Carry1.getOpcode() != ISD::UADDO)
    return SDValue();

  SDValue Z;

  /**
   * First look for a suitable Z.
   * It will present itself in the form of
   * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
   */
  if (Carry0.getOpcode() == ISD::ADDCARRY &&
      isNullConstant(Carry0.getOperand(1))) {
    Z = Carry0.getOperand(2);
  } else if (Carry0.getOpcode() == ISD::UADDO &&
             isOneConstant(Carry0.getOperand(1))) {
    // (uaddo Y, 1) is (addcarry Y, 0, true); materialize the constant true.
    EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
    Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
  } else {
    // We couldn't find a suitable Z.
    return SDValue();
  }


  // Rebuild the diamond as (addcarry X, 0, (addcarry A, B, Z):Carry).
  auto cancelDiamond = [&](SDValue A,SDValue B) {
    SDLoc DL(N);
    SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
    Combiner.AddToWorklist(NewY.getNode());
    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
                       DAG.getConstant(0, DL, X.getValueType()),
                       NewY.getValue(1));
  };

  /**
   *         (uaddo A, B)
   *              |
   *             Sum
   *              |
   *      (addcarry *, 0, Z)
   */
  if (Carry0.getOperand(0) == Carry1.getValue(0)) {
    return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
  }

  /**
   *      (addcarry A, 0, Z)
   *              |
   *             Sum
   *              |
   *         (uaddo *, B)
   */
  if (Carry1.getOperand(0) == Carry0.getValue(0)) {
    return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
  }

  if (Carry1.getOperand(1) == Carry0.getValue(0)) {
    return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
  }

  return SDValue();
}

// If we are facing some sort of diamond carry/borrow in/out pattern try to
// match patterns like:
//
//          (uaddo A, B)            CarryIn
//            |  \                     |
//            |   \                    |
//    PartialSum   PartialCarryOutX   /
//            |        |             /
//            |    ____|____________/
//            |   /    |
//     (uaddo *, *)    \________
//       |  \                   \
//       |   \                   |
//       | PartialCarryOutY      |
//       |     \                 |
//       |      \                /
//   AddCarrySum |    ______/
//                 |  /
//   CarryOut = (or *, *)
//
// And generate ADDCARRY (or SUBCARRY) with two result values:
//
//    {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
//
// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
// a single path for carry/borrow out propagation:
static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
                                   const TargetLowering &TLI, SDValue Carry0,
                                   SDValue Carry1, SDNode *N) {
  // Both values must be the flag (overflow) results of their defining nodes.
  if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
    return SDValue();
  unsigned Opcode = Carry0.getOpcode();
  if (Opcode != Carry1.getOpcode())
    return SDValue();
  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
    return SDValue();

  // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
  // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in
  // the above ASCII art.)
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    std::swap(Carry0, Carry1);
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    return SDValue();

  // The carry in value must be on the righthand side for subtraction.
  unsigned CarryInOperandNum =
      Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
  if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
    return SDValue();
  SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);

  unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
  if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
    return SDValue();

  // Verify that the carry/borrow in is plausibly a carry/borrow bit.
  // TODO: make getAsCarry() aware of how partial carries are merged.
  if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();
  CarryIn = CarryIn.getOperand(0);
  if (CarryIn.getValueType() != MVT::i1)
    return SDValue();

  SDLoc DL(N);
  SDValue Merged =
      DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
                  Carry0.getOperand(1), CarryIn);

  // Please note that because we have proven that the result of the UADDO/USUBO
  // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
  // therefore prove that if the first UADDO/USUBO overflows, the second
  // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
  // maximum value.
  //
  //   0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
  //   0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
  //
  // This is important because it means that OR and XOR can be used to merge
  // carry flags; and that AND can return a constant zero.
  //
  // TODO: match other operations that can merge flags (ADD, etc)
  DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
  if (N->getOpcode() == ISD::AND)
    return DAG.getConstant(0, DL, MVT::i1);
  return Merged.getValue(1);
}

/// Combines for ISD::ADDCARRY where \p N0 / \p N1 may be either operand order
/// of the original node \p N.
SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                       SDNode *N) {
  // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
  if (isBitwiseNot(N0))
    if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
      SDLoc DL(N);
      SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
                                N0.getOperand(0), NotC);
      return CombineTo(N, Sub,
                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
    }

  // Iff the flag result is dead:
  // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
  // Don't do this if the Carry comes from the uaddo.
  // It won't remove the uaddo
  // or the dependency between the instructions.
  if ((N0.getOpcode() == ISD::ADD ||
       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
        N0.getValue(1) != CarryIn)) &&
      isNullConstant(N1) && !N->hasAnyUseOfValue(1))
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
                       N0.getOperand(0), N0.getOperand(1), CarryIn);

  /**
   * When one of the addcarry arguments is itself a carry, we may be facing
   * a diamond carry propagation. In which case we try to transform the DAG
   * to ensure linear carry propagation if that is possible.
   */
  if (auto Y = getAsCarry(TLI, N1)) {
    // Because both are carries, Y and Z can be swapped.
    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
      return R;
    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
      return R;
  }

  return SDValue();
}

// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
                             SelectionDAG &DAG, bool LegalOperations) {
  if (!VT.isVector())
    return DAG.getConstant(0, DL, VT);
  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
    return DAG.getConstant(0, DL, VT);
  return SDValue();
}

/// Combine an ISD::SUB node.
SDValue DAGCombiner::visitSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (sub x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (sub x, x) -> 0
  // FIXME: Refactor this and xor and other similar operations together.
  if (N0 == N1)
    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

  // fold (sub c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

  // fold (sub x, c) -> (add x, -c)
  if (N1C) {
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
  }

  if (isNullOrNullSplat(N0)) {
    unsigned BitWidth = VT.getScalarSizeInBits();
    // Right-shifting everything out but the sign bit followed by negation is
    // the same as flipping arithmetic/logical shift type without the negation:
    // -(X >>u 31) -> (X >>s 31)
    // -(X >>s 31) -> (X >>u 31)
    if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
      ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
      if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
        auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
        if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
          return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
      }
    }

    // 0 - X --> 0 if the sub is NUW.
    if (N->getFlags().hasNoUnsignedWrap())
      return N0;

    if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
      // N1 is either 0 or the minimum signed value. If the sub is NSW, then
      // N1 must be 0 because negating the minimum signed value is undefined.
      if (N->getFlags().hasNoSignedWrap())
        return N0;

      // 0 - X --> X if X is 0 or the minimum signed value.
      return N1;
    }
  }

  // Canonicalize (sub -1, x) -> ~x, i.e.
  // (xor x, -1)
  if (isAllOnesOrAllOnesSplat(N0))
    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

  // fold (A - (0-B)) -> A+B
  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
    return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));

  // fold A-(A-B) -> B
  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
    return N1.getOperand(1);

  // fold (A+B)-A -> B
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
    return N0.getOperand(1);

  // fold (A+B)-B -> A
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
    return N0.getOperand(0);

  // fold (A+C1)-C2 -> A+(C1-C2)
  if (N0.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
  }

  // fold C2-(A+C1) -> (C2-C1)-A
  if (N1.getOpcode() == ISD::ADD) {
    SDValue N11 = N1.getOperand(1);
    if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
        isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
      SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11});
      assert(NewC && "Constant folding failed");
      return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
    }
  }

  // fold (A-C1)-C2 -> A-(C1+C2)
  if (N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
  }

  // fold (c1-A)-c2 ->
  // (c1-c2)-A
  if (N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
  }

  // fold ((A+(B+or-C))-B) -> A+or-C
  if (N0.getOpcode() == ISD::ADD &&
      (N0.getOperand(1).getOpcode() == ISD::SUB ||
       N0.getOperand(1).getOpcode() == ISD::ADD) &&
      N0.getOperand(1).getOperand(0) == N1)
    return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(1));

  // fold ((A+(C+B))-B) -> A+C
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold ((A-(B-C))-C) -> A-B
  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold (A-(B-C)) -> A+(C-B)
  if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
                                   N1.getOperand(0)));

  // A - (A & B) -> A & (~B)
  if (N1.getOpcode() == ISD::AND) {
    SDValue A = N1.getOperand(0);
    SDValue B = N1.getOperand(1);
    if (A != N0)
      std::swap(A, B);
    if (A == N0 &&
        (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
      SDValue InvB =
          DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
      return DAG.getNode(ISD::AND, DL, VT, A, InvB);
    }
  }

  // fold (X - (-Y * Z)) -> (X + (Y * Z))
  if
 (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
    // Negation may appear on either multiplicand: (0 - Y) * Z.
    if (N1.getOperand(0).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0).getOperand(1),
                                N1.getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
    // ... or Y * (0 - Z).
    if (N1.getOperand(1).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0),
                                N1.getOperand(1).getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
  }

  // If either operand of a sub is undef, the result is undef
  if (N0.isUndef())
    return N0;
  if (N1.isUndef())
    return N1;

  if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
    return V;

  if (SDValue V = foldAddSubOfSignBit(N, DAG))
    return V;

  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
    return V;

  // (x - y) - 1  ->  add (xor y, -1), x
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
                              DAG.getAllOnesConstant(DL, VT));
    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
  }

  // Look for:
  //   sub y, (xor x, -1)
  // And if the target does not like this form then turn into:
  //   add (add x, y), 1
  if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
  }

  // Hoist one-use addition by non-opaque constant:
  //   (x + C) - y  ->  (x - y) + C
  if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return
 DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
  }
  // y - (x + C)  ->  (y - x) - C
  if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
  }
  // (x - C) - y  ->  (x - y) - C
  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
  }
  // (C - x) - y  ->  C - (x + y)
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
  }

  // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
  // rather than 'sub 0/1' (the sext should get folded).
  // sub X, (zext i1 Y) --> add X, (sext i1 Y)
  if (N1.getOpcode() == ISD::ZERO_EXTEND &&
      N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
      TLI.getBooleanContents(VT) ==
          TargetLowering::ZeroOrNegativeOneBooleanContent) {
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
  }

  // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
    if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
      SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
      SDValue S0 = N1.getOperand(0);
      // The xor must combine the shifted value with its sign-bit splat, in
      // either operand order.
      if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) {
        unsigned OpSizeInBits = VT.getScalarSizeInBits();
        if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
          if (C->getAPIntValue() == (OpSizeInBits - 1))
            return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
      }
    }
  }

  // If the relocation model supports it, consider symbol offsets.
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
      // fold (sub Sym, c) -> Sym-c
      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                    GA->getOffset() -
                                        (uint64_t)N1C->getSExtValue());
      // fold (sub Sym+c1, Sym+c2) -> c1-c2
      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
        if (GA->getGlobal() == GB->getGlobal())
          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
                                 DL, VT);
    }

  // sub X, (sextinreg Y i1) -> add X, (and Y 1)
  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
    if (TN->getVT() == MVT::i1) {
      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                 DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
    }
  }

  // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
  if (N1.getOpcode() == ISD::VSCALE) {
    APInt IntVal = N1.getConstantOperandAPInt(0);
    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
  }

  // Prefer an add for more folding potential and possibly better codegen:
  // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
  if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
    SDValue ShAmt = N1.getOperand(1);
    ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
    if (ShAmtC &&
        ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
      SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
      return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
    }
  }

  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
    // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
    if (SDValue Carry = getAsCarry(TLI, N0)) {
      SDValue X = N1;
      SDValue Zero =
 DAG.getConstant(0, DL, VT);
      SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
      return DAG.getNode(ISD::ADDCARRY, DL,
                         DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
                         Carry);
    }
  }

  return SDValue();
}

/// Combine an ISD::SSUBSAT or ISD::USUBSAT node.
SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    // TODO SimplifyVBinOp

    // fold (sub_sat x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (sub_sat x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat x, x) -> 0
  if (N0 == N1)
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
    return C;

  // fold (sub_sat x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  return SDValue();
}

/// Combine an ISD::SUBC node.
SDValue DAGCombiner::visitSUBC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, x) -> 0 + no borrow
  if (N0 == N1)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, 0) -> x + no borrow
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // Canonicalize (sub -1, x) -> ~x, i.e.
(xor x, -1) + no borrow 3431 if (isAllOnesConstant(N0)) 3432 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), 3433 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); 3434 3435 return SDValue(); 3436 } 3437 3438 SDValue DAGCombiner::visitSUBO(SDNode *N) { 3439 SDValue N0 = N->getOperand(0); 3440 SDValue N1 = N->getOperand(1); 3441 EVT VT = N0.getValueType(); 3442 bool IsSigned = (ISD::SSUBO == N->getOpcode()); 3443 3444 EVT CarryVT = N->getValueType(1); 3445 SDLoc DL(N); 3446 3447 // If the flag result is dead, turn this into an SUB. 3448 if (!N->hasAnyUseOfValue(1)) 3449 return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), 3450 DAG.getUNDEF(CarryVT)); 3451 3452 // fold (subo x, x) -> 0 + no borrow 3453 if (N0 == N1) 3454 return CombineTo(N, DAG.getConstant(0, DL, VT), 3455 DAG.getConstant(0, DL, CarryVT)); 3456 3457 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); 3458 3459 // fold (subox, c) -> (addo x, -c) 3460 if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) { 3461 return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, 3462 DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); 3463 } 3464 3465 // fold (subo x, 0) -> x + no borrow 3466 if (isNullOrNullSplat(N1)) 3467 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); 3468 3469 // Canonicalize (usubo -1, x) -> ~x, i.e. 
(xor x, -1) + no borrow 3470 if (!IsSigned && isAllOnesOrAllOnesSplat(N0)) 3471 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), 3472 DAG.getConstant(0, DL, CarryVT)); 3473 3474 return SDValue(); 3475 } 3476 3477 SDValue DAGCombiner::visitSUBE(SDNode *N) { 3478 SDValue N0 = N->getOperand(0); 3479 SDValue N1 = N->getOperand(1); 3480 SDValue CarryIn = N->getOperand(2); 3481 3482 // fold (sube x, y, false) -> (subc x, y) 3483 if (CarryIn.getOpcode() == ISD::CARRY_FALSE) 3484 return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1); 3485 3486 return SDValue(); 3487 } 3488 3489 SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { 3490 SDValue N0 = N->getOperand(0); 3491 SDValue N1 = N->getOperand(1); 3492 SDValue CarryIn = N->getOperand(2); 3493 3494 // fold (subcarry x, y, false) -> (usubo x, y) 3495 if (isNullConstant(CarryIn)) { 3496 if (!LegalOperations || 3497 TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0))) 3498 return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); 3499 } 3500 3501 return SDValue(); 3502 } 3503 3504 // Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and 3505 // UMULFIXSAT here. 
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Scale = N->getOperand(2);
  EVT VT = N0.getValueType();

  // fold (mulfix x, undef, scale) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  // Canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);

  // fold (mulfix x, 0, scale) -> 0
  if (isNullConstant(N1))
    return DAG.getConstant(0, SDLoc(N), VT);

  return SDValue();
}

/// Combine integer multiply (ISD::MUL) nodes.
SDValue DAGCombiner::visitMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // fold (mul x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  bool N1IsConst = false;
  bool N1IsOpaqueConst = false;
  APInt ConstValue1;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
    assert((!N1IsConst ||
            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
           "Splat APInt should be element width");
  } else {
    N1IsConst = isa<ConstantSDNode>(N1);
    if (N1IsConst) {
      ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
      N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
    }
  }

  // fold (mul c1, c2) -> c1*c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);

  // fold (mul x, 0) -> 0
  if (N1IsConst && ConstValue1.isNullValue())
    return N1;

  // fold (mul x, 1) -> x
  if (N1IsConst && ConstValue1.isOneValue())
    return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (mul x, -1) -> 0-x
  if (N1IsConst && ConstValue1.isAllOnesValue()) {
    SDLoc DL(N);
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT), N0);
  }

  // fold (mul x, (1 << c)) -> x << c
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) &&
      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
    SDLoc DL(N);
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
  }

  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
    unsigned Log2Val = (-ConstValue1).logBase2();
    SDLoc DL(N);
    // FIXME: If the input is something that is easily negated (e.g. a
    // single-use add), we should put the negate there.
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT),
                       DAG.getNode(ISD::SHL, DL, VT, N0,
                                   DAG.getConstant(Log2Val, DL,
                                       getShiftAmountTy(N0.getValueType()))));
  }

  // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
  // mul x, (2^N + 1) --> add (shl x, N), x
  // mul x, (2^N - 1) --> sub (shl x, N), x
  // Examples: x * 33 --> (x << 5) + x
  //           x * 15 --> (x << 4) - x
  //           x * -33 --> -((x << 5) + x)
  //           x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
    // TODO: We could handle more general decomposition of any constant by
    //       having the target set a limit on number of ops and making a
    //       callback to determine that sequence (similar to sqrt expansion).
    unsigned MathOp = ISD::DELETED_NODE;
    APInt MulC = ConstValue1.abs();
    if ((MulC - 1).isPowerOf2())
      MathOp = ISD::ADD;
    else if ((MulC + 1).isPowerOf2())
      MathOp = ISD::SUB;

    if (MathOp != ISD::DELETED_NODE) {
      unsigned ShAmt =
          MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
      assert(ShAmt < VT.getScalarSizeInBits() &&
             "multiply-by-constant generated out of bounds shift");
      SDLoc DL(N);
      SDValue Shl =
          DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
      SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0);
      // Negative multipliers are handled by negating the shift-add/sub result.
      if (ConstValue1.isNegative())
        R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
      return R;
    }
  }

  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
  if (N0.getOpcode() == ISD::SHL &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
    if (isConstantOrConstantVector(C3))
      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
  }

  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
  // use.
  {
    SDValue Sh(nullptr, 0), Y(nullptr, 0);

    // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
    if (N0.getOpcode() == ISD::SHL &&
        isConstantOrConstantVector(N0.getOperand(1)) &&
        N0.getNode()->hasOneUse()) {
      Sh = N0; Y = N1;
    } else if (N1.getOpcode() == ISD::SHL &&
               isConstantOrConstantVector(N1.getOperand(1)) &&
               N1.getNode()->hasOneUse()) {
      Sh = N1; Y = N0;
    }

    if (Sh.getNode()) {
      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
    }
  }

  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
      N0.getOpcode() == ISD::ADD &&
      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
      isMulAddWithConstProfitable(N, N0, N1))
    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
                       DAG.getNode(ISD::MUL, SDLoc(N0), VT,
                                   N0.getOperand(0), N1),
                       DAG.getNode(ISD::MUL, SDLoc(N1), VT,
                                   N0.getOperand(1), N1));

  // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
  if (N0.getOpcode() == ISD::VSCALE)
    if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
      APInt C0 = N0.getConstantOperandAPInt(0);
      APInt C1 = NC1->getAPIntValue();
      return DAG.getVScale(SDLoc(N), VT, C0 * C1);
    }

  // reassociate mul
  if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
    return RMUL;

  return SDValue();
}

/// Return true if divmod libcall is available.
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                     const TargetLowering &TLI) {
  RTLIB::Libcall LC;
  EVT NodeType = Node->getValueType(0);
  if (!NodeType.isSimple())
    return false;
  switch (NodeType.getSimpleVT().SimpleTy) {
  default: return false; // No libcall for vector types.
  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16:  LC= isSigned ?
                        RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
  }

  return TLI.getLibcallName(LC) != nullptr;
}

/// Issue divrem if both quotient and remainder are needed.
SDValue DAGCombiner::useDivRem(SDNode *Node) {
  if (Node->use_empty())
    return SDValue(); // This is a dead node, leave it alone.

  unsigned Opcode = Node->getOpcode();
  bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;

  // DivMod lib calls can still work on non-legal types if using lib-calls.
  EVT VT = Node->getValueType(0);
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
    return SDValue();

  // If DIVREM is going to get expanded into a libcall,
  // but there is no libcall available, then don't combine.
  if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
      !isDivRemLibcallAvailable(Node, isSigned, TLI))
    return SDValue();

  // If div is legal, it's better to do the normal expansion
  unsigned OtherOpcode = 0;
  if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
    OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
    if (TLI.isOperationLegalOrCustom(Opcode, VT))
      return SDValue();
  } else {
    OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
      return SDValue();
  }

  // Scan all users of the dividend for matching div/rem/divrem nodes with the
  // same operands, and rewrite them all to use the single DIVREM.
  SDValue Op0 = Node->getOperand(0);
  SDValue Op1 = Node->getOperand(1);
  SDValue combined;
  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
         UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
        User->use_empty())
      continue;
    // Convert the other matching node(s), too;
    // otherwise, the DIVREM may get target-legalized into something
    // target-specific that we won't be able to recognize.
    unsigned UserOpc = User->getOpcode();
    if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
        User->getOperand(0) == Op0 &&
        User->getOperand(1) == Op1) {
      if (!combined) {
        if (UserOpc == OtherOpcode) {
          SDVTList VTs = DAG.getVTList(VT, VT);
          combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
        } else if (UserOpc == DivRemOpc) {
          combined = SDValue(User, 0);
        } else {
          assert(UserOpc == Opcode);
          continue;
        }
      }
      // Div users take result 0, rem users take result 1 of the DIVREM.
      if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
        CombineTo(User, combined);
      else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
        CombineTo(User, combined.getValue(1));
    }
  }
  return combined;
}

/// Fold trivial div/rem operand cases shared by SDIV/UDIV/SREM/UREM.
static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  unsigned Opc = N->getOpcode();
  bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // X / undef -> undef
  // X % undef -> undef
  // X / 0 -> undef
  // X % 0 -> undef
  // NOTE: This includes vectors where any divisor element is zero/undef.
  if (DAG.isUndef(Opc, {N0, N1}))
    return DAG.getUNDEF(VT);

  // undef / X -> 0
  // undef % X -> 0
  if (N0.isUndef())
    return DAG.getConstant(0, DL, VT);

  // 0 / X -> 0
  // 0 % X -> 0
  ConstantSDNode *N0C = isConstOrConstSplat(N0);
  if (N0C && N0C->isNullValue())
    return N0;

  // X / X -> 1
  // X % X -> 0
  if (N0 == N1)
    return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);

  // X / 1 -> X
  // X % 1 -> 0
  // If this is a boolean op (single-bit element type), we can't have
  // division-by-zero or remainder-by-zero, so assume the divisor is 1.
  // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
  // it's a 1.
  if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);

  return SDValue();
}

/// Combine signed integer divide (ISD::SDIV) nodes.
SDValue DAGCombiner::visitSDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  SDLoc DL(N);

  // fold (sdiv c1, c2) -> c1/c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
    return C;

  // fold (sdiv X, -1) -> 0-X
  if (N1C && N1C->isAllOnesValue())
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);

  // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
  if (N1C && N1C->getAPIntValue().isMinSignedValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // If we know the sign bits of both operands are zero, strength reduce to a
  // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
  if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);

  if (SDValue V = visitSDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
                                              { N0, N1 })) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // sdiv, srem -> sdivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true.  Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);
  unsigned BitWidth = VT.getScalarSizeInBits();

  // Helper for determining whether a value is a power-2 constant scalar or a
  // vector of such elements.
  auto IsPowerOfTwo = [](ConstantSDNode *C) {
    if (C->isNullValue() || C->isOpaque())
      return false;
    if (C->getAPIntValue().isPowerOf2())
      return true;
    if ((-C->getAPIntValue()).isPowerOf2())
      return true;
    return false;
  };

  // fold (sdiv X, pow2) -> simple ops after legalize
  // FIXME: We check for the exact bit here because the generic lowering gives
  // better results in that case.
  // (cont.)  The target-specific lowering should learn how
  // to handle exact sdivs efficiently.
  if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
    // Target-specific implementation of sdiv x, pow2.
    if (SDValue Res = BuildSDIVPow2(N))
      return Res;

    // Create constants that are functions of the shift amount value.
    EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
    SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
    SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
    C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
    SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
    if (!isConstantOrConstantVector(Inexact))
      return SDValue();

    // Splat the sign bit into the register
    SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
                               DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
    AddToWorklist(Sign.getNode());

    // Add (N0 < 0) ? abs2 - 1 : 0;
    SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
    AddToWorklist(Srl.getNode());
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
    AddToWorklist(Add.getNode());
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
    AddToWorklist(Sra.getNode());

    // Special case: (sdiv X, 1) -> X
    // Special Case: (sdiv X, -1) -> 0-X
    SDValue One = DAG.getConstant(1, DL, VT);
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
    SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
    SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
    SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
    Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);

    // If dividing by a positive value, we're done.  Otherwise, the result must
    // be negated.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);

    // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
    SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
    SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
    return Res;
  }

  // If integer divide is expensive and we satisfy the requirements, emit an
  // alternate sequence.  Targets may check function attributes for size/speed
  // trade-offs.
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildSDIV(N))
      return Op;

  return SDValue();
}

/// Combine unsigned integer divide (ISD::UDIV) nodes.
SDValue DAGCombiner::visitUDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  SDLoc DL(N);

  // fold (udiv c1, c2) -> c1/c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
    return C;

  // fold (udiv X, -1) -> select(X == -1, 1, 0)
  if (N1C && N1C->getAPIntValue().isAllOnesValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (SDValue V = visitUDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
                                              { N0, N1 })) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // udiv, urem -> udivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true.  Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // fold (udiv x, (1 << c)) -> x >>u c
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1)) {
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    AddToWorklist(LogBase2.getNode());

    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
  if (N1.getOpcode() == ISD::SHL) {
    SDValue N10 = N1.getOperand(0);
    if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
        DAG.isKnownToBeAPowerOfTwo(N10)) {
      SDValue LogBase2 = BuildLogBase2(N10, DL);
      AddToWorklist(LogBase2.getNode());

      EVT ADDVT = N1.getOperand(1).getValueType();
      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
      AddToWorklist(Trunc.getNode());
      SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
    }
  }

  // fold (udiv x, c) -> alternate
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildUDIV(N))
      return Op;

  return SDValue();
}

// handles ISD::SREM and ISD::UREM
SDValue DAGCombiner::visitREM(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  bool isSigned = (Opcode == ISD::SREM);
  SDLoc DL(N);

  // fold (rem c1, c2) -> c1%c2
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
    return C;

  // fold (urem X, -1) -> select(X == -1, 0, x)
  if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(0, DL, VT), N0);

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (isSigned) {
    // If we know the sign bits of both operands are zero, strength reduce to a
    // urem instead.
    // (cont.)  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
  } else {
    SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
    if (DAG.isKnownToBeAPowerOfTwo(N1)) {
      // fold (urem x, pow2) -> (and x, pow2-1)
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
    if (N1.getOpcode() == ISD::SHL &&
        DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
  }

  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

  // If X/C can be simplified by the division-by-constant logic, lower
  // X%C to the equivalent of X-X/C*C.
  // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
  // speculative DIV must not cause a DIVREM conversion.  We guard against this
  // by skipping the simplification if isIntDivCheap().  When div is not cheap,
  // combine will not return a DIVREM.  Regardless, checking cheapness here
  // makes sense since the simplification results in fatter code.
  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
    SDValue OptimizedDiv =
        isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
    if (OptimizedDiv.getNode()) {
      // If the equivalent Div node also exists, update its users.
      unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
      if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
                                                { N0, N1 }))
        CombineTo(DivNode, OptimizedDiv);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(OptimizedDiv.getNode());
      AddToWorklist(Mul.getNode());
      return Sub;
    }
  }

  // sdiv, srem -> sdivrem
  if (SDValue DivRem = useDivRem(N))
    return DivRem.getValue(1);

  return SDValue();
}

/// Combine signed multiply-high (ISD::MULHS) nodes.
SDValue DAGCombiner::visitMULHS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (VT.isVector()) {
    // fold (mulhs x, 0) -> 0
    // do not return N0/N1, because undef node may exist.
    if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
        ISD::isBuildVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhs x, 0) -> 0
  if (isNullConstant(N1))
    return N1;
  // fold (mulhs x, 1) -> (sra x, size(x)-1)
  if (isOneConstant(N1))
    return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
                       DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
                                       getShiftAmountTy(N0.getValueType())));

  // fold (mulhs x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // If the type twice as wide is legal, transform the mulhs to a wider multiply
  // plus a shift.
  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  return SDValue();
}

/// Combine unsigned multiply-high (ISD::MULHU) nodes.
SDValue DAGCombiner::visitMULHU(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (VT.isVector()) {
    // fold (mulhu x, 0) -> 0
    // do not return N0/N1, because undef node may exist.
    if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
        ISD::isBuildVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhu x, 0) -> 0
  if (isNullConstant(N1))
    return N1;
  // fold (mulhu x, 1) -> 0
  if (isOneConstant(N1))
    return DAG.getConstant(0, DL, N0.getValueType());
  // fold (mulhu x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    SDValue SRLAmt = DAG.getNode(
        ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // If the type twice as wide is legal, transform the mulhu to a wider multiply
  // plus a shift.
  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  return SDValue();
}

/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
/// give the opcodes for the two computations that are being performed. Return
/// the combined result if a simplification was made, otherwise a null SDValue.
SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                unsigned HiOp) {
  // If the high half is not needed, just compute the low half.
  bool HiExists = N->hasAnyUseOfValue(1);
  if (!HiExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If the low half is not needed, just compute the high half.
  bool LoExists = N->hasAnyUseOfValue(0);
  if (!LoExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If both halves are used, return as it is.
  if (LoExists && HiExists)
    return SDValue();

  // If the two computed results can be simplified separately, separate them.
  if (LoExists) {
    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    AddToWorklist(Lo.getNode());
    SDValue LoOpt = combine(Lo.getNode());
    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
      return CombineTo(N, LoOpt, LoOpt);
  }

  if (HiExists) {
    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    AddToWorklist(Hi.getNode());
    SDValue HiOpt = combine(Hi.getNode());
    if (HiOpt.getNode() && HiOpt != Hi &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
      return CombineTo(N, HiOpt, HiOpt);
  }

  return SDValue();
}

/// Combine signed multiply lo/hi (ISD::SMUL_LOHI) nodes.
SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
    return Res;

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // If the type twice as wide is legal, transform the mul to a wider
  // multiply plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
      // Compute the high part as N1.
      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(Lo.getValueType())));
      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
      // Compute the low part as N0.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
      return CombineTo(N, Lo, Hi);
    }
  }

  return SDValue();
}

/// Combine unsigned multiply lo/hi (ISD::UMUL_LOHI) nodes.
SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
    return Res;

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // (umul_lohi N0, 0) -> (0, 0)
  if (isNullConstant(N->getOperand(1))) {
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return CombineTo(N, Zero, Zero);
  }

  // (umul_lohi N0, 1) -> (N0, 0)
  if (isOneConstant(N->getOperand(1))) {
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return CombineTo(N, N->getOperand(0), Zero);
  }

  // If the type twice as wide is legal, transform the mul to a wider
  // multiply plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
      // Compute the high part as N1.
      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(Lo.getValueType())));
      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
      // Compute the low part as N0.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
      return CombineTo(N, Lo, Hi);
    }
  }

  return SDValue();
}

/// Combine [SU]MULO (multiply with overflow result) nodes.
SDValue DAGCombiner::visitMULO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SMULO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);

  // fold (mulo x, 0) -> 0 + no carry out
  if (isNullOrNullSplat(N1))
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getConstant(0, DL, CarryVT));

  // (mulo x, 2) -> (addo x, x)
  // x + x overflows exactly when x * 2 does, for both signed and unsigned.
  if (ConstantSDNode *C2 = isConstOrConstSplat(N1))
    if (C2->getAPIntValue() == 2)
      return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
                         N->getVTList(), N0, N0);

  return SDValue();
}

/// Combine integer min/max nodes (SMIN/SMAX/UMIN/UMAX): constant folding,
/// constant canonicalization, and signed<->unsigned flips when the sign bits
/// are known zero.
SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned Opcode = N->getOpcode();

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold operation with constant operands.
  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);

  // If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
  // Only do this if the current op isn't legal and the flipped is.
  if (!TLI.isOperationLegal(Opcode, VT) &&
      (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
      (N1.isUndef() || DAG.SignBitIsZero(N1))) {
    unsigned AltOpcode;
    switch (Opcode) {
    case ISD::SMIN: AltOpcode = ISD::UMIN; break;
    case ISD::SMAX: AltOpcode = ISD::UMAX; break;
    case ISD::UMIN: AltOpcode = ISD::SMIN; break;
    case ISD::UMAX: AltOpcode = ISD::SMAX; break;
    default: llvm_unreachable("Unknown MINMAX opcode");
    }
    if (TLI.isOperationLegal(AltOpcode, VT))
      return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
  }

  return SDValue();
}

/// If this is a bitwise logic instruction and both operands have the same
/// opcode, try to sink the other opcode after the logic instruction.
SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned LogicOpcode = N->getOpcode();
  unsigned HandOpcode = N0.getOpcode();
  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
          LogicOpcode == ISD::XOR) && "Expected logic opcode");
  assert(HandOpcode == N1.getOpcode() && "Bad input!");

  // Bail early if none of these transforms apply.
  if (N0.getNumOperands() == 0)
    return SDValue();

  // FIXME: We should check number of uses of the operands to not increase
  // the instruction count for all transforms.

  // Handle size-changing casts.
  SDValue X = N0.getOperand(0);
  SDValue Y = N1.getOperand(0);
  EVT XVT = X.getValueType();
  SDLoc DL(N);
  if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
      HandOpcode == ISD::SIGN_EXTEND) {
    // If both operands have other uses, this transform would create extra
    // instructions without eliminating anything.
    if (!N0.hasOneUse() && !N1.hasOneUse())
      return SDValue();
    // We need matching integer source types.
    if (XVT != Y.getValueType())
      return SDValue();
    // Don't create an illegal op during or after legalization. Don't ever
    // create an unsupported vector op.
    if ((VT.isVector() || LegalOperations) &&
        !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
      return SDValue();
    // Avoid infinite looping with PromoteIntBinOp.
    // TODO: Should we apply desirable/legal constraints to all opcodes?
    if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
        !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
      return SDValue();
    // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
  if (HandOpcode == ISD::TRUNCATE) {
    // If both operands have other uses, this transform would create extra
    // instructions without eliminating anything.
    if (!N0.hasOneUse() && !N1.hasOneUse())
      return SDValue();
    // We need matching source types.
    if (XVT != Y.getValueType())
      return SDValue();
    // Don't create an illegal op during or after legalization.
    if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
      return SDValue();
    // Be extra careful sinking truncate. If it's free, there's no benefit in
    // widening a binop. Also, don't create a logic op on an illegal type.
    if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
      return SDValue();
    if (!TLI.isTypeLegal(XVT))
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // For binops SHL/SRL/SRA/AND:
  //   logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
  if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
       HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
      N0.getOperand(1) == N1.getOperand(1)) {
    // If either operand has other uses, this transform is not an improvement.
    if (!N0.hasOneUse() || !N1.hasOneUse())
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
  }

  // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
  if (HandOpcode == ISD::BSWAP) {
    // If either operand has other uses, this transform is not an improvement.
    if (!N0.hasOneUse() || !N1.hasOneUse())
      return SDValue();
    SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
    return DAG.getNode(HandOpcode, DL, VT, Logic);
  }

  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
  // Only perform this optimization up until type legalization, before
  // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
  // we don't want to undo this promotion.
  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
  // on scalars.
  if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
      Level <= AfterLegalizeTypes) {
    // Input types must be integer and the same.
    if (XVT.isInteger() && XVT == Y.getValueType() &&
        !(VT.isVector() && TLI.isTypeLegal(VT) &&
          !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
      return DAG.getNode(HandOpcode, DL, VT, Logic);
    }
  }

  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
  // If both shuffles use the same mask, and both shuffle within a single
  // vector, then it is worthwhile to move the swizzle after the operation.
  // The type-legalizer generates this pattern when loading illegal
  // vector types from memory. In many cases this allows additional shuffle
  // optimizations.
  // There are other cases where moving the shuffle after the xor/and/or
  // is profitable even if shuffles don't perform a swizzle.
  // If both shuffles use the same mask, and both shuffles have the same first
  // or second operand, then it might still be profitable to move the shuffle
  // after the xor/and/or operation.
  if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
    auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
    auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
    assert(X.getValueType() == Y.getValueType() &&
           "Inputs to shuffles are not the same type");

    // Check that both shuffles use the same mask. The masks are known to be of
    // the same length because the result vector type is the same.
    // Check also that shuffles have only one use to avoid introducing extra
    // instructions.
    if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
        !SVN0->getMask().equals(SVN1->getMask()))
      return SDValue();

    // Don't try to fold this node if it requires introducing a
    // build vector of all zeros that might be illegal at this stage.
    SDValue ShOp = N0.getOperand(1);
    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

    // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
    if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
                                  N0.getOperand(0), N1.getOperand(0));
      return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
    }

    // Don't try to fold this node if it requires introducing a
    // build vector of all zeros that might be illegal at this stage.
    ShOp = N0.getOperand(0);
    if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
      ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

    // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
    if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
      SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
                                  N1.getOperand(1));
      return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
    }
  }

  return SDValue();
}

/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                                       const SDLoc &DL) {
  SDValue LL, LR, RL, RR, N0CC, N1CC;
  if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
      !isSetCCEquivalent(N1, RL, RR, N1CC))
    return SDValue();

  assert(N0.getValueType() == N1.getValueType() &&
         "Unexpected operand types for bitwise logic op");
  assert(LL.getValueType() == LR.getValueType() &&
         RL.getValueType() == RR.getValueType() &&
         "Unexpected operand types for setcc");

  // If we're here post-legalization or the logic op type is not i1, the logic
  // op type must match a setcc result type. Also, all folds require new
  // operations on the left and right operands, so those types must match.
  EVT VT = N0.getValueType();
  EVT OpVT = LL.getValueType();
  if (LegalOperations || VT.getScalarType() != MVT::i1)
    if (VT != getSetCCResultType(OpVT))
      return SDValue();
  if (OpVT != RL.getValueType())
    return SDValue();

  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
  bool IsInteger = OpVT.isInteger();
  if (LR == RR && CC0 == CC1 && IsInteger) {
    bool IsZero = isNullOrNullSplat(LR);
    bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);

    // All bits clear?
    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
    // All sign bits clear?
    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
    // Any bits set?
    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
    // Any sign bits set?
    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;

    // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
    // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
    // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
      AddToWorklist(Or.getNode());
      return DAG.getSetCC(DL, VT, Or, LR, CC1);
    }

    // All bits set?
    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
    // All sign bits set?
    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
    // Any bits clear?
    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
    // Any sign bits clear?
    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;

    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
    // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
    // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
    // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
      AddToWorklist(And.getNode());
      return DAG.getSetCC(DL, VT, And, LR, CC1);
    }
  }

  // TODO: What is the 'or' equivalent of this fold?
  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
  if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
      IsInteger && CC0 == ISD::SETNE &&
      ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
       (isAllOnesConstant(LR) && isNullConstant(RR)))) {
    SDValue One = DAG.getConstant(1, DL, OpVT);
    SDValue Two = DAG.getConstant(2, DL, OpVT);
    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
    AddToWorklist(Add.getNode());
    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
  }

  // Try more general transforms if the predicates match and the only user of
  // the compares is the 'and' or 'or'.
  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
      N0.hasOneUse() && N1.hasOneUse()) {
    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
    // or  (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
      SDValue Zero = DAG.getConstant(0, DL, OpVT);
      return DAG.getSetCC(DL, VT, Or, Zero, CC1);
    }

    // Turn compare of constants whose difference is 1 bit into add+and+setcc.
    // TODO - support non-uniform vector amounts.
    if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
      // Match a shared variable operand and 2 non-opaque constant operands.
      ConstantSDNode *C0 = isConstOrConstSplat(LR);
      ConstantSDNode *C1 = isConstOrConstSplat(RR);
      if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) {
        // Canonicalize larger constant as C0.
        if (C1->getAPIntValue().ugt(C0->getAPIntValue()))
          std::swap(C0, C1);

        // The difference of the constants must be a single bit.
        const APInt &C0Val = C0->getAPIntValue();
        const APInt &C1Val = C1->getAPIntValue();
        if ((C0Val - C1Val).isPowerOf2()) {
          // and/or (setcc X, C0, ne), (setcc X, C1, ne/eq) -->
          // setcc ((add X, -C1), ~(C0 - C1)), 0, ne/eq
          SDValue OffsetC = DAG.getConstant(-C1Val, DL, OpVT);
          SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LL, OffsetC);
          SDValue MaskC = DAG.getConstant(~(C0Val - C1Val), DL, OpVT);
          SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Add, MaskC);
          SDValue Zero = DAG.getConstant(0, DL, OpVT);
          return DAG.getSetCC(DL, VT, And, Zero, CC0);
        }
      }
    }
  }

  // Canonicalize equivalent operands to LL == RL.
  if (LL == RR && LR == RL) {
    CC1 = ISD::getSetCCSwappedOperands(CC1);
    std::swap(RL, RR);
  }

  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
  // (or  (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
  if (LL == RL && LR == RR) {
    ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
                                : ISD::getSetCCOrOperation(CC0, CC1, OpVT);
    if (NewCC != ISD::SETCC_INVALID &&
        (!LegalOperations ||
         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
          TLI.isOperationLegal(ISD::SETCC, OpVT))))
      return DAG.getSetCC(DL, VT, LL, LR, NewCC);
  }

  return SDValue();
}

/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N1.getValueType();
  SDLoc DL(N);

  // fold (and x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
    return V;

  if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
      VT.getSizeInBits() <= 64) {
    if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
      if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
        // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
        // immediate for an add, but it is legal if its top c2 bits are set,
        // transform the ADD so the immediate doesn't need to be materialized
        // in a register.
        APInt ADDC = ADDI->getAPIntValue();
        APInt SRLC = SRLI->getAPIntValue();
        if (ADDC.getMinSignedBits() <= 64 &&
            SRLC.ult(VT.getSizeInBits()) &&
            !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
          APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                             SRLC.getZExtValue());
          // The AND result discards the top SRLC bits, so the add immediate
          // may be rewritten freely in those bit positions.
          if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
            ADDC |= Mask;
            if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
              SDLoc DL0(N0);
              SDValue NewAdd =
                DAG.getNode(ISD::ADD, DL0, VT,
                            N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
              CombineTo(N0.getNode(), NewAdd);
              // Return N so it doesn't get rechecked!
              return SDValue(N, 0);
            }
          }
        }
      }
    }
  }

  // Reduce bit extract of low half of an integer to the narrower type.
  // (and (srl i64:x, K), KMask) ->
  //   (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
    if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
      if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
        unsigned Size = VT.getSizeInBits();
        const APInt &AndMask = CAnd->getAPIntValue();
        unsigned ShiftBits = CShift->getZExtValue();

        // Bail out, this node will probably disappear anyway.
        if (ShiftBits == 0)
          return SDValue();

        unsigned MaskBits = AndMask.countTrailingOnes();
        EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);

        if (AndMask.isMask() &&
            // Required bits must not span the two halves of the integer and
            // must fit in the half size type.
            (ShiftBits + MaskBits <= Size / 2) &&
            TLI.isNarrowingProfitable(VT, HalfVT) &&
            TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
            TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
            TLI.isTruncateFree(VT, HalfVT) &&
            TLI.isZExtFree(HalfVT, VT)) {
          // The isNarrowingProfitable is to avoid regressions on PPC and
          // AArch64 which match a few 64-bit bit insert / bit extract patterns
          // on downstream users of this. Those patterns could probably be
          // extended to handle extensions mixed in.

          SDValue SL(N0);
          assert(MaskBits <= Size);

          // Extracting the highest bit of the low half.
          EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
                                      N0.getOperand(0));

          SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
          SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
          SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
          SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
          return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
        }
      }
    }
  }

  return SDValue();
}

/// Return true if (and (load), AndC) can be rewritten as a ZEXTLOAD of ExtVT.
/// On success ExtVT is set to the integer type holding exactly the mask's
/// active (trailing-ones) bits.
bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                                   EVT LoadResultTy, EVT &ExtVT) {
  if (!AndC->getAPIntValue().isMask())
    return false;

  unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();

  ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
  EVT LoadedVT = LoadN->getMemoryVT();

  if (ExtVT == LoadedVT &&
      (!LegalOperations ||
       TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
    // ZEXTLOAD will match without needing to change the size of the value being
    // loaded.
    return true;
  }

  // Do not change the width of a volatile or atomic loads.
  if (!LoadN->isSimple())
    return false;

  // Do not generate loads of non-round integer types since these can
  // be expensive (and would be wrong if the type is not byte sized).
  if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
    return false;

  if (LegalOperations &&
      !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
    return false;

  if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
    return false;

  return true;
}

/// Return true if it is profitable and legal to narrow the given load or
/// store to MemVT, accessed ShAmt bits (a byte multiple) past the original
/// base address.
bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
                                    ISD::LoadExtType ExtType, EVT &MemVT,
                                    unsigned ShAmt) {
  if (!LDST)
    return false;
  // Only allow byte offsets.
  if (ShAmt % 8)
    return false;

  // Do not generate loads of non-round integer types since these can
  // be expensive (and would be wrong if the type is not byte sized).
  if (!MemVT.isRound())
    return false;

  // Don't change the width of a volatile or atomic loads.
  if (!LDST->isSimple())
    return false;

  // Verify that we are actually reducing a load width here.
  if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
    return false;

  // Ensure that this isn't going to produce an unsupported memory access.
  if (ShAmt) {
    assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
    const unsigned ByteShAmt = ShAmt / 8;
    const Align LDSTAlign = LDST->getAlign();
    const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                                LDST->getAddressSpace(), NarrowAlign,
                                LDST->getMemOperand()->getFlags()))
      return false;
  }

  // It's not possible to generate a constant of extended or untyped type.
  EVT PtrType = LDST->getBasePtr().getValueType();
  if (PtrType == MVT::Untyped || PtrType.isExtended())
    return false;

  if (isa<LoadSDNode>(LDST)) {
    LoadSDNode *Load = cast<LoadSDNode>(LDST);
    // Don't transform one with multiple uses, this would require adding a new
    // load.
    if (!SDValue(Load, 0).hasOneUse())
      return false;

    if (LegalOperations &&
        !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
      return false;

    // For the transform to be legal, the load must produce only two values
    // (the value loaded and the chain). Don't transform a pre-increment
    // load, for example, which produces an extra value. Otherwise the
    // transformation is not equivalent, and the downstream logic to replace
    // uses gets things wrong.
    if (Load->getNumValues() > 2)
      return false;

    // If the load that we're shrinking is an extload and we're not just
    // discarding the extension we can't simply shrink the load. Bail.
    // TODO: It would be possible to merge the extensions in some cases.
    if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
        Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
      return false;

    if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
      return false;
  } else {
    assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
    StoreSDNode *Store = cast<StoreSDNode>(LDST);
    // Can't write outside the original store
    if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
      return false;

    if (LegalOperations &&
        !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
      return false;
  }
  return true;
}

/// Recursively walk the operand tree of an AND, collecting loads that can be
/// narrowed (Loads), logic nodes whose constant operand needs re-masking
/// (NodesWithConsts), and at most one other node that must itself be masked
/// (NodeToMask). Returns false if the tree cannot be handled.
bool DAGCombiner::SearchForAndLoads(SDNode *N,
                                    SmallVectorImpl<LoadSDNode*> &Loads,
                                    SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                                    ConstantSDNode *Mask,
                                    SDNode *&NodeToMask) {
  // Recursively search for the operands, looking for loads which can be
  // narrowed.
  for (SDValue Op : N->op_values()) {
    if (Op.getValueType().isVector())
      return false;

    // Some constants may need fixing up later if they are too large.
    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
          (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
        NodesWithConsts.insert(N);
      continue;
    }

    if (!Op.hasOneUse())
      return false;

    switch(Op.getOpcode()) {
    case ISD::LOAD: {
      auto *Load = cast<LoadSDNode>(Op);
      EVT ExtVT;
      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
          isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {

        // ZEXTLOAD is already small enough.
        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
            ExtVT.bitsGE(Load->getMemoryVT()))
          continue;

        // Use LE to convert equal sized loads to zext.
        if (ExtVT.bitsLE(Load->getMemoryVT()))
          Loads.push_back(Load);

        continue;
      }
      return false;
    }
    case ISD::ZERO_EXTEND:
    case ISD::AssertZext: {
      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
      EVT VT = Op.getOpcode() == ISD::AssertZext ?
        cast<VTSDNode>(Op.getOperand(1))->getVT() :
        Op.getOperand(0).getValueType();

      // We can accept extending nodes if the mask is wider or an equal
      // width to the original type.
      if (ExtVT.bitsGE(VT))
        continue;
      break;
    }
    case ISD::OR:
    case ISD::XOR:
    case ISD::AND:
      if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
                             NodeToMask))
        return false;
      continue;
    }

    // Allow one node which will be masked along with any loads found.
    if (NodeToMask)
      return false;

    // Also ensure that the node to be masked only produces one data result.
    NodeToMask = Op.getNode();
    if (NodeToMask->getNumValues() > 1) {
      bool HasValue = false;
      for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
        MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
        // Glue and chain results don't count as data results.
        if (VT != MVT::Glue && VT != MVT::Other) {
          if (HasValue) {
            NodeToMask = nullptr;
            return false;
          }
          HasValue = true;
        }
      }
      assert(HasValue && "Node to be masked has no data result?");
    }
  }
  return true;
}

/// Given an AND with a mask constant, push the mask back through the operand
/// tree onto the loads feeding it so each load can be narrowed, then drop the
/// AND itself. Returns true if the transform fired.
bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
  auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Mask)
    return false;

  if (!Mask->getAPIntValue().isMask())
    return false;

  // No need to do anything if the and directly uses a load.
  if (isa<LoadSDNode>(N->getOperand(0)))
    return false;

  SmallVector<LoadSDNode*, 8> Loads;
  SmallPtrSet<SDNode*, 2> NodesWithConsts;
  SDNode *FixupNode = nullptr;
  if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
    if (Loads.size() == 0)
      return false;

    LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
    SDValue MaskOp = N->getOperand(1);

    // If it exists, fixup the single node we allow in the tree that needs
    // masking.
    if (FixupNode) {
      LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                                FixupNode->getValueType(0),
                                SDValue(FixupNode, 0), MaskOp);
      DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
      // The RAUW above also rewrote the new AND's own operand; restore it so
      // the mask applies to the fixup node rather than to itself.
      if (And.getOpcode() == ISD ::AND)
        DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
    }

    // Narrow any constants that need it.
    for (auto *LogicN : NodesWithConsts) {
      SDValue Op0 = LogicN->getOperand(0);
      SDValue Op1 = LogicN->getOperand(1);

      if (isa<ConstantSDNode>(Op0))
        std::swap(Op0, Op1);

      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
                                Op1, MaskOp);

      DAG.UpdateNodeOperands(LogicN, Op0, And);
    }

    // Create narrow loads.
    for (auto *Load : Loads) {
      LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                                SDValue(Load, 0), MaskOp);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
      if (And.getOpcode() == ISD ::AND)
        And = SDValue(
            DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
      SDValue NewLoad = ReduceLoadWidth(And.getNode());
      assert(NewLoad &&
             "Shouldn't be masking the load if it can't be narrowed");
      CombineTo(Load, NewLoad, NewLoad.getValue(1));
    }
    DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
    return true;
  }
  return false;
}

// Unfold
//    x &  (-1 'logical shift' y)
// To
//    (x 'opposite logical shift' y) 'logical shift' y
// if it is better for performance.
SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
  assert(N->getOpcode() == ISD::AND);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Do we actually prefer shifts over mask?
  if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
    return SDValue();

  // Try to match  (-1 '[outer] logical shift' y)
  unsigned OuterShift;
  unsigned InnerShift; // The opposite direction to the OuterShift.
  SDValue Y;           // Shift amount.
5158 auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool { 5159 if (!M.hasOneUse()) 5160 return false; 5161 OuterShift = M->getOpcode(); 5162 if (OuterShift == ISD::SHL) 5163 InnerShift = ISD::SRL; 5164 else if (OuterShift == ISD::SRL) 5165 InnerShift = ISD::SHL; 5166 else 5167 return false; 5168 if (!isAllOnesConstant(M->getOperand(0))) 5169 return false; 5170 Y = M->getOperand(1); 5171 return true; 5172 }; 5173 5174 SDValue X; 5175 if (matchMask(N1)) 5176 X = N0; 5177 else if (matchMask(N0)) 5178 X = N1; 5179 else 5180 return SDValue(); 5181 5182 SDLoc DL(N); 5183 EVT VT = N->getValueType(0); 5184 5185 // tmp = x 'opposite logical shift' y 5186 SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y); 5187 // ret = tmp 'logical shift' y 5188 SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y); 5189 5190 return T1; 5191 } 5192 5193 /// Try to replace shift/logic that tests if a bit is clear with mask + setcc. 5194 /// For a target with a bit test, this is expected to become test + set and save 5195 /// at least 1 instruction. 5196 static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) { 5197 assert(And->getOpcode() == ISD::AND && "Expected an 'and' op"); 5198 5199 // This is probably not worthwhile without a supported type. 5200 EVT VT = And->getValueType(0); 5201 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5202 if (!TLI.isTypeLegal(VT)) 5203 return SDValue(); 5204 5205 // Look through an optional extension and find a 'not'. 5206 // TODO: Should we favor test+set even without the 'not' op? 5207 SDValue Not = And->getOperand(0), And1 = And->getOperand(1); 5208 if (Not.getOpcode() == ISD::ANY_EXTEND) 5209 Not = Not.getOperand(0); 5210 if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1)) 5211 return SDValue(); 5212 5213 // Look though an optional truncation. The source operand may not be the same 5214 // type as the original 'and', but that is ok because we are masking off 5215 // everything but the low bit. 
5216 SDValue Srl = Not.getOperand(0); 5217 if (Srl.getOpcode() == ISD::TRUNCATE) 5218 Srl = Srl.getOperand(0); 5219 5220 // Match a shift-right by constant. 5221 if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() || 5222 !isa<ConstantSDNode>(Srl.getOperand(1))) 5223 return SDValue(); 5224 5225 // We might have looked through casts that make this transform invalid. 5226 // TODO: If the source type is wider than the result type, do the mask and 5227 // compare in the source type. 5228 const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1); 5229 unsigned VTBitWidth = VT.getSizeInBits(); 5230 if (ShiftAmt.uge(VTBitWidth)) 5231 return SDValue(); 5232 5233 // Turn this into a bit-test pattern using mask op + setcc: 5234 // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0 5235 SDLoc DL(And); 5236 SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT); 5237 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 5238 SDValue Mask = DAG.getConstant( 5239 APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT); 5240 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask); 5241 SDValue Zero = DAG.getConstant(0, DL, VT); 5242 SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ); 5243 return DAG.getZExtOrTrunc(Setcc, DL, VT); 5244 } 5245 5246 SDValue DAGCombiner::visitAND(SDNode *N) { 5247 SDValue N0 = N->getOperand(0); 5248 SDValue N1 = N->getOperand(1); 5249 EVT VT = N1.getValueType(); 5250 5251 // x & x --> x 5252 if (N0 == N1) 5253 return N0; 5254 5255 // fold vector ops 5256 if (VT.isVector()) { 5257 if (SDValue FoldedVOp = SimplifyVBinOp(N)) 5258 return FoldedVOp; 5259 5260 // fold (and x, 0) -> 0, vector edition 5261 if (ISD::isBuildVectorAllZeros(N0.getNode())) 5262 // do not return N0, because undef node may exist in N0 5263 return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()), 5264 SDLoc(N), N0.getValueType()); 5265 if (ISD::isBuildVectorAllZeros(N1.getNode())) 5266 // do not return N1, because undef 
node may exist in N1 5267 return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()), 5268 SDLoc(N), N1.getValueType()); 5269 5270 // fold (and x, -1) -> x, vector edition 5271 if (ISD::isBuildVectorAllOnes(N0.getNode())) 5272 return N1; 5273 if (ISD::isBuildVectorAllOnes(N1.getNode())) 5274 return N0; 5275 } 5276 5277 // fold (and c1, c2) -> c1&c2 5278 ConstantSDNode *N1C = isConstOrConstSplat(N1); 5279 if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) 5280 return C; 5281 5282 // canonicalize constant to RHS 5283 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && 5284 !DAG.isConstantIntBuildVectorOrConstantInt(N1)) 5285 return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); 5286 5287 // fold (and x, -1) -> x 5288 if (isAllOnesConstant(N1)) 5289 return N0; 5290 5291 // if (and x, c) is known to be zero, return 0 5292 unsigned BitWidth = VT.getScalarSizeInBits(); 5293 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), 5294 APInt::getAllOnesValue(BitWidth))) 5295 return DAG.getConstant(0, SDLoc(N), VT); 5296 5297 if (SDValue NewSel = foldBinOpIntoSelect(N)) 5298 return NewSel; 5299 5300 // reassociate and 5301 if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags())) 5302 return RAND; 5303 5304 // Try to convert a constant mask AND into a shuffle clear mask. 5305 if (VT.isVector()) 5306 if (SDValue Shuffle = XformToShuffleWithZero(N)) 5307 return Shuffle; 5308 5309 if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) 5310 return Combined; 5311 5312 // fold (and (or x, C), D) -> D if (C & D) == D 5313 auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { 5314 return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); 5315 }; 5316 if (N0.getOpcode() == ISD::OR && 5317 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) 5318 return N1; 5319 // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. 
5320 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { 5321 SDValue N0Op0 = N0.getOperand(0); 5322 APInt Mask = ~N1C->getAPIntValue(); 5323 Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits()); 5324 if (DAG.MaskedValueIsZero(N0Op0, Mask)) { 5325 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5326 N0.getValueType(), N0Op0); 5327 5328 // Replace uses of the AND with uses of the Zero extend node. 5329 CombineTo(N, Zext); 5330 5331 // We actually want to replace all uses of the any_extend with the 5332 // zero_extend, to avoid duplicating things. This will later cause this 5333 // AND to be folded. 5334 CombineTo(N0.getNode(), Zext); 5335 return SDValue(N, 0); // Return N so it doesn't get rechecked! 5336 } 5337 } 5338 5339 // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> 5340 // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must 5341 // already be zero by virtue of the width of the base type of the load. 5342 // 5343 // the 'X' node here can either be nothing or an extract_vector_elt to catch 5344 // more cases. 5345 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5346 N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() && 5347 N0.getOperand(0).getOpcode() == ISD::LOAD && 5348 N0.getOperand(0).getResNo() == 0) || 5349 (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) { 5350 LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ? 5351 N0 : N0.getOperand(0) ); 5352 5353 // Get the constant (if applicable) the zero'th operand is being ANDed with. 5354 // This can be a pure constant or a vector splat, in which case we treat the 5355 // vector as a scalar and use the splat value. 
5356 APInt Constant = APInt::getNullValue(1); 5357 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 5358 Constant = C->getAPIntValue(); 5359 } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) { 5360 APInt SplatValue, SplatUndef; 5361 unsigned SplatBitSize; 5362 bool HasAnyUndefs; 5363 bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, 5364 SplatBitSize, HasAnyUndefs); 5365 if (IsSplat) { 5366 // Undef bits can contribute to a possible optimisation if set, so 5367 // set them. 5368 SplatValue |= SplatUndef; 5369 5370 // The splat value may be something like "0x00FFFFFF", which means 0 for 5371 // the first vector value and FF for the rest, repeating. We need a mask 5372 // that will apply equally to all members of the vector, so AND all the 5373 // lanes of the constant together. 5374 unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits(); 5375 5376 // If the splat value has been compressed to a bitlength lower 5377 // than the size of the vector lane, we need to re-expand it to 5378 // the lane size. 5379 if (EltBitWidth > SplatBitSize) 5380 for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth); 5381 SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2) 5382 SplatValue |= SplatValue.shl(SplatBitSize); 5383 5384 // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a 5385 // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. 5386 if ((SplatBitSize % EltBitWidth) == 0) { 5387 Constant = APInt::getAllOnesValue(EltBitWidth); 5388 for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i) 5389 Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth); 5390 } 5391 } 5392 } 5393 5394 // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is 5395 // actually legal and isn't going to get expanded, else this is a false 5396 // optimisation. 
5397 bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, 5398 Load->getValueType(0), 5399 Load->getMemoryVT()); 5400 5401 // Resize the constant to the same size as the original memory access before 5402 // extension. If it is still the AllOnesValue then this AND is completely 5403 // unneeded. 5404 Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits()); 5405 5406 bool B; 5407 switch (Load->getExtensionType()) { 5408 default: B = false; break; 5409 case ISD::EXTLOAD: B = CanZextLoadProfitably; break; 5410 case ISD::ZEXTLOAD: 5411 case ISD::NON_EXTLOAD: B = true; break; 5412 } 5413 5414 if (B && Constant.isAllOnesValue()) { 5415 // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to 5416 // preserve semantics once we get rid of the AND. 5417 SDValue NewLoad(Load, 0); 5418 5419 // Fold the AND away. NewLoad may get replaced immediately. 5420 CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); 5421 5422 if (Load->getExtensionType() == ISD::EXTLOAD) { 5423 NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, 5424 Load->getValueType(0), SDLoc(Load), 5425 Load->getChain(), Load->getBasePtr(), 5426 Load->getOffset(), Load->getMemoryVT(), 5427 Load->getMemOperand()); 5428 // Replace uses of the EXTLOAD with the new ZEXTLOAD. 5429 if (Load->getNumValues() == 3) { 5430 // PRE/POST_INC loads have 3 values. 5431 SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), 5432 NewLoad.getValue(2) }; 5433 CombineTo(Load, To, 3, true); 5434 } else { 5435 CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); 5436 } 5437 } 5438 5439 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
5440 } 5441 } 5442 5443 // fold (and (load x), 255) -> (zextload x, i8) 5444 // fold (and (extload x, i16), 255) -> (zextload x, i8) 5445 // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) 5446 if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || 5447 (N0.getOpcode() == ISD::ANY_EXTEND && 5448 N0.getOperand(0).getOpcode() == ISD::LOAD))) { 5449 if (SDValue Res = ReduceLoadWidth(N)) { 5450 LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND 5451 ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0); 5452 AddToWorklist(N); 5453 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res); 5454 return SDValue(N, 0); 5455 } 5456 } 5457 5458 if (LegalTypes) { 5459 // Attempt to propagate the AND back up to the leaves which, if they're 5460 // loads, can be combined to narrow loads and the AND node can be removed. 5461 // Perform after legalization so that extend nodes will already be 5462 // combined into the loads. 5463 if (BackwardsPropagateMask(N)) 5464 return SDValue(N, 0); 5465 } 5466 5467 if (SDValue Combined = visitANDLike(N0, N1, N)) 5468 return Combined; 5469 5470 // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) 5471 if (N0.getOpcode() == N1.getOpcode()) 5472 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N)) 5473 return V; 5474 5475 // Masking the negated extension of a boolean is just the zero-extended 5476 // boolean: 5477 // and (sub 0, zext(bool X)), 1 --> zext(bool X) 5478 // and (sub 0, sext(bool X)), 1 --> zext(bool X) 5479 // 5480 // Note: the SimplifyDemandedBits fold below can make an information-losing 5481 // transform, and then we have no way to find this better fold. 
5482 if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { 5483 if (isNullOrNullSplat(N0.getOperand(0))) { 5484 SDValue SubRHS = N0.getOperand(1); 5485 if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && 5486 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5487 return SubRHS; 5488 if (SubRHS.getOpcode() == ISD::SIGN_EXTEND && 5489 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) 5490 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0)); 5491 } 5492 } 5493 5494 // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) 5495 // fold (and (sra)) -> (and (srl)) when possible. 5496 if (SimplifyDemandedBits(SDValue(N, 0))) 5497 return SDValue(N, 0); 5498 5499 // fold (zext_inreg (extload x)) -> (zextload x) 5500 // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use 5501 if (ISD::isUNINDEXEDLoad(N0.getNode()) && 5502 (ISD::isEXTLoad(N0.getNode()) || 5503 (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) { 5504 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 5505 EVT MemVT = LN0->getMemoryVT(); 5506 // If we zero all the possible extended bits, then we can turn this into 5507 // a zextload if we are running before legalize or the operation is legal. 5508 unsigned ExtBitSize = N1.getScalarValueSizeInBits(); 5509 unsigned MemBitSize = MemVT.getScalarSizeInBits(); 5510 APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); 5511 if (DAG.MaskedValueIsZero(N1, ExtBits) && 5512 ((!LegalOperations && LN0->isSimple()) || 5513 TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { 5514 SDValue ExtLoad = 5515 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), 5516 LN0->getBasePtr(), MemVT, LN0->getMemOperand()); 5517 AddToWorklist(N); 5518 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); 5519 return SDValue(N, 0); // Return N so it doesn't get rechecked! 
5520 } 5521 } 5522 5523 // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) 5524 if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { 5525 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), 5526 N0.getOperand(1), false)) 5527 return BSwap; 5528 } 5529 5530 if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) 5531 return Shifts; 5532 5533 if (TLI.hasBitTest(N0, N1)) 5534 if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) 5535 return V; 5536 5537 return SDValue(); 5538 } 5539 5540 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. 5541 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, 5542 bool DemandHighBits) { 5543 if (!LegalOperations) 5544 return SDValue(); 5545 5546 EVT VT = N->getValueType(0); 5547 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) 5548 return SDValue(); 5549 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) 5550 return SDValue(); 5551 5552 // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) 5553 bool LookPassAnd0 = false; 5554 bool LookPassAnd1 = false; 5555 if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) 5556 std::swap(N0, N1); 5557 if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) 5558 std::swap(N0, N1); 5559 if (N0.getOpcode() == ISD::AND) { 5560 if (!N0.getNode()->hasOneUse()) 5561 return SDValue(); 5562 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5563 // Also handle 0xffff since the LHS is guaranteed to have zeros there. 5564 // This is needed for X86. 
5565 if (!N01C || (N01C->getZExtValue() != 0xFF00 && 5566 N01C->getZExtValue() != 0xFFFF)) 5567 return SDValue(); 5568 N0 = N0.getOperand(0); 5569 LookPassAnd0 = true; 5570 } 5571 5572 if (N1.getOpcode() == ISD::AND) { 5573 if (!N1.getNode()->hasOneUse()) 5574 return SDValue(); 5575 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 5576 if (!N11C || N11C->getZExtValue() != 0xFF) 5577 return SDValue(); 5578 N1 = N1.getOperand(0); 5579 LookPassAnd1 = true; 5580 } 5581 5582 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 5583 std::swap(N0, N1); 5584 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 5585 return SDValue(); 5586 if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) 5587 return SDValue(); 5588 5589 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5590 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 5591 if (!N01C || !N11C) 5592 return SDValue(); 5593 if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8) 5594 return SDValue(); 5595 5596 // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) 5597 SDValue N00 = N0->getOperand(0); 5598 if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { 5599 if (!N00.getNode()->hasOneUse()) 5600 return SDValue(); 5601 ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1)); 5602 if (!N001C || N001C->getZExtValue() != 0xFF) 5603 return SDValue(); 5604 N00 = N00.getOperand(0); 5605 LookPassAnd0 = true; 5606 } 5607 5608 SDValue N10 = N1->getOperand(0); 5609 if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { 5610 if (!N10.getNode()->hasOneUse()) 5611 return SDValue(); 5612 ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1)); 5613 // Also allow 0xFFFF since the bits will be shifted out. This is needed 5614 // for X86. 
5615 if (!N101C || (N101C->getZExtValue() != 0xFF00 && 5616 N101C->getZExtValue() != 0xFFFF)) 5617 return SDValue(); 5618 N10 = N10.getOperand(0); 5619 LookPassAnd1 = true; 5620 } 5621 5622 if (N00 != N10) 5623 return SDValue(); 5624 5625 // Make sure everything beyond the low halfword gets set to zero since the SRL 5626 // 16 will clear the top bits. 5627 unsigned OpSizeInBits = VT.getSizeInBits(); 5628 if (DemandHighBits && OpSizeInBits > 16) { 5629 // If the left-shift isn't masked out then the only way this is a bswap is 5630 // if all bits beyond the low 8 are 0. In that case the entire pattern 5631 // reduces to a left shift anyway: leave it for other parts of the combiner. 5632 if (!LookPassAnd0) 5633 return SDValue(); 5634 5635 // However, if the right shift isn't masked out then it might be because 5636 // it's not needed. See if we can spot that too. 5637 if (!LookPassAnd1 && 5638 !DAG.MaskedValueIsZero( 5639 N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) 5640 return SDValue(); 5641 } 5642 5643 SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); 5644 if (OpSizeInBits > 16) { 5645 SDLoc DL(N); 5646 Res = DAG.getNode(ISD::SRL, DL, VT, Res, 5647 DAG.getConstant(OpSizeInBits - 16, DL, 5648 getShiftAmountTy(VT))); 5649 } 5650 return Res; 5651 } 5652 5653 /// Return true if the specified node is an element that makes up a 32-bit 5654 /// packed halfword byteswap. 
5655 /// ((x & 0x000000ff) << 8) | 5656 /// ((x & 0x0000ff00) >> 8) | 5657 /// ((x & 0x00ff0000) << 8) | 5658 /// ((x & 0xff000000) >> 8) 5659 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { 5660 if (!N.getNode()->hasOneUse()) 5661 return false; 5662 5663 unsigned Opc = N.getOpcode(); 5664 if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) 5665 return false; 5666 5667 SDValue N0 = N.getOperand(0); 5668 unsigned Opc0 = N0.getOpcode(); 5669 if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) 5670 return false; 5671 5672 ConstantSDNode *N1C = nullptr; 5673 // SHL or SRL: look upstream for AND mask operand 5674 if (Opc == ISD::AND) 5675 N1C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 5676 else if (Opc0 == ISD::AND) 5677 N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5678 if (!N1C) 5679 return false; 5680 5681 unsigned MaskByteOffset; 5682 switch (N1C->getZExtValue()) { 5683 default: 5684 return false; 5685 case 0xFF: MaskByteOffset = 0; break; 5686 case 0xFF00: MaskByteOffset = 1; break; 5687 case 0xFFFF: 5688 // In case demanded bits didn't clear the bits that will be shifted out. 5689 // This is needed for X86. 5690 if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) { 5691 MaskByteOffset = 1; 5692 break; 5693 } 5694 return false; 5695 case 0xFF0000: MaskByteOffset = 2; break; 5696 case 0xFF000000: MaskByteOffset = 3; break; 5697 } 5698 5699 // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). 
5700 if (Opc == ISD::AND) { 5701 if (MaskByteOffset == 0 || MaskByteOffset == 2) { 5702 // (x >> 8) & 0xff 5703 // (x >> 8) & 0xff0000 5704 if (Opc0 != ISD::SRL) 5705 return false; 5706 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5707 if (!C || C->getZExtValue() != 8) 5708 return false; 5709 } else { 5710 // (x << 8) & 0xff00 5711 // (x << 8) & 0xff000000 5712 if (Opc0 != ISD::SHL) 5713 return false; 5714 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5715 if (!C || C->getZExtValue() != 8) 5716 return false; 5717 } 5718 } else if (Opc == ISD::SHL) { 5719 // (x & 0xff) << 8 5720 // (x & 0xff0000) << 8 5721 if (MaskByteOffset != 0 && MaskByteOffset != 2) 5722 return false; 5723 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 5724 if (!C || C->getZExtValue() != 8) 5725 return false; 5726 } else { // Opc == ISD::SRL 5727 // (x & 0xff00) >> 8 5728 // (x & 0xff000000) >> 8 5729 if (MaskByteOffset != 1 && MaskByteOffset != 3) 5730 return false; 5731 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); 5732 if (!C || C->getZExtValue() != 8) 5733 return false; 5734 } 5735 5736 if (Parts[MaskByteOffset]) 5737 return false; 5738 5739 Parts[MaskByteOffset] = N0.getOperand(0).getNode(); 5740 return true; 5741 } 5742 5743 // Match 2 elements of a packed halfword bswap. 
static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
  // A pair is either an OR of two halfword-bswap elements...
  if (N.getOpcode() == ISD::OR)
    return isBSwapHWordElement(N.getOperand(0), Parts) &&
           isBSwapHWordElement(N.getOperand(1), Parts);

  // ...or an already-formed (srl (bswap X), 16), which supplies both of the
  // low-halfword bytes at once.
  if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
    ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
    if (!C || C->getAPIntValue() != 16)
      return false;
    Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
    return true;
  }

  return false;
}

// Match this pattern:
//   (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff))
// And rewrite this to:
//   (rotr (bswap A), 16)
static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
                                       SelectionDAG &DAG, SDNode *N, SDValue N0,
                                       SDValue N1, EVT VT, EVT ShiftAmountTy) {
  assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
         "MatchBSwapHWordOrAndAnd: expecting i32");
  if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
    return SDValue();
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
    return SDValue();
  // TODO: this is too restrictive; lifting this restriction requires more tests
  if (!N0->hasOneUse() || !N1->hasOneUse())
    return SDValue();
  // Both AND operands must be the expected byte-swap masks.
  ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
  ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
  if (!Mask0 || !Mask1)
    return SDValue();
  if (Mask0->getAPIntValue() != 0xff00ff00 ||
      Mask1->getAPIntValue() != 0x00ff00ff)
    return SDValue();
  // The masked values must be byte-shifts of the same source value A.
  SDValue Shift0 = N0.getOperand(0);
  SDValue Shift1 = N1.getOperand(0);
  if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
    return SDValue();
  ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
  ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
  if (!ShiftAmt0 || !ShiftAmt1)
    return SDValue();
  if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
    return SDValue();
  if (Shift0.getOperand(0) != Shift1.getOperand(0))
    return SDValue();

  // Matched: emit (rotr (bswap A), 16).
  SDLoc DL(N);
  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
  SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
  return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
}

/// Match a 32-bit packed halfword bswap. That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
/// => (rotl (bswap x), 16)
SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
  if (!LegalOperations)
    return SDValue();

  // Only i32 is handled, and BSWAP must be available after legalization.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();
  if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
    return SDValue();

  // First try the two-AND form that rewrites directly to (rotr (bswap A), 16).
  if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
                                              getShiftAmountTy(VT)))
    return BSwap;

  // Try again with commuted operands.
  if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
                                              getShiftAmountTy(VT)))
    return BSwap;

  // Look for either
  // (or (bswaphpair), (bswaphpair))
  // (or (or (bswaphpair), (and)), (and))
  // (or (or (and), (bswaphpair)), (and))
  // Parts[i] records the node that supplies byte i of the result.
  SDNode *Parts[4] = {};

  if (isBSwapHWordPair(N0, Parts)) {
    // (or (or (and), (and)), (or (and), (and)))
    if (!isBSwapHWordPair(N1, Parts))
      return SDValue();
  } else if (N0.getOpcode() == ISD::OR) {
    // (or (or (or (and), (and)), (and)), (and))
    if (!isBSwapHWordElement(N1, Parts))
      return SDValue();
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
        !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
      return SDValue();
  } else
    return SDValue();

  // Make sure the parts are all coming from the same node.
  if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
    return SDValue();

  SDLoc DL(N);
  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
                              SDValue(Parts[0], 0));

  // Result of the bswap should be rotated by 16. If it's not legal, then
  // do (x << 16) | (x >> 16).
  SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
    return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
  if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
    return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
  return DAG.getNode(ISD::OR, DL, VT,
                     DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
                     DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
}

/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N1.getValueType();
  SDLoc DL(N);

  // fold (or x, undef) -> -1
  if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
    return DAG.getAllOnesConstant(DL, VT);

  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
    return V;

  // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
  if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
      // Don't increase # computations.
      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
    // We can only do this xform if we know that bits from X that are set in C2
    // but not in C1 are already zero. Likewise for Y.
    if (const ConstantSDNode *N0O1C =
        getAsNonOpaqueConstant(N0.getOperand(1))) {
      if (const ConstantSDNode *N1O1C =
          getAsNonOpaqueConstant(N1.getOperand(1))) {
        // We can only do this xform if we know that bits from X that are set in
        // C2 but not in C1 are already zero. Likewise for Y.
        const APInt &LHSMask = N0O1C->getAPIntValue();
        const APInt &RHSMask = N1O1C->getAPIntValue();

        if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
            DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
          SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                                  N0.getOperand(0), N1.getOperand(0));
          return DAG.getNode(ISD::AND, DL, VT, X,
                             DAG.getConstant(LHSMask | RHSMask, DL, VT));
        }
      }
    }
  }

  // (or (and X, M), (and X, N)) -> (and X, (or M, N))
  if (N0.getOpcode() == ISD::AND &&
      N1.getOpcode() == ISD::AND &&
      N0.getOperand(0) == N1.getOperand(0) &&
      // Don't increase # computations.
      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
    SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                            N0.getOperand(1), N1.getOperand(1));
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
  }

  return SDValue();
}

/// OR combines for which the commuted variant will be tried as well.
static SDValue visitORCommutative(
    SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::AND) {
    // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);

    // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
  }

  return SDValue();
}

SDValue DAGCombiner::visitOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N1.getValueType();

  // x | x --> x
  if (N0 == N1)
    return N0;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (or x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;

    // fold (or x, -1) -> -1, vector edition
    if (ISD::isBuildVectorAllOnes(N0.getNode()))
      // do not return N0, because undef node may exist in N0
      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
    if (ISD::isBuildVectorAllOnes(N1.getNode()))
      // do not return N1, because undef node may exist in N1
      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());

    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
    // Do this only if the resulting shuffle is legal.
    if (isa<ShuffleVectorSDNode>(N0) &&
        isa<ShuffleVectorSDNode>(N1) &&
        // Avoid folding a node with illegal type.
        TLI.isTypeLegal(VT)) {
      bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
      bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
      bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
      bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
      // Ensure both shuffles have a zero input.
      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
        assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
        assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
        const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
        const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
        bool CanFold = true;
        int NumElts = VT.getVectorNumElements();
        SmallVector<int, 4> Mask(NumElts);

        for (int i = 0; i != NumElts; ++i) {
          int M0 = SV0->getMaskElt(i);
          int M1 = SV1->getMaskElt(i);

          // Determine if either index is pointing to a zero vector.
          bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
          bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));

          // If one element is zero and the other side is undef, keep undef.
          // This also handles the case that both are undef.
          if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
            Mask[i] = -1;
            continue;
          }

          // Make sure only one of the elements is zero.
          if (M0Zero == M1Zero) {
            CanFold = false;
            break;
          }

          assert((M0 >= 0 || M1 >= 0) && "Undef index!");

          // We have a zero and non-zero element. If the non-zero came from
          // SV0 make the index a LHS index. If it came from SV1, make it
          // a RHS index. We need to mod by NumElts because we don't care
          // which operand it came from in the original shuffles.
          Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
        }

        if (CanFold) {
          SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
          SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);

          SDValue LegalShuffle =
              TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
                                          Mask, DAG);
          if (LegalShuffle)
            return LegalShuffle;
        }
      }
    }
  }

  // fold (or c1, c2) -> c1|c2
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);

  // fold (or x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  // fold (or x, -1) -> -1
  if (isAllOnesConstant(N1))
    return N1;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (or x, c) -> c iff (x & ~c) == 0
  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
    return N1;

  if (SDValue Combined = visitORLike(N0, N1, N))
    return Combined;

  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
    return BSwap;
  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
    return BSwap;

  // reassociate or
  if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
    return ROR;

  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
  // iff (c1 & c2) != 0 or c1/c2 are undef.
  auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
    return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
  };
  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
      ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
    if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
                                                 {N1, N0.getOperand(1)})) {
      SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
      AddToWorklist(IOR.getNode());
      return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
    }
  }

  // Try the commutative folds with both operand orders.
  if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
    return Combined;
  if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
    return Combined;

  // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
  if (N0.getOpcode() == N1.getOpcode())
    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
      return V;

  // See if this is some rotate idiom.
  if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
    return Rot;

  if (SDValue Load = MatchLoadCombine(N))
    return Load;

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // If OR can be rewritten into ADD, try combines based on ADD.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
      DAG.haveNoCommonBitsSet(N0, N1))
    if (SDValue Combined = visitADDLike(N))
      return Combined;

  return SDValue();
}

/// If \p Op is (and X, C) with a constant-int (or constant build-vector)
/// operand, return X and store the constant in \p Mask; otherwise return
/// \p Op unchanged and leave \p Mask untouched.
static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
  if (Op.getOpcode() == ISD::AND &&
      DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
    Mask = Op.getOperand(1);
    return Op.getOperand(0);
  }
  return Op;
}

/// Match "(X shl/srl V1) & V2" where V2 may not be present.
6128 static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift, 6129 SDValue &Mask) { 6130 Op = stripConstantMask(DAG, Op, Mask); 6131 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) { 6132 Shift = Op; 6133 return true; 6134 } 6135 return false; 6136 } 6137 6138 /// Helper function for visitOR to extract the needed side of a rotate idiom 6139 /// from a shl/srl/mul/udiv. This is meant to handle cases where 6140 /// InstCombine merged some outside op with one of the shifts from 6141 /// the rotate pattern. 6142 /// \returns An empty \c SDValue if the needed shift couldn't be extracted. 6143 /// Otherwise, returns an expansion of \p ExtractFrom based on the following 6144 /// patterns: 6145 /// 6146 /// (or (add v v) (shrl v bitwidth-1)): 6147 /// expands (add v v) -> (shl v 1) 6148 /// 6149 /// (or (mul v c0) (shrl (mul v c1) c2)): 6150 /// expands (mul v c0) -> (shl (mul v c1) c3) 6151 /// 6152 /// (or (udiv v c0) (shl (udiv v c1) c2)): 6153 /// expands (udiv v c0) -> (shrl (udiv v c1) c3) 6154 /// 6155 /// (or (shl v c0) (shrl (shl v c1) c2)): 6156 /// expands (shl v c0) -> (shl (shl v c1) c3) 6157 /// 6158 /// (or (shrl v c0) (shl (shrl v c1) c2)): 6159 /// expands (shrl v c0) -> (shrl (shrl v c1) c3) 6160 /// 6161 /// Such that in all cases, c3+c2==bitwidth(op v c1). 6162 static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, 6163 SDValue ExtractFrom, SDValue &Mask, 6164 const SDLoc &DL) { 6165 assert(OppShift && ExtractFrom && "Empty SDValue"); 6166 assert( 6167 (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) && 6168 "Existing shift must be valid as a rotate half"); 6169 6170 ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask); 6171 6172 // Value and Type of the shift. 6173 SDValue OppShiftLHS = OppShift.getOperand(0); 6174 EVT ShiftedVT = OppShiftLHS.getValueType(); 6175 6176 // Amount of the existing shift. 
6177 ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1)); 6178 6179 // (add v v) -> (shl v 1) 6180 // TODO: Should this be a general DAG canonicalization? 6181 if (OppShift.getOpcode() == ISD::SRL && OppShiftCst && 6182 ExtractFrom.getOpcode() == ISD::ADD && 6183 ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) && 6184 ExtractFrom.getOperand(0) == OppShiftLHS && 6185 OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1) 6186 return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS, 6187 DAG.getShiftAmountConstant(1, ShiftedVT, DL)); 6188 6189 // Preconditions: 6190 // (or (op0 v c0) (shiftl/r (op0 v c1) c2)) 6191 // 6192 // Find opcode of the needed shift to be extracted from (op0 v c0). 6193 unsigned Opcode = ISD::DELETED_NODE; 6194 bool IsMulOrDiv = false; 6195 // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift 6196 // opcode or its arithmetic (mul or udiv) variant. 6197 auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) { 6198 IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant; 6199 if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift) 6200 return false; 6201 Opcode = NeededShift; 6202 return true; 6203 }; 6204 // op0 must be either the needed shift opcode or the mul/udiv equivalent 6205 // that the needed shift can be extracted from. 6206 if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) && 6207 (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV))) 6208 return SDValue(); 6209 6210 // op0 must be the same opcode on both sides, have the same LHS argument, 6211 // and produce the same value type. 6212 if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() || 6213 OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) || 6214 ShiftedVT != ExtractFrom.getValueType()) 6215 return SDValue(); 6216 6217 // Constant mul/udiv/shift amount from the RHS of the shift's LHS op. 
6218 ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1)); 6219 // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op. 6220 ConstantSDNode *ExtractFromCst = 6221 isConstOrConstSplat(ExtractFrom.getOperand(1)); 6222 // TODO: We should be able to handle non-uniform constant vectors for these values 6223 // Check that we have constant values. 6224 if (!OppShiftCst || !OppShiftCst->getAPIntValue() || 6225 !OppLHSCst || !OppLHSCst->getAPIntValue() || 6226 !ExtractFromCst || !ExtractFromCst->getAPIntValue()) 6227 return SDValue(); 6228 6229 // Compute the shift amount we need to extract to complete the rotate. 6230 const unsigned VTWidth = ShiftedVT.getScalarSizeInBits(); 6231 if (OppShiftCst->getAPIntValue().ugt(VTWidth)) 6232 return SDValue(); 6233 APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue(); 6234 // Normalize the bitwidth of the two mul/udiv/shift constant operands. 6235 APInt ExtractFromAmt = ExtractFromCst->getAPIntValue(); 6236 APInt OppLHSAmt = OppLHSCst->getAPIntValue(); 6237 zeroExtendToMatch(ExtractFromAmt, OppLHSAmt); 6238 6239 // Now try extract the needed shift from the ExtractFrom op and see if the 6240 // result matches up with the existing shift's LHS op. 6241 if (IsMulOrDiv) { 6242 // Op to extract from is a mul or udiv by a constant. 6243 // Check: 6244 // c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0 6245 // c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0 6246 const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(), 6247 NeededShiftAmt.getZExtValue()); 6248 APInt ResultAmt; 6249 APInt Rem; 6250 APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem); 6251 if (Rem != 0 || ResultAmt != OppLHSAmt) 6252 return SDValue(); 6253 } else { 6254 // Op to extract from is a shift by a constant. 
6255 // Check: 6256 // c2 - (bitwidth(op0 v c0) - c1) == c0 6257 if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc( 6258 ExtractFromAmt.getBitWidth())) 6259 return SDValue(); 6260 } 6261 6262 // Return the expanded shift op that should allow a rotate to be formed. 6263 EVT ShiftVT = OppShift.getOperand(1).getValueType(); 6264 EVT ResVT = ExtractFrom.getValueType(); 6265 SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT); 6266 return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode); 6267 } 6268 6269 // Return true if we can prove that, whenever Neg and Pos are both in the 6270 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that 6271 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: 6272 // 6273 // (or (shift1 X, Neg), (shift2 X, Pos)) 6274 // 6275 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate 6276 // in direction shift1 by Neg. The range [0, EltSize) means that we only need 6277 // to consider shift amounts with defined behavior. 6278 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, 6279 SelectionDAG &DAG) { 6280 // If EltSize is a power of 2 then: 6281 // 6282 // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) 6283 // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). 6284 // 6285 // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check 6286 // for the stronger condition: 6287 // 6288 // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] 6289 // 6290 // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) 6291 // we can just replace Neg with Neg' for the rest of the function. 6292 // 6293 // In other cases we check for the even stronger condition: 6294 // 6295 // Neg == EltSize - Pos [B] 6296 // 6297 // for all Neg and Pos. Note that the (or ...) then invokes undefined 6298 // behavior if Pos == 0 (and consequently Neg == EltSize). 
6299 // 6300 // We could actually use [A] whenever EltSize is a power of 2, but the 6301 // only extra cases that it would match are those uninteresting ones 6302 // where Neg and Pos are never in range at the same time. E.g. for 6303 // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) 6304 // as well as (sub 32, Pos), but: 6305 // 6306 // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) 6307 // 6308 // always invokes undefined behavior for 32-bit X. 6309 // 6310 // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. 6311 unsigned MaskLoBits = 0; 6312 if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { 6313 if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { 6314 KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0)); 6315 unsigned Bits = Log2_64(EltSize); 6316 if (NegC->getAPIntValue().getActiveBits() <= Bits && 6317 ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) { 6318 Neg = Neg.getOperand(0); 6319 MaskLoBits = Bits; 6320 } 6321 } 6322 } 6323 6324 // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. 6325 if (Neg.getOpcode() != ISD::SUB) 6326 return false; 6327 ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); 6328 if (!NegC) 6329 return false; 6330 SDValue NegOp1 = Neg.getOperand(1); 6331 6332 // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with 6333 // Pos'. The truncation is redundant for the purpose of the equality. 
6334 if (MaskLoBits && Pos.getOpcode() == ISD::AND) { 6335 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) { 6336 KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0)); 6337 if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits && 6338 ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >= 6339 MaskLoBits)) 6340 Pos = Pos.getOperand(0); 6341 } 6342 } 6343 6344 // The condition we need is now: 6345 // 6346 // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask 6347 // 6348 // If NegOp1 == Pos then we need: 6349 // 6350 // EltSize & Mask == NegC & Mask 6351 // 6352 // (because "x & Mask" is a truncation and distributes through subtraction). 6353 // 6354 // We also need to account for a potential truncation of NegOp1 if the amount 6355 // has already been legalized to a shift amount type. 6356 APInt Width; 6357 if ((Pos == NegOp1) || 6358 (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0))) 6359 Width = NegC->getAPIntValue(); 6360 6361 // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. 6362 // Then the condition we want to prove becomes: 6363 // 6364 // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask 6365 // 6366 // which, again because "x & Mask" is a truncation, becomes: 6367 // 6368 // NegC & Mask == (EltSize - PosC) & Mask 6369 // EltSize & Mask == (NegC + PosC) & Mask 6370 else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { 6371 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) 6372 Width = PosC->getAPIntValue() + NegC->getAPIntValue(); 6373 else 6374 return false; 6375 } else 6376 return false; 6377 6378 // Now we just need to check that EltSize & Mask == Width & Mask. 6379 if (MaskLoBits) 6380 // EltSize & Mask is 0 since Mask is EltSize - 1. 6381 return Width.getLoBits(MaskLoBits) == 0; 6382 return Width == EltSize; 6383 } 6384 6385 // A subroutine of MatchRotate used once we have found an OR of two opposite 6386 // shifts of Shifted. 
If Neg == <operand size> - Pos then the OR reduces 6387 // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the 6388 // former being preferred if supported. InnerPos and InnerNeg are Pos and 6389 // Neg with outer conversions stripped away. 6390 SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, 6391 SDValue Neg, SDValue InnerPos, 6392 SDValue InnerNeg, unsigned PosOpcode, 6393 unsigned NegOpcode, const SDLoc &DL) { 6394 // fold (or (shl x, (*ext y)), 6395 // (srl x, (*ext (sub 32, y)))) -> 6396 // (rotl x, y) or (rotr x, (sub 32, y)) 6397 // 6398 // fold (or (shl x, (*ext (sub 32, y))), 6399 // (srl x, (*ext y))) -> 6400 // (rotr x, y) or (rotl x, (sub 32, y)) 6401 EVT VT = Shifted.getValueType(); 6402 if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) { 6403 bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); 6404 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, 6405 HasPos ? Pos : Neg); 6406 } 6407 6408 return SDValue(); 6409 } 6410 6411 // A subroutine of MatchRotate used once we have found an OR of two opposite 6412 // shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces 6413 // to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the 6414 // former being preferred if supported. InnerPos and InnerNeg are Pos and 6415 // Neg with outer conversions stripped away. 6416 // TODO: Merge with MatchRotatePosNeg. 
6417 SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, 6418 SDValue Neg, SDValue InnerPos, 6419 SDValue InnerNeg, unsigned PosOpcode, 6420 unsigned NegOpcode, const SDLoc &DL) { 6421 EVT VT = N0.getValueType(); 6422 unsigned EltBits = VT.getScalarSizeInBits(); 6423 6424 // fold (or (shl x0, (*ext y)), 6425 // (srl x1, (*ext (sub 32, y)))) -> 6426 // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y)) 6427 // 6428 // fold (or (shl x0, (*ext (sub 32, y))), 6429 // (srl x1, (*ext y))) -> 6430 // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y)) 6431 if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) { 6432 bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); 6433 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1, 6434 HasPos ? Pos : Neg); 6435 } 6436 6437 // Matching the shift+xor cases, we can't easily use the xor'd shift amount 6438 // so for now just use the PosOpcode case if its legal. 6439 // TODO: When can we use the NegOpcode case? 6440 if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) { 6441 auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) { 6442 if (Op.getOpcode() != BinOpc) 6443 return false; 6444 ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1)); 6445 return Cst && (Cst->getAPIntValue() == Imm); 6446 }; 6447 6448 // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31))) 6449 // -> (fshl x0, x1, y) 6450 if (IsBinOpImm(N1, ISD::SRL, 1) && 6451 IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) && 6452 InnerPos == InnerNeg.getOperand(0) && 6453 TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) { 6454 return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos); 6455 } 6456 6457 // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) 6458 // -> (fshr x0, x1, y) 6459 if (IsBinOpImm(N0, ISD::SHL, 1) && 6460 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && 6461 InnerNeg == InnerPos.getOperand(0) && 6462 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { 6463 return DAG.getNode(ISD::FSHR, DL, VT, 
N0.getOperand(0), N1, Neg); 6464 } 6465 6466 // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) 6467 // -> (fshr x0, x1, y) 6468 // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization? 6469 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) && 6470 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && 6471 InnerNeg == InnerPos.getOperand(0) && 6472 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { 6473 return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg); 6474 } 6475 } 6476 6477 return SDValue(); 6478 } 6479 6480 // MatchRotate - Handle an 'or' of two operands. If this is one of the many 6481 // idioms for rotate, and if the target supports rotation instructions, generate 6482 // a rot[lr]. This also matches funnel shift patterns, similar to rotation but 6483 // with different shifted sources. 6484 SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { 6485 // Must be a legal type. Expanded 'n promoted things won't work with rotates. 6486 EVT VT = LHS.getValueType(); 6487 if (!TLI.isTypeLegal(VT)) 6488 return SDValue(); 6489 6490 // The target must have at least one rotate/funnel flavor. 6491 bool HasROTL = hasOperation(ISD::ROTL, VT); 6492 bool HasROTR = hasOperation(ISD::ROTR, VT); 6493 bool HasFSHL = hasOperation(ISD::FSHL, VT); 6494 bool HasFSHR = hasOperation(ISD::FSHR, VT); 6495 if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR) 6496 return SDValue(); 6497 6498 // Check for truncated rotate. 6499 if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && 6500 LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { 6501 assert(LHS.getValueType() == RHS.getValueType()); 6502 if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { 6503 return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot); 6504 } 6505 } 6506 6507 // Match "(X shl/srl V1) & V2" where V2 may not be present. 6508 SDValue LHSShift; // The shift. 
6509 SDValue LHSMask; // AND value if any. 6510 matchRotateHalf(DAG, LHS, LHSShift, LHSMask); 6511 6512 SDValue RHSShift; // The shift. 6513 SDValue RHSMask; // AND value if any. 6514 matchRotateHalf(DAG, RHS, RHSShift, RHSMask); 6515 6516 // If neither side matched a rotate half, bail 6517 if (!LHSShift && !RHSShift) 6518 return SDValue(); 6519 6520 // InstCombine may have combined a constant shl, srl, mul, or udiv with one 6521 // side of the rotate, so try to handle that here. In all cases we need to 6522 // pass the matched shift from the opposite side to compute the opcode and 6523 // needed shift amount to extract. We still want to do this if both sides 6524 // matched a rotate half because one half may be a potential overshift that 6525 // can be broken down (ie if InstCombine merged two shl or srl ops into a 6526 // single one). 6527 6528 // Have LHS side of the rotate, try to extract the needed shift from the RHS. 6529 if (LHSShift) 6530 if (SDValue NewRHSShift = 6531 extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL)) 6532 RHSShift = NewRHSShift; 6533 // Have RHS side of the rotate, try to extract the needed shift from the LHS. 6534 if (RHSShift) 6535 if (SDValue NewLHSShift = 6536 extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL)) 6537 LHSShift = NewLHSShift; 6538 6539 // If a side is still missing, nothing else we can do. 6540 if (!RHSShift || !LHSShift) 6541 return SDValue(); 6542 6543 // At this point we've matched or extracted a shift op on each side. 6544 6545 if (LHSShift.getOpcode() == RHSShift.getOpcode()) 6546 return SDValue(); // Shifts must disagree. 6547 6548 bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); 6549 if (!IsRotate && !(HasFSHL || HasFSHR)) 6550 return SDValue(); // Requires funnel shift support. 6551 6552 // Canonicalize shl to left side in a shl/srl pair. 
6553 if (RHSShift.getOpcode() == ISD::SHL) { 6554 std::swap(LHS, RHS); 6555 std::swap(LHSShift, RHSShift); 6556 std::swap(LHSMask, RHSMask); 6557 } 6558 6559 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 6560 SDValue LHSShiftArg = LHSShift.getOperand(0); 6561 SDValue LHSShiftAmt = LHSShift.getOperand(1); 6562 SDValue RHSShiftArg = RHSShift.getOperand(0); 6563 SDValue RHSShiftAmt = RHSShift.getOperand(1); 6564 6565 // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) 6566 // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) 6567 // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) 6568 // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) 6569 // iff C1+C2 == EltSizeInBits 6570 auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, 6571 ConstantSDNode *RHS) { 6572 return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; 6573 }; 6574 if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { 6575 SDValue Res; 6576 if (IsRotate && (HasROTL || HasROTR)) 6577 Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, 6578 HasROTL ? LHSShiftAmt : RHSShiftAmt); 6579 else 6580 Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg, 6581 RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt); 6582 6583 // If there is an AND of either shifted operand, apply it to the result. 
6584 if (LHSMask.getNode() || RHSMask.getNode()) { 6585 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); 6586 SDValue Mask = AllOnes; 6587 6588 if (LHSMask.getNode()) { 6589 SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); 6590 Mask = DAG.getNode(ISD::AND, DL, VT, Mask, 6591 DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); 6592 } 6593 if (RHSMask.getNode()) { 6594 SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); 6595 Mask = DAG.getNode(ISD::AND, DL, VT, Mask, 6596 DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); 6597 } 6598 6599 Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask); 6600 } 6601 6602 return Res; 6603 } 6604 6605 // If there is a mask here, and we have a variable shift, we can't be sure 6606 // that we're masking out the right stuff. 6607 if (LHSMask.getNode() || RHSMask.getNode()) 6608 return SDValue(); 6609 6610 // If the shift amount is sign/zext/any-extended just peel it off. 6611 SDValue LExtOp0 = LHSShiftAmt; 6612 SDValue RExtOp0 = RHSShiftAmt; 6613 if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || 6614 LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || 6615 LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || 6616 LHSShiftAmt.getOpcode() == ISD::TRUNCATE) && 6617 (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || 6618 RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || 6619 RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || 6620 RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) { 6621 LExtOp0 = LHSShiftAmt.getOperand(0); 6622 RExtOp0 = RHSShiftAmt.getOperand(0); 6623 } 6624 6625 if (IsRotate && (HasROTL || HasROTR)) { 6626 SDValue TryL = 6627 MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, 6628 RExtOp0, ISD::ROTL, ISD::ROTR, DL); 6629 if (TryL) 6630 return TryL; 6631 6632 SDValue TryR = 6633 MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, 6634 LExtOp0, ISD::ROTR, ISD::ROTL, DL); 6635 if (TryR) 6636 return TryR; 6637 } 6638 6639 SDValue TryL = 6640 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, 
LHSShiftAmt, RHSShiftAmt, 6641 LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL); 6642 if (TryL) 6643 return TryL; 6644 6645 SDValue TryR = 6646 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt, 6647 RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL); 6648 if (TryR) 6649 return TryR; 6650 6651 return SDValue(); 6652 } 6653 6654 namespace { 6655 6656 /// Represents known origin of an individual byte in load combine pattern. The 6657 /// value of the byte is either constant zero or comes from memory. 6658 struct ByteProvider { 6659 // For constant zero providers Load is set to nullptr. For memory providers 6660 // Load represents the node which loads the byte from memory. 6661 // ByteOffset is the offset of the byte in the value produced by the load. 6662 LoadSDNode *Load = nullptr; 6663 unsigned ByteOffset = 0; 6664 6665 ByteProvider() = default; 6666 6667 static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { 6668 return ByteProvider(Load, ByteOffset); 6669 } 6670 6671 static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } 6672 6673 bool isConstantZero() const { return !Load; } 6674 bool isMemory() const { return Load; } 6675 6676 bool operator==(const ByteProvider &Other) const { 6677 return Other.Load == Load && Other.ByteOffset == ByteOffset; 6678 } 6679 6680 private: 6681 ByteProvider(LoadSDNode *Load, unsigned ByteOffset) 6682 : Load(Load), ByteOffset(ByteOffset) {} 6683 }; 6684 6685 } // end anonymous namespace 6686 6687 /// Recursively traverses the expression calculating the origin of the requested 6688 /// byte of the given value. Returns None if the provider can't be calculated. 6689 /// 6690 /// For all the values except the root of the expression verifies that the value 6691 /// has exactly one use and if it's not true return None. This way if the origin 6692 /// of the byte is returned it's guaranteed that the values which contribute to 6693 /// the byte are not used outside of this expression. 
6694 /// 6695 /// Because the parts of the expression are not allowed to have more than one 6696 /// use this function iterates over trees, not DAGs. So it never visits the same 6697 /// node more than once. 6698 static const Optional<ByteProvider> 6699 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, 6700 bool Root = false) { 6701 // Typical i64 by i8 pattern requires recursion up to 8 calls depth 6702 if (Depth == 10) 6703 return None; 6704 6705 if (!Root && !Op.hasOneUse()) 6706 return None; 6707 6708 assert(Op.getValueType().isScalarInteger() && "can't handle other types"); 6709 unsigned BitWidth = Op.getValueSizeInBits(); 6710 if (BitWidth % 8 != 0) 6711 return None; 6712 unsigned ByteWidth = BitWidth / 8; 6713 assert(Index < ByteWidth && "invalid index requested"); 6714 (void) ByteWidth; 6715 6716 switch (Op.getOpcode()) { 6717 case ISD::OR: { 6718 auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); 6719 if (!LHS) 6720 return None; 6721 auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); 6722 if (!RHS) 6723 return None; 6724 6725 if (LHS->isConstantZero()) 6726 return RHS; 6727 if (RHS->isConstantZero()) 6728 return LHS; 6729 return None; 6730 } 6731 case ISD::SHL: { 6732 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 6733 if (!ShiftOp) 6734 return None; 6735 6736 uint64_t BitShift = ShiftOp->getZExtValue(); 6737 if (BitShift % 8 != 0) 6738 return None; 6739 uint64_t ByteShift = BitShift / 8; 6740 6741 return Index < ByteShift 6742 ? 
ByteProvider::getConstantZero() 6743 : calculateByteProvider(Op->getOperand(0), Index - ByteShift, 6744 Depth + 1); 6745 } 6746 case ISD::ANY_EXTEND: 6747 case ISD::SIGN_EXTEND: 6748 case ISD::ZERO_EXTEND: { 6749 SDValue NarrowOp = Op->getOperand(0); 6750 unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); 6751 if (NarrowBitWidth % 8 != 0) 6752 return None; 6753 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 6754 6755 if (Index >= NarrowByteWidth) 6756 return Op.getOpcode() == ISD::ZERO_EXTEND 6757 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) 6758 : None; 6759 return calculateByteProvider(NarrowOp, Index, Depth + 1); 6760 } 6761 case ISD::BSWAP: 6762 return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, 6763 Depth + 1); 6764 case ISD::LOAD: { 6765 auto L = cast<LoadSDNode>(Op.getNode()); 6766 if (!L->isSimple() || L->isIndexed()) 6767 return None; 6768 6769 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); 6770 if (NarrowBitWidth % 8 != 0) 6771 return None; 6772 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 6773 6774 if (Index >= NarrowByteWidth) 6775 return L->getExtensionType() == ISD::ZEXTLOAD 6776 ? Optional<ByteProvider>(ByteProvider::getConstantZero()) 6777 : None; 6778 return ByteProvider::getMemory(L, Index); 6779 } 6780 } 6781 6782 return None; 6783 } 6784 6785 static unsigned LittleEndianByteAt(unsigned BW, unsigned i) { 6786 return i; 6787 } 6788 6789 static unsigned BigEndianByteAt(unsigned BW, unsigned i) { 6790 return BW - i - 1; 6791 } 6792 6793 // Check if the bytes offsets we are looking at match with either big or 6794 // little endian value loaded. Return true for big endian, false for little 6795 // endian, and None if match failed. 6796 static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, 6797 int64_t FirstOffset) { 6798 // The endian can be decided only when it is 2 bytes at least. 
6799 unsigned Width = ByteOffsets.size(); 6800 if (Width < 2) 6801 return None; 6802 6803 bool BigEndian = true, LittleEndian = true; 6804 for (unsigned i = 0; i < Width; i++) { 6805 int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; 6806 LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i); 6807 BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i); 6808 if (!BigEndian && !LittleEndian) 6809 return None; 6810 } 6811 6812 assert((BigEndian != LittleEndian) && "It should be either big endian or" 6813 "little endian"); 6814 return BigEndian; 6815 } 6816 6817 static SDValue stripTruncAndExt(SDValue Value) { 6818 switch (Value.getOpcode()) { 6819 case ISD::TRUNCATE: 6820 case ISD::ZERO_EXTEND: 6821 case ISD::SIGN_EXTEND: 6822 case ISD::ANY_EXTEND: 6823 return stripTruncAndExt(Value.getOperand(0)); 6824 } 6825 return Value; 6826 } 6827 6828 /// Match a pattern where a wide type scalar value is stored by several narrow 6829 /// stores. Fold it into a single store or a BSWAP and a store if the targets 6830 /// supports it. 6831 /// 6832 /// Assuming little endian target: 6833 /// i8 *p = ... 6834 /// i32 val = ... 6835 /// p[0] = (val >> 0) & 0xFF; 6836 /// p[1] = (val >> 8) & 0xFF; 6837 /// p[2] = (val >> 16) & 0xFF; 6838 /// p[3] = (val >> 24) & 0xFF; 6839 /// => 6840 /// *((i32)p) = val; 6841 /// 6842 /// i8 *p = ... 6843 /// i32 val = ... 6844 /// p[0] = (val >> 24) & 0xFF; 6845 /// p[1] = (val >> 16) & 0xFF; 6846 /// p[2] = (val >> 8) & 0xFF; 6847 /// p[3] = (val >> 0) & 0xFF; 6848 /// => 6849 /// *((i32)p) = BSWAP(val); 6850 SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { 6851 // Collect all the stores in the chain. 
  SDValue Chain;
  SmallVector<StoreSDNode *, 8> Stores;
  // Walk up the chain while it is a linear sequence of simple, unindexed
  // i8 truncating stores; any other node terminates the candidate set.
  for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
    // TODO: Allow unordered atomics when wider type is legal (see D66309)
    if (Store->getMemoryVT() != MVT::i8 ||
        !Store->isSimple() || Store->isIndexed())
      return SDValue();
    Stores.push_back(Store);
    Chain = Store->getChain();
  }
  // Handle the simple type only.
  unsigned Width = Stores.size();
  EVT VT = EVT::getIntegerVT(
    *DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
    return SDValue();

  // Check if all the bytes of the combined value we are looking at are stored
  // to the same base address. Collect bytes offsets from Base address into
  // ByteOffsets.
  SDValue CombinedValue;
  SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);
  int64_t FirstOffset = INT64_MAX;
  StoreSDNode *FirstStore = nullptr;
  Optional<BaseIndexOffset> Base;
  for (auto Store : Stores) {
    // All the stores store different byte of the CombinedValue. A truncate is
    // required to get that byte value.
    SDValue Trunc = Store->getValue();
    if (Trunc.getOpcode() != ISD::TRUNCATE)
      return SDValue();
    // A shift operation is required to get the right byte offset, except the
    // first byte.
    int64_t Offset = 0;
    SDValue Value = Trunc.getOperand(0);
    if (Value.getOpcode() == ISD::SRL ||
        Value.getOpcode() == ISD::SRA) {
      auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1));
      // Trying to match the following pattern. The shift offset must be
      // a constant and a multiple of 8. It is the byte offset in "y".
      //
      // x = srl y, offset
      // i8 z = trunc x
      // store z, ...
      if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
        return SDValue();

      Offset = ShiftOffset->getSExtValue()/8;
      Value = Value.getOperand(0);
    }

    // Stores must share the same combined value with different offsets.
    if (!CombinedValue)
      CombinedValue = Value;
    else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
      return SDValue();

    // The trunc and all the extend operation should be stripped to get the
    // real value we are stored.
    // NOTE(review): this widens CombinedValue to the widest matching value
    // seen so far and gives up if it stays narrower than the store width;
    // the interaction with mixed-width trunc/ext chains is subtle — confirm
    // against the endianness tests before touching this logic.
    else if (CombinedValue.getValueType() != VT) {
      if (Value.getValueType() == VT ||
          Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
        CombinedValue = Value;
      // Give up if the combined value type is smaller than the store size.
      if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
        return SDValue();
    }

    // Stores must share the same base address
    BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
    int64_t ByteOffsetFromBase = 0;
    if (!Base)
      Base = Ptr;
    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
      return SDValue();

    // Remember the first byte store
    if (ByteOffsetFromBase < FirstOffset) {
      FirstStore = Store;
      FirstOffset = ByteOffsetFromBase;
    }
    // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
    if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
      return SDValue();
    ByteOffsets[Offset] = ByteOffsetFromBase;
  }

  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
  assert(FirstStore && "First store must be set");

  // Check if the bytes of the combined value we are looking at match with
  // either big or little endian value store.
  Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
  if (!IsBigEndian.hasValue())
    return SDValue();

  // The node we are looking at matches with the pattern, check if we can
  // replace it with a single bswap if needed and store.

  // If the store needs byte swap check if the target supports it
  bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;

  // Before legalize we can introduce illegal bswaps which will be later
  // converted to an explicit bswap sequence. This way we end up with a single
  // store and byte shuffling instead of several stores and byte shuffling.
  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
    return SDValue();

  // Check that a store of the wide type is both allowed and fast on the target
  bool Fast = false;
  bool Allowed =
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *FirstStore->getMemOperand(), &Fast);
  if (!Allowed || !Fast)
    return SDValue();

  // Narrow a wider combined value down to the store width if needed.
  if (VT != CombinedValue.getValueType()) {
    assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
           "Get unexpected store value to combine");
    CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
                                CombinedValue);
  }

  if (NeedsBswap)
    CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);

  // The wide store takes over the chain collected above; it stores at the
  // address of the lowest-offset narrow store.
  SDValue NewStore =
    DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
                 FirstStore->getPointerInfo(), FirstStore->getAlignment());

  // Rely on other DAG combine rules to remove the other individual stores.
  DAG.ReplaceAllUsesWith(N, NewStore.getNode());
  return NewStore;
}

/// Match a pattern where a wide type scalar value is loaded by several narrow
/// loads and combined by shifts and ors. Fold it into a single load or a load
/// and a BSWAP if the targets supports it.
///
/// Assuming little endian target:
///  i8 *a = ...
///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
/// =>
///  i32 val = *((i32)a)
///
///  i8 *a = ...
///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
/// =>
///  i32 val = BSWAP(*((i32)a))
///
/// TODO: This rule matches complex patterns with OR node roots and doesn't
/// interact well with the worklist mechanism. When a part of the pattern is
/// updated (e.g. one of the loads) its direct users are put into the worklist,
/// but the root node of the pattern which triggers the load combine is not
/// necessarily a direct user of the changed node. For example, once the address
/// of t28 load is reassociated load combine won't be triggered:
///              t25: i32 = add t4, Constant:i32<2>
///            t26: i64 = sign_extend t25
///          t27: i64 = add t2, t26
///        t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
///      t29: i32 = zero_extend t28
///    t32: i32 = shl t29, Constant:i8<8>
///  t33: i32 = or t23, t32
/// As a possible fix visitLoad can check if the load can be a part of a load
/// combine pattern and add corresponding OR roots to the worklist.
SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
  assert(N->getOpcode() == ISD::OR &&
         "Can only match load combining against OR nodes");

  // Handles simple types only
  EVT VT = N->getValueType(0);
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
  unsigned ByteWidth = VT.getSizeInBits() / 8;

  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
  // Position of provider P's byte within its originating load, expressed in
  // the target's byte order.
  auto MemoryByteOffset = [&] (ByteProvider P) {
    assert(P.isMemory() && "Must be a memory byte provider");
    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
    assert(LoadBitWidth % 8 == 0 &&
           "can only analyze providers for individual bytes not bit");
    unsigned LoadByteWidth = LoadBitWidth / 8;
    return IsBigEndianTarget
            ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
            : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
  };

  Optional<BaseIndexOffset> Base;
  SDValue Chain;

  SmallPtrSet<LoadSDNode *, 8> Loads;
  Optional<ByteProvider> FirstByteProvider;
  int64_t FirstOffset = INT64_MAX;

  // Check if all the bytes of the OR we are looking at are loaded from the same
  // base address. Collect bytes offsets from Base address in ByteOffsets.
  SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
  unsigned ZeroExtendedBytes = 0;
  // Iterate from the most significant byte down so that constant-zero bytes
  // are only accepted as a contiguous run at the top (a zero-extension).
  for (int i = ByteWidth - 1; i >= 0; --i) {
    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
    if (!P)
      return SDValue();

    if (P->isConstantZero()) {
      // It's OK for the N most significant bytes to be 0, we can just
      // zero-extend the load.
      if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
        return SDValue();
      continue;
    }
    assert(P->isMemory() && "provenance should either be memory or zero");

    LoadSDNode *L = P->Load;
    assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
           !L->isIndexed() &&
           "Must be enforced by calculateByteProvider");
    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");

    // All loads must share the same chain
    SDValue LChain = L->getChain();
    if (!Chain)
      Chain = LChain;
    else if (Chain != LChain)
      return SDValue();

    // Loads must share the same base address
    BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
    int64_t ByteOffsetFromBase = 0;
    if (!Base)
      Base = Ptr;
    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
      return SDValue();

    // Calculate the offset of the current byte from the base address
    ByteOffsetFromBase += MemoryByteOffset(*P);
    ByteOffsets[i] = ByteOffsetFromBase;

    // Remember the first byte load
    if (ByteOffsetFromBase < FirstOffset) {
      FirstByteProvider = P;
      FirstOffset = ByteOffsetFromBase;
    }

    Loads.insert(L);
  }
  assert(!Loads.empty() && "All the bytes of the value must be loaded from "
         "memory, so there must be at least one load which produces the value");
  assert(Base && "Base address of the accessed memory location must be set");
  assert(FirstOffset != INT64_MAX && "First byte offset must be set");

  bool NeedsZext = ZeroExtendedBytes > 0;

  // The memory type covers only the non-zero bytes.
  EVT MemVT =
      EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);

  if (!MemVT.isSimple())
    return SDValue();

  // Before legalize we can introduce too wide illegal loads which will be later
  // split into legal sized loads. This enables us to combine i64 load by i8
  // patterns to a couple of i32 loads on 32 bit targets.
  if (LegalOperations &&
      !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
                            MemVT))
    return SDValue();

  // Check if the bytes of the OR we are looking at match with either big or
  // little endian value load. The zero-extended bytes do not participate in
  // the endianness check.
  Optional<bool> IsBigEndian = isBigEndian(
      makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
  if (!IsBigEndian.hasValue())
    return SDValue();

  assert(FirstByteProvider && "must be set");

  // Ensure that the first byte is loaded from zero offset of the first load.
  // So the combined value can be loaded from the first load address.
  if (MemoryByteOffset(*FirstByteProvider) != 0)
    return SDValue();
  LoadSDNode *FirstLoad = FirstByteProvider->Load;

  // The node we are looking at matches with the pattern, check if we can
  // replace it with a single (possibly zero-extended) load and bswap + shift if
  // needed.

  // If the load needs byte swap check if the target supports it
  bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;

  // Before legalize we can introduce illegal bswaps which will be later
  // converted to an explicit bswap sequence. This way we end up with a single
  // load and byte shuffling instead of several loads and byte shuffling.
  // We do not introduce illegal bswaps when zero-extending as this tends to
  // introduce too many arithmetic instructions.
  if (NeedsBswap && (LegalOperations || NeedsZext) &&
      !TLI.isOperationLegal(ISD::BSWAP, VT))
    return SDValue();

  // If we need to bswap and zero extend, we have to insert a shift. Check that
  // it is legal.
  if (NeedsBswap && NeedsZext && LegalOperations &&
      !TLI.isOperationLegal(ISD::SHL, VT))
    return SDValue();

  // Check that a load of the wide type is both allowed and fast on the target
  bool Fast = false;
  bool Allowed =
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                             *FirstLoad->getMemOperand(), &Fast);
  if (!Allowed || !Fast)
    return SDValue();

  // Emit the wide (possibly zero-extending) load at the first load's address.
  SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
                                   SDLoc(N), VT, Chain, FirstLoad->getBasePtr(),
                                   FirstLoad->getPointerInfo(), MemVT,
                                   FirstLoad->getAlignment());

  // Transfer chain users from old loads to the new load.
  for (LoadSDNode *L : Loads)
    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));

  if (!NeedsBswap)
    return NewLoad;

  // When zero-extending, shift the loaded bytes up first so the subsequent
  // bswap puts them into the correct (low) positions.
  SDValue ShiftedLoad =
      NeedsZext
          ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
                        DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
                                                   SDLoc(N), LegalOperations))
          : NewLoad;
  return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
}

// If the target has andn, bsl, or a similar bit-select instruction,
// we want to unfold masked merge, with canonical pattern of:
//   |        A        |  |B|
//   ((x ^ y) & m) ^ y
//    |  D  |
// Into:
//   (x & m) | (y & ~m)
// If y is a constant, and the 'andn' does not work with immediates,
// we unfold into a different pattern:
//   ~(~x & m) & (m | y)
// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
//       the very least that breaks andnpd / andnps patterns, and because those
//       patterns are simplified in IR and shouldn't be created in the DAG
SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
  assert(N->getOpcode() == ISD::XOR);

  // Don't touch 'not' (i.e. where y = -1).
  if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
    return SDValue();

  EVT VT = N->getValueType(0);

  // There are 3 commutable operators in the pattern,
  // so we have to deal with 8 possible variants of the basic pattern.
  SDValue X, Y, M;
  // Try to match (and (xor X, Y), M) ^ Other with the xor at operand XorIdx
  // of the and; on success captures X, Y and M for the caller.
  auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
    if (And.getOpcode() != ISD::AND || !And.hasOneUse())
      return false;
    SDValue Xor = And.getOperand(XorIdx);
    if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
      return false;
    SDValue Xor0 = Xor.getOperand(0);
    SDValue Xor1 = Xor.getOperand(1);
    // Don't touch 'not' (i.e. where y = -1).
    if (isAllOnesOrAllOnesSplat(Xor1))
      return false;
    // The outer xor operand must reappear inside the inner xor (as y).
    if (Other == Xor0)
      std::swap(Xor0, Xor1);
    if (Other != Xor1)
      return false;
    X = Xor0;
    Y = Xor1;
    M = And.getOperand(XorIdx ? 0 : 1);
    return true;
  };

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
      !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
    return SDValue();

  // Don't do anything if the mask is constant. This should not be reachable.
  // InstCombine should have already unfolded this pattern, and DAGCombiner
  // probably shouldn't produce it, too.
  if (isa<ConstantSDNode>(M.getNode()))
    return SDValue();

  // We can transform if the target has AndNot
  if (!TLI.hasAndNot(M))
    return SDValue();

  SDLoc DL(N);

  // If Y is a constant, check that 'andn' works with immediates.
  if (!TLI.hasAndNot(Y)) {
    assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
    // If not, we need to do a bit more work to make sure andn is still used.
    SDValue NotX = DAG.getNOT(DL, X, VT);
    SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
    SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
    SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
    return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
  }

  SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
  SDValue NotM = DAG.getNOT(DL, M, VT);
  SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);

  return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
}

SDValue DAGCombiner::visitXOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (xor x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N0.getNode()))
      return N1;
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
  SDLoc DL(N);
  if (N0.isUndef() && N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (xor x, undef) -> undef
  if (N0.isUndef())
    return N0;
  if (N1.isUndef())
    return N1;

  // fold (xor c1, c2) -> c1^c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

  // fold (xor x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // reassociate xor
  if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
    return RXOR;

  // fold !(x cc y) -> (x !cc y)
  unsigned N0Opcode = N0.getOpcode();
  SDValue LHS, RHS, CC;
  if (TLI.isConstTrueVal(N1.getNode()) &&
      isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {
    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                               LHS.getValueType());
    // Only invert the condition if the inverse is legal (or we're still
    // pre-legalization).
    if (!LegalOperations ||
        TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
      switch (N0Opcode) {
      default:
        llvm_unreachable("Unhandled SetCC Equivalent!");
      case ISD::SETCC:
        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
      case ISD::SELECT_CC:
        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
                               N0.getOperand(3), NotCC);
      case ISD::STRICT_FSETCC:
      case ISD::STRICT_FSETCCS: {
        if (N0.hasOneUse()) {
          // FIXME Can we handle multiple uses? Could we token factor the chain
          // results from the new/old setcc?
          // Strict setcc nodes carry a chain result which must be rewired to
          // the replacement node as well.
          SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
                                       N0.getOperand(0),
                                       N0Opcode == ISD::STRICT_FSETCCS);
          CombineTo(N, SetCC);
          DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
          recursivelyDeleteUnusedNodes(N0.getNode());
          return SDValue(N, 0); // Return N so it doesn't get rechecked!
        }
        break;
      }
      }
    }
  }

  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
  if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
    SDValue V = N0.getOperand(0);
    SDLoc DL0(N0);
    V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
                    DAG.getConstant(1, DL0, V.getValueType()));
    AddToWorklist(V.getNode());
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
  }

  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
  if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
    SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
    if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
      // De Morgan: the dual opcode with both operands inverted.
      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
      N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
      N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
      AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
      return DAG.getNode(NewOpcode, DL, VT, N00, N01);
    }
  }
  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
  if (isAllOnesConstant(N1) && N0.hasOneUse() &&
      (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
    SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
    if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
      unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
      N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
      N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
      AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
      return DAG.getNode(NewOpcode, DL, VT, N00, N01);
    }
  }

  // fold (not (neg x)) -> (add X, -1)
  // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
  // Y is a constant or the subtract has a single use.
  if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
      isNullConstant(N0.getOperand(0))) {
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
                       DAG.getAllOnesConstant(DL, VT));
  }

  // fold (not (add X, -1)) -> (neg X)
  if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
      isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                       N0.getOperand(0));
  }

  // fold (xor (and x, y), y) -> (and (not x), y)
  if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
    SDValue X = N0.getOperand(0);
    SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
    AddToWorklist(NotX.getNode());
    return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
  }

  if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
    ConstantSDNode *XorC = isConstOrConstSplat(N1);
    ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
    unsigned BitWidth = VT.getScalarSizeInBits();
    if (XorC && ShiftC) {
      // Don't crash on an oversized shift. We can not guarantee that a bogus
      // shift has been simplified to undef.
      uint64_t ShiftAmt = ShiftC->getLimitedValue();
      if (ShiftAmt < BitWidth) {
        APInt Ones = APInt::getAllOnesValue(BitWidth);
        Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
        if (XorC->getAPIntValue() == Ones) {
          // If the xor constant is a shifted -1, do a 'not' before the shift:
          // xor (X << ShiftC), XorC --> (not X) << ShiftC
          // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
          SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
          return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
        }
      }
    }
  }

  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
    SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
    SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
    if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
      SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
      SDValue S0 = S.getOperand(0);
      if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
        unsigned OpSizeInBits = VT.getScalarSizeInBits();
        // The sra amount must be exactly bitwidth-1 (the sign-splat idiom).
        if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
          if (C->getAPIntValue() == (OpSizeInBits - 1))
            return DAG.getNode(ISD::ABS, DL, VT, S0);
      }
    }
  }

  // fold (xor x, x) -> 0
  if (N0 == N1)
    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

  // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
  // Here is a concrete example of this equivalence:
  // i16   x == 14
  // i16 shl == 1 << 14  == 16384 == 0b0100000000000000
  // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
  //
  // =>
  //
  // i16     ~1      == 0b1111111111111110
  // i16 rol(~1, 14) == 0b1011111111111111
  //
  // Some additional tips to help conceptualize this transform:
  // - Try to see the operation as placing a single zero in a value of all ones.
  // - There exists no value for x which would allow the result to contain zero.
  // - Values of x larger than the bitwidth are undefined and do not require a
  //   consistent result.
  // - Pushing the zero left requires shifting one bits in from the right.
  //   A rotate left of ~1 is a nice way of achieving the desired result.
  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
      isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
    return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
                       N0.getOperand(1));
  }

  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
  if (N0Opcode == N1.getOpcode())
    if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
      return V;

  // Unfold  ((x ^ y) & m) ^ y  into  (x & m) | (y & ~m)  if profitable
  if (SDValue MM = unfoldMaskedMerge(N))
    return MM;

  // Simplify the expression using non-local knowledge.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
    return Combined;

  return SDValue();
}

/// If we have a shift-by-constant of a bitwise logic op that itself has a
/// shift-by-constant operand with identical opcode, we may be able to convert
/// that into 2 independent shifts followed by the logic op. This is a
/// throughput improvement.
static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
  // Match a one-use bitwise logic op.
  SDValue LogicOp = Shift->getOperand(0);
  if (!LogicOp.hasOneUse())
    return SDValue();

  unsigned LogicOpcode = LogicOp.getOpcode();
  if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
      LogicOpcode != ISD::XOR)
    return SDValue();

  // Find a matching one-use shift by constant.
  unsigned ShiftOpcode = Shift->getOpcode();
  SDValue C1 = Shift->getOperand(1);
  ConstantSDNode *C1Node = isConstOrConstSplat(C1);
  assert(C1Node && "Expected a shift with constant operand");
  const APInt &C1Val = C1Node->getAPIntValue();
  // Match a one-use shift (same opcode as the outer shift) by a constant;
  // on success captures the shifted operand and its shift amount.
  auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
                             const APInt *&ShiftAmtVal) {
    if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
      return false;

    ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
    if (!ShiftCNode)
      return false;

    // Capture the shifted operand and shift amount value.
    ShiftOp = V.getOperand(0);
    ShiftAmtVal = &ShiftCNode->getAPIntValue();

    // Shift amount types do not have to match their operand type, so check that
    // the constants are the same width.
    if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
      return false;

    // The fold is not valid if the sum of the shift values exceeds bitwidth.
    if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
      return false;

    return true;
  };

  // Logic ops are commutative, so check each operand for a match.
  SDValue X, Y;
  const APInt *C0Val;
  if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
    Y = LogicOp.getOperand(1);
  else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
    Y = LogicOp.getOperand(0);
  else
    return SDValue();

  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
  SDLoc DL(Shift);
  EVT VT = Shift->getValueType(0);
  EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
  SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
  SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
  SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
  return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
}

/// Handle transforms common to the three shifts, when the shift amount is a
/// constant.
/// We are looking for: (shift being one of shl/sra/srl)
///   shift (binop X, C0), C1
/// And want to transform into:
///   binop (shift X, C1), (shift C0, C1)
SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
  assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");

  // Do not turn a 'not' into a regular xor.
  if (isBitwiseNot(N->getOperand(0)))
    return SDValue();

  // The inner binop must be one-use, since we want to replace it.
  SDValue LHS = N->getOperand(0);
  if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
    return SDValue();

  // TODO: This is limited to early combining because it may reveal regressions
  //       otherwise. But since we just checked a target hook to see if this is
  //       desirable, that should have filtered out cases where this interferes
  //       with some other pattern matching.
  if (!LegalTypes)
    if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
      return R;

  // We want to pull some binops through shifts, so that we have (and (shift))
  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
  // thing happens with address calculations, so it's important to canonicalize
  // it.
  switch (LHS.getOpcode()) {
  default:
    return SDValue();
  case ISD::OR:
  case ISD::XOR:
  case ISD::AND:
    break;
  case ISD::ADD:
    if (N->getOpcode() != ISD::SHL)
      return SDValue(); // only shl(add) not sr[al](add).
    break;
  }

  // We require the RHS of the binop to be a constant and not opaque as well.
  ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
  if (!BinOpCst)
    return SDValue();

  // FIXME: disable this unless the input to the binop is a shift by a constant
  // or is copy/select. Enable this in other cases when figure out it's exactly
  // profitable.
  SDValue BinOpLHSVal = LHS.getOperand(0);
  bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
                            BinOpLHSVal.getOpcode() == ISD::SRA ||
                            BinOpLHSVal.getOpcode() == ISD::SRL) &&
                           isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
  bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
                        BinOpLHSVal.getOpcode() == ISD::SELECT;

  if (!IsShiftByConstant && !IsCopyOrSelect)
    return SDValue();

  if (IsCopyOrSelect && N->hasOneUse())
    return SDValue();

  // Fold the constants, shifting the binop RHS by the shift amount.
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  // Shifting two constants must fold; the assert guards that assumption.
  SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
                               N->getOperand(1));
  assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");

  SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
                                 N->getOperand(1));
  return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
}

// Push a truncate through an 'and' with a constant operand:
// (truncate (and X, C)) -> (and (truncate X), (truncate C)).
SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
  assert(N->getOpcode() == ISD::TRUNCATE);
  assert(N->getOperand(0).getOpcode() == ISD::AND);

  // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
  EVT TruncVT = N->getValueType(0);
  if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
      TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
    SDValue N01 = N->getOperand(0).getOperand(1);
    if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
      SDLoc DL(N);
      SDValue N00 = N->getOperand(0).getOperand(0);
      SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
      SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
      AddToWorklist(Trunc00.getNode());
      AddToWorklist(Trunc01.getNode());
      return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
    }
  }

  return SDValue();
}

// Common combines for ISD::ROTL / ISD::ROTR.
SDValue DAGCombiner::visitRotate(SDNode *N) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  unsigned Bitsize = VT.getScalarSizeInBits();

  // fold (rot x, 0) -> x
  if (isNullOrNullSplat(N1))
    return N0;

  // fold (rot x, c) -> x iff (c % BitSize) == 0
  if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
    // For a power-of-two bitsize, c % Bitsize == 0 iff the low log2(Bitsize)
    // bits of the amount are all known zero.
    APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
    if (DAG.MaskedValueIsZero(N1, ModuloMask))
      return N0;
  }

  // fold (rot x, c) -> (rot x, c % BitSize)
  bool OutOfRange = false;
  auto
MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
    // Record whether any constant rotate amount is >= the bit width; always
    // return true so every splat/build_vector element is inspected.
    OutOfRange |= C->getAPIntValue().uge(Bitsize);
    return true;
  };
  if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
    EVT AmtVT = N1.getValueType();
    SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
    if (SDValue Amt =
            DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
      return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
  }

  // rot i16 X, 8 --> bswap X
  auto *RotAmtC = isConstOrConstSplat(N1);
  if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
      VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
    return DAG.getNode(ISD::BSWAP, dl, VT, N0);

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
  }

  unsigned NextOp = N0.getOpcode();
  // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
  if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
    SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
    SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
    if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
      EVT ShiftVT = C1->getValueType(0);
      // Same-direction rotates add their amounts; opposite directions
      // subtract. SREM keeps the combined amount in range either way.
      bool SameSide = (N->getOpcode() == NextOp);
      unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
      if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
              CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {
        SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
        SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
            ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});
        return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
                           CombinedShiftNorm);
      }
    }
  }
  return SDValue();
}

/// Combine ISD::SHL nodes. Folds are attempted in order; each early-returns
/// on success, so the ordering below is significant.
SDValue DAGCombiner::visitSHL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
    return V;

  EVT VT = N0.getValueType();
  EVT ShiftVT = N1.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
    // If setcc produces all-one true value then:
    // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
    if (N1CV && N1CV->isConstant()) {
      if (N0.getOpcode() == ISD::AND) {
        SDValue N00 = N0->getOperand(0);
        SDValue N01 = N0->getOperand(1);
        BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);

        // Only valid when booleans are all-ones, so the shifted mask still
        // selects the full setcc result.
        if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
            TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                TargetLowering::ZeroOrNegativeOneBooleanContent) {
          if (SDValue C =
                  DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
            return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
        }
      }
    }
  }

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (shl c1, c2) -> c1<<c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // if (shl x, c) is known to
  // be zero, return 0
  if (DAG.MaskedValueIsZero(SDValue(N, 0),
                            APInt::getAllOnesValue(OpSizeInBits)))
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
  }

  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
  if (N0.getOpcode() == ISD::SHL) {
    // The sum is computed with one extra overflow bit so (c1 + c2) cannot
    // wrap before the range comparison.
    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                          ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                       ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
      SDLoc DL(N);
      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
    }
  }

  // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
  // For this to be valid, the second form must not preserve any of the bits
  // that are shifted out by the inner shift in the first form. This means
  // the outer shift size must be >= the number of bits added by the ext.
  // As a corollary, we don't care what kind of ext it is.
  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::ANY_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND) &&
      N0.getOperand(0).getOpcode() == ISD::SHL) {
    SDValue N0Op0 = N0.getOperand(0);
    SDValue InnerShiftAmt = N0Op0.getOperand(1);
    EVT InnerVT = N0Op0.getValueType();
    uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();

    auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
                                                         ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return c2.uge(OpSizeInBits - InnerBitwidth) &&
             (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
                                                      ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return c2.uge(OpSizeInBits - InnerBitwidth) &&
             (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
      // The inner amount may have a different type; normalize it to ShiftVT
      // before forming the sum.
      SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
      Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
      return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
    }
  }

  // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
  // Only fold this if the inner zext has no other uses to avoid increasing
  // the total number of instructions.
  if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::SRL) {
    SDValue N0Op0 = N0.getOperand(0);
    SDValue InnerShiftAmt = N0Op0.getOperand(1);

    auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2);
      return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
    };
    if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
      SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
      NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
      AddToWorklist(NewSHL.getNode());
      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
    }
  }

  // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
  // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
  // TODO - support non-uniform vector shift amounts.
  if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
      N0->getFlags().hasExact()) {
    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
      uint64_t C1 = N0C1->getZExtValue();
      uint64_t C2 = N1C->getZExtValue();
      SDLoc DL(N);
      if (C1 <= C2)
        return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                           DAG.getConstant(C2 - C1, DL, ShiftVT));
      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
                         DAG.getConstant(C1 - C2, DL, ShiftVT));
    }
  }

  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
  //                               (and (srl x, (sub c1, c2), MASK)
  // Only fold this if the inner shift has no other uses -- if it does, folding
  // this will increase the total number of instructions.
  // TODO - drop hasOneUse requirement if c1 == c2?
  // TODO - support non-uniform vector shift amounts.
  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
      if (N0C1->getAPIntValue().ult(OpSizeInBits)) {
        uint64_t c1 = N0C1->getZExtValue();
        uint64_t c2 = N1C->getZExtValue();
        // Start from the mask of bits surviving the inner SRL, then shift it
        // the same direction/amount as the residual shift below.
        APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
        SDValue Shift;
        if (c2 > c1) {
          Mask <<= c2 - c1;
          SDLoc DL(N);
          Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                              DAG.getConstant(c2 - c1, DL, ShiftVT));
        } else {
          Mask.lshrInPlace(c1 - c2);
          SDLoc DL(N);
          Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
                              DAG.getConstant(c1 - c2, DL, ShiftVT));
        }
        SDLoc DL(N0);
        return DAG.getNode(ISD::AND, DL, VT, Shift,
                           DAG.getConstant(Mask, DL, VT));
      }
    }
  }

  // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
  if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
      isConstantOrConstantVector(N1, /* No Opaques */ true)) {
    SDLoc DL(N);
    SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
    SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
  }

  // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Variant of version done on multiply, except mul by a power of 2 is turned
  // into a shift.
  if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
      N0.getNode()->hasOneUse() &&
      isConstantOrConstantVector(N1, /* No Opaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
      TLI.isDesirableToCommuteWithShift(N, Level)) {
    SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
    SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
    AddToWorklist(Shl0.getNode());
    AddToWorklist(Shl1.getNode());
    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
  }

  // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
  if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
      isConstantOrConstantVector(N1, /* No Opaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
    SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
    // Only commit if shifting the constant operand actually folded.
    if (isConstantOrConstantVector(Shl))
      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
  }

  if (N1C && !N1C->isOpaque())
    if (SDValue NewSHL = visitShiftByConstant(N))
      return NewSHL;

  // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
  if (N0.getOpcode() == ISD::VSCALE)
    if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
      auto DL = SDLoc(N);
      APInt C0 = N0.getConstantOperandAPInt(0);
      APInt C1 = NC1->getAPIntValue();
      return DAG.getVScale(DL, VT, C0 << C1);
    }

  return SDValue();
}

// Transform a right shift of a multiply into a multiply-high.
// Examples:
// (srl (mul (zext i32:$a to i64), (zext i32:$a to i64)), 32) -> (mulhu $a, $b)
// (sra (mul (sext i32:$a to i64), (sext i32:$a to i64)), 32) -> (mulhs $a, $b)
static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
                                  const TargetLowering &TLI) {
  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
         "SRL or SRA node is required here!");

  // Check the shift amount. Proceed with the transformation if the shift
  // amount is constant.
  ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
  if (!ShiftAmtSrc)
    return SDValue();

  SDLoc DL(N);

  // The operation feeding into the shift must be a multiply.
  SDValue ShiftOperand = N->getOperand(0);
  if (ShiftOperand.getOpcode() != ISD::MUL)
    return SDValue();

  // Both operands must be equivalent extend nodes.
  SDValue LeftOp = ShiftOperand.getOperand(0);
  SDValue RightOp = ShiftOperand.getOperand(1);
  bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
  bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;

  if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode())
    return SDValue();

  EVT WideVT1 = LeftOp.getValueType();
  EVT WideVT2 = RightOp.getValueType();
  (void)WideVT2;
  // Proceed with the transformation if the wide types match.
  assert((WideVT1 == WideVT2) &&
         "Cannot have a multiply node with two different operand types.");

  EVT NarrowVT = LeftOp.getOperand(0).getValueType();
  // Check that the two extend nodes are the same type.
  if (NarrowVT != RightOp.getOperand(0).getValueType())
    return SDValue();

  // Only transform into mulh if mulh for the narrow type is cheaper than
  // a multiply followed by a shift. This should also check if mulh is
  // legal for NarrowVT on the target.
  if (!TLI.isMulhCheaperThanMulShift(NarrowVT))
    return SDValue();

  // Proceed with the transformation if the wide type is twice as large
  // as the narrow type.
  unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
  if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize)
    return SDValue();

  // Check the shift amount with the narrow type size.
  // Proceed with the transformation if the shift amount is the width
  // of the narrow type.
  unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
  if (ShiftAmt != NarrowVTSize)
    return SDValue();

  // If the operation feeding into the MUL is a sign extend (sext),
  // we use mulhs. Otherwise, zero extends (zext) use mulhu.
  unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;

  // Extend the narrow mulh result back to the original wide type, matching
  // the signedness of the original shift.
  SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0),
                               RightOp.getOperand(0));
  return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1)
                                     : DAG.getZExtOrTrunc(Result, DL, WideVT1));
}

/// Combine ISD::SRA nodes. Folds are attempted in order; each early-returns
/// on success.
SDValue DAGCombiner::visitSRA(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
    return V;

  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();

  // Arithmetic shifting an all-sign-bit value is a no-op.
  // fold (sra 0, x) -> 0
  // fold (sra -1, x) -> -1
  if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
    return N0;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (sra c1, c2) -> (sra c1, c2)
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
  // sext_inreg.
  if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
    EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
    if (VT.isVector())
      ExtVT = EVT::getVectorVT(*DAG.getContext(),
                               ExtVT, VT.getVectorNumElements());
    if (!LegalOperations ||
        TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
            TargetLowering::Legal)
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                         N0.getOperand(0), DAG.getValueType(ExtVT));
  }

  // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
  // clamp (add c1, c2) to max shift.
  if (N0.getOpcode() == ISD::SRA) {
    SDLoc DL(N);
    EVT ShiftVT = N1.getValueType();
    EVT ShiftSVT = ShiftVT.getScalarType();
    SmallVector<SDValue, 16> ShiftValues;

    auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      APInt Sum = c1 + c2;
      // An SRA amount >= the bit width is clamped to width-1 (the result is
      // then all sign bits).
      unsigned ShiftSum =
          Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
      ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
      return true;
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
      SDValue ShiftValue;
      if (VT.isVector())
        ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
      else
        ShiftValue = ShiftValues[0];
      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
    }
  }

  // fold (sra (shl X, m), (sub result_size, n))
  // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
  // result_size - n != m.
  // If truncate is free for the target sext(shl) is likely to result in better
  // code.
  if (N0.getOpcode() == ISD::SHL && N1C) {
    // Get the two constants of the shifts, CN0 = m, CN = n.
    const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
    if (N01C) {
      LLVMContext &Ctx = *DAG.getContext();
      // Determine what the truncate's result bitsize and type would be.
      EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());

      if (VT.isVector())
        TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

      // Determine the residual right-shift amount.
      int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();

      // If the shift is not a no-op (in which case this should be just a sign
      // extend already), the truncated to type is legal, sign_extend is legal
      // on that type, and the truncate to that type is both legal and free,
      // perform the transform.
      if ((ShiftAmt > 0) &&
          TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
          TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
          TLI.isTruncateFree(VT, TruncVT)) {
        SDLoc DL(N);
        SDValue Amt = DAG.getConstant(ShiftAmt, DL,
            getShiftAmountTy(N0.getOperand(0).getValueType()));
        SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
                                    N0.getOperand(0), Amt);
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
                                    Shift);
        return DAG.getNode(ISD::SIGN_EXTEND, DL,
                           N->getValueType(0), Trunc);
      }
    }
  }

  // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
  //   sra (add (shl X, N1C), AddC), N1C -->
  //   sext (add (trunc X to (width - N1C)), AddC')
  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
      N0.getOperand(0).getOpcode() == ISD::SHL &&
      N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
    if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
      SDValue Shl = N0.getOperand(0);
      // Determine what the truncate's type would be and ask the target if that
      // is a free operation.
      LLVMContext &Ctx = *DAG.getContext();
      unsigned ShiftAmt = N1C->getZExtValue();
      EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
      if (VT.isVector())
        TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

      // TODO: The simple type check probably belongs in the default hook
      // implementation and/or target-specific overrides (because
      // non-simple types likely require masking when legalized), but that
      // restriction may conflict with other transforms.
      if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
          TLI.isTruncateFree(VT, TruncVT)) {
        SDLoc DL(N);
        SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
        // AddC' = AddC >> N1C, truncated to the narrow type.
        SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
                             trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT);
        SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
        return DAG.getSExtOrTrunc(Add, DL, VT);
      }
    }
  }

  // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
  }

  // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
  // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
  // if c1 is equal to the number of bits the trunc removes
  // TODO - support non-uniform vector shift amounts.
  if (N0.getOpcode() == ISD::TRUNCATE &&
      (N0.getOperand(0).getOpcode() == ISD::SRL ||
       N0.getOperand(0).getOpcode() == ISD::SRA) &&
      N0.getOperand(0).hasOneUse() &&
      N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
    SDValue N0Op0 = N0.getOperand(0);
    if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
      EVT LargeVT = N0Op0.getValueType();
      unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
      if (LargeShift->getAPIntValue() == TruncBits) {
        SDLoc DL(N);
        SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL,
                                      getShiftAmountTy(LargeVT));
        SDValue SRA =
            DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
      }
    }
  }

  // Simplify, based on bits shifted out of the LHS.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // If the sign bit is known to be zero, switch this to a SRL.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);

  if (N1C && !N1C->isOpaque())
    if (SDValue NewSRA = visitShiftByConstant(N))
      return NewSRA;

  // Try to transform this shift into a multiply-high if
  // it matches the appropriate pattern detected in combineShiftToMULH.
  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
    return MULH;

  return SDValue();
}

/// Combine ISD::SRL nodes. Folds are attempted in order; each early-returns
/// on success.
SDValue DAGCombiner::visitSRL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
    return V;

  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold (srl c1, c2) -> c1 >>u c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // if (srl x, c) is known to be zero, return 0
  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                   APInt::getAllOnesValue(OpSizeInBits)))
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
  if (N0.getOpcode() == ISD::SRL) {
    // The sum is computed with one extra overflow bit so (c1 + c2) cannot
    // wrap before the range comparison.
    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                          ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).uge(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
      return DAG.getConstant(0, SDLoc(N), VT);

    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                       ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).ult(OpSizeInBits);
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
      SDLoc DL(N);
      EVT ShiftVT = N1.getValueType();
      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
    }
  }

  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(0).getOpcode() == ISD::SRL) {
    SDValue InnerShift = N0.getOperand(0);
    // TODO - support non-uniform vector shift amounts.
    if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
      uint64_t c1 = N001C->getZExtValue();
      uint64_t c2 = N1C->getZExtValue();
      EVT InnerShiftVT = InnerShift.getValueType();
      EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
      // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
      if (c1 + OpSizeInBits == InnerShiftSize) {
        SDLoc DL(N);
        if (c1 + c2 >= InnerShiftSize)
          return DAG.getConstant(0, DL, VT);
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
      }
      // In the more general case, we can clear the high bits after the shift:
      // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
      if (N0.hasOneUse() && InnerShift.hasOneUse() &&
          c1 + c2 < InnerShiftSize) {
        SDLoc DL(N);
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
                                                            OpSizeInBits - c2),
                                       DL, InnerShiftVT);
        SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
      }
    }
  }

  // fold (srl (shl x, c), c) -> (and x, cst2)
  // TODO - (srl (shl x, c1), c2).
  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
    SDLoc DL(N);
    // Mask of the bits that survive shl-then-srl: all-ones >> c.
    SDValue Mask =
        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
    AddToWorklist(Mask.getNode());
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
  }

  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
  // TODO - support non-uniform vector shift amounts.
  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
    // Shifting in all undef bits?
    EVT SmallVT = N0.getOperand(0).getValueType();
    unsigned BitSize = SmallVT.getScalarSizeInBits();
    if (N1C->getAPIntValue().uge(BitSize))
      return DAG.getUNDEF(VT);

    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
      uint64_t ShiftAmt = N1C->getZExtValue();
      SDLoc DL0(N0);
      SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
                                       N0.getOperand(0),
                                       DAG.getConstant(ShiftAmt, DL0,
                                           getShiftAmountTy(SmallVT)));
      AddToWorklist(SmallShift.getNode());
      // Mask off the (previously undef) high bits shifted in by the SRL.
      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT,
                         DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
                         DAG.getConstant(Mask, DL, VT));
    }
  }

  // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
  // bit, which is unmodified by sra.
  if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
    if (N0.getOpcode() == ISD::SRA)
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
  }

  // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
  if (N1C && N0.getOpcode() == ISD::CTLZ &&
      N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
    KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));

    // If any of the input bits are KnownOne, then the input couldn't be all
    // zeros, thus the result of the srl will always be zero.
    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);

    // If all of the bits input the to ctlz node are known to be zero, then
    // the result of the ctlz is "32" and the result of the shift is one.
    APInt UnknownBits = ~Known.Zero;
    if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);

    // Otherwise, check to see if there is exactly one bit input to the ctlz.
    if (UnknownBits.isPowerOf2()) {
      // Okay, we know that only that the single bit specified by UnknownBits
      // could be set on input to the CTLZ node. If this bit is set, the SRL
      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
      // to an SRL/XOR pair, which is likely to simplify more.
      unsigned ShAmt = UnknownBits.countTrailingZeros();
      SDValue Op = N0.getOperand(0);

      if (ShAmt) {
        SDLoc DL(N0);
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getConstant(ShAmt, DL,
                             getShiftAmountTy(Op.getValueType())));
        AddToWorklist(Op.getNode());
      }

      SDLoc DL(N);
      return DAG.getNode(ISD::XOR, DL, VT,
                         Op, DAG.getConstant(1, DL, VT));
    }
  }

  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
  }

  // fold operands of srl based on knowledge that the low bits are not
  // demanded.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  if (N1C && !N1C->isOpaque())
    if (SDValue NewSRL = visitShiftByConstant(N))
      return NewSRL;

  // Attempt to convert a srl of a load into a narrower zero-extending load.
  if (SDValue NarrowLoad = ReduceLoadWidth(N))
    return NarrowLoad;

  // Here is a common situation. We want to optimize:
  //
  //   %a = ...
  //   %b = and i32 %a, 2
  //   %c = srl i32 %b, 1
  //   brcond i32 %c ...
  //
  // into
  //
  //   %a = ...
  //   %b = and %a, 2
  //   %c = setcc eq %b, 0
  //   brcond %c ...
  //
  // However when after the source operand of SRL is optimized into AND, the SRL
  // itself may not be optimized further. Look for it and add the BRCOND into
  // the worklist.
  if (N->hasOneUse()) {
    SDNode *Use = *N->use_begin();
    if (Use->getOpcode() == ISD::BRCOND)
      AddToWorklist(Use);
    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
      // Also look pass the truncate.
      Use = *Use->use_begin();
      if (Use->getOpcode() == ISD::BRCOND)
        AddToWorklist(Use);
    }
  }

  // Try to transform this shift into a multiply-high if
  // it matches the appropriate pattern detected in combineShiftToMULH.
  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
    return MULH;

  return SDValue();
}

/// Combine ISD::FSHL / ISD::FSHR (funnel shift) nodes.
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  bool IsFSHL = N->getOpcode() == ISD::FSHL;
  unsigned BitWidth = VT.getScalarSizeInBits();

  // fold (fshl N0, N1, 0) -> N0
  // fold (fshr N0, N1, 0) -> N1
  if (isPowerOf2_32(BitWidth))
    if (DAG.MaskedValueIsZero(
            N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
      return IsFSHL ? N0 : N1;

  auto IsUndefOrZero = [](SDValue V) {
    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
  };

  // TODO - support non-uniform vector shift amounts.
  if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
    EVT ShAmtTy = N2.getValueType();

    // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
    if (Cst->getAPIntValue().uge(BitWidth)) {
      uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
                         DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
    }

    unsigned ShAmt = Cst->getZExtValue();
    if (ShAmt == 0)
      return IsFSHL ?
N0 : N1; 8522 8523 // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C) 8524 // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C) 8525 // fold fshl(N0, undef_or_zero, C) -> shl(N0, C) 8526 // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C) 8527 if (IsUndefOrZero(N0)) 8528 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, 8529 DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, 8530 SDLoc(N), ShAmtTy)); 8531 if (IsUndefOrZero(N1)) 8532 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, 8533 DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, 8534 SDLoc(N), ShAmtTy)); 8535 8536 // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. 8537 // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. 8538 // TODO - bigendian support once we have test coverage. 8539 // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine? 8540 // TODO - permit LHS EXTLOAD if extensions are shifted out. 8541 if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && 8542 !DAG.getDataLayout().isBigEndian()) { 8543 auto *LHS = dyn_cast<LoadSDNode>(N0); 8544 auto *RHS = dyn_cast<LoadSDNode>(N1); 8545 if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && 8546 LHS->getAddressSpace() == RHS->getAddressSpace() && 8547 (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && 8548 ISD::isNON_EXTLoad(LHS)) { 8549 if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { 8550 SDLoc DL(RHS); 8551 uint64_t PtrOff = 8552 IsFSHL ? 
(((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8); 8553 Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff); 8554 bool Fast = false; 8555 if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 8556 RHS->getAddressSpace(), NewAlign, 8557 RHS->getMemOperand()->getFlags(), &Fast) && 8558 Fast) { 8559 SDValue NewPtr = 8560 DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL); 8561 AddToWorklist(NewPtr.getNode()); 8562 SDValue Load = DAG.getLoad( 8563 VT, DL, RHS->getChain(), NewPtr, 8564 RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign, 8565 RHS->getMemOperand()->getFlags(), RHS->getAAInfo()); 8566 // Replace the old load's chain with the new load's chain. 8567 WorklistRemover DeadNodes(*this); 8568 DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1)); 8569 return Load; 8570 } 8571 } 8572 } 8573 } 8574 } 8575 8576 // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) 8577 // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2) 8578 // iff We know the shift amount is in range. 8579 // TODO: when is it worth doing SUB(BW, N2) as well? 8580 if (isPowerOf2_32(BitWidth)) { 8581 APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1); 8582 if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) 8583 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2); 8584 if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) 8585 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2); 8586 } 8587 8588 // fold (fshl N0, N0, N2) -> (rotl N0, N2) 8589 // fold (fshr N0, N0, N2) -> (rotr N0, N2) 8590 // TODO: Investigate flipping this rotate if only one is legal, if funnel shift 8591 // is legal as well we might be better off avoiding non-constant (BW - N2). 8592 unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR; 8593 if (N0 == N1 && hasOperation(RotOpc, VT)) 8594 return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2); 8595 8596 // Simplify, based on bits shifted out of N0/N1. 
8597 if (SimplifyDemandedBits(SDValue(N, 0))) 8598 return SDValue(N, 0); 8599 8600 return SDValue(); 8601 } 8602 8603 SDValue DAGCombiner::visitABS(SDNode *N) { 8604 SDValue N0 = N->getOperand(0); 8605 EVT VT = N->getValueType(0); 8606 8607 // fold (abs c1) -> c2 8608 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8609 return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); 8610 // fold (abs (abs x)) -> (abs x) 8611 if (N0.getOpcode() == ISD::ABS) 8612 return N0; 8613 // fold (abs x) -> x iff not-negative 8614 if (DAG.SignBitIsZero(N0)) 8615 return N0; 8616 return SDValue(); 8617 } 8618 8619 SDValue DAGCombiner::visitBSWAP(SDNode *N) { 8620 SDValue N0 = N->getOperand(0); 8621 EVT VT = N->getValueType(0); 8622 8623 // fold (bswap c1) -> c2 8624 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8625 return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); 8626 // fold (bswap (bswap x)) -> x 8627 if (N0.getOpcode() == ISD::BSWAP) 8628 return N0->getOperand(0); 8629 return SDValue(); 8630 } 8631 8632 SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { 8633 SDValue N0 = N->getOperand(0); 8634 EVT VT = N->getValueType(0); 8635 8636 // fold (bitreverse c1) -> c2 8637 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8638 return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); 8639 // fold (bitreverse (bitreverse x)) -> x 8640 if (N0.getOpcode() == ISD::BITREVERSE) 8641 return N0.getOperand(0); 8642 return SDValue(); 8643 } 8644 8645 SDValue DAGCombiner::visitCTLZ(SDNode *N) { 8646 SDValue N0 = N->getOperand(0); 8647 EVT VT = N->getValueType(0); 8648 8649 // fold (ctlz c1) -> c2 8650 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8651 return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0); 8652 8653 // If the value is known never to be zero, switch to the undef version. 
8654 if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) { 8655 if (DAG.isKnownNeverZero(N0)) 8656 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8657 } 8658 8659 return SDValue(); 8660 } 8661 8662 SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { 8663 SDValue N0 = N->getOperand(0); 8664 EVT VT = N->getValueType(0); 8665 8666 // fold (ctlz_zero_undef c1) -> c2 8667 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8668 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8669 return SDValue(); 8670 } 8671 8672 SDValue DAGCombiner::visitCTTZ(SDNode *N) { 8673 SDValue N0 = N->getOperand(0); 8674 EVT VT = N->getValueType(0); 8675 8676 // fold (cttz c1) -> c2 8677 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8678 return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); 8679 8680 // If the value is known never to be zero, switch to the undef version. 8681 if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) { 8682 if (DAG.isKnownNeverZero(N0)) 8683 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8684 } 8685 8686 return SDValue(); 8687 } 8688 8689 SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { 8690 SDValue N0 = N->getOperand(0); 8691 EVT VT = N->getValueType(0); 8692 8693 // fold (cttz_zero_undef c1) -> c2 8694 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8695 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); 8696 return SDValue(); 8697 } 8698 8699 SDValue DAGCombiner::visitCTPOP(SDNode *N) { 8700 SDValue N0 = N->getOperand(0); 8701 EVT VT = N->getValueType(0); 8702 8703 // fold (ctpop c1) -> c2 8704 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) 8705 return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); 8706 return SDValue(); 8707 } 8708 8709 // FIXME: This should be checking for no signed zeros on individual operands, as 8710 // well as no nans. 
/// Return true if replacing a floating-point select of LHS/RHS with an
/// FMINNUM/FMAXNUM-style node is allowed: requires no-signed-zeros FP math,
/// a target that considers the combine profitable for this type, and both
/// operands known never to be NaN.
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
                                         SDValue RHS,
                                         const TargetLowering &TLI) {
  const TargetOptions &Options = DAG.getTarget().Options;
  EVT VT = LHS.getValueType();

  return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
         TLI.isProfitableToCombineMinNumMaxNum(VT) &&
         DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
}

/// Generate Min/Max node from a select whose compare operands match its
/// true/false operands (possibly swapped). Returns an empty SDValue if the
/// operands do not match or no suitable min/max opcode is available.
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                   SDValue RHS, SDValue True, SDValue False,
                                   ISD::CondCode CC, const TargetLowering &TLI,
                                   SelectionDAG &DAG) {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  switch (CC) {
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETLT:
  case ISD::SETLE:
  case ISD::SETULT:
  case ISD::SETULE: {
    // Since it's known never nan to get here already, either fminnum or
    // fminnum_ieee are OK. Try the ieee version first, since fminnum is
    // expanded in terms of it.
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);

    unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
    return SDValue();
  }
  case ISD::SETOGT:
  case ISD::SETOGE:
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGT:
  case ISD::SETUGE: {
    // Greater-than compares select the max when LHS is the true operand.
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);

    unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
    return SDValue();
  }
  default:
    return SDValue();
  }
}

/// If a (v)select has a condition value that is a sign-bit test, try to smear
/// the condition operand sign-bit across the value width and use it as a mask.
static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue C1 = N->getOperand(1);
  SDValue C2 = N->getOperand(2);
  assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) &&
         "Expected select-of-constants");

  EVT VT = N->getValueType(0);
  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
      VT != Cond.getOperand(0).getValueType())
    return SDValue();

  // The inverted-condition + commuted-select variants of these patterns are
  // canonicalized to these forms in IR.
  SDValue X = Cond.getOperand(0);
  SDValue CondC = Cond.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
      isAllOnesOrAllOnesSplat(C2)) {
    // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
    SDLoc DL(N);
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
  }
  if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
    // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
    SDLoc DL(N);
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
  }
  return SDValue();
}

/// Simplify a scalar select whose true and false operands are both integer
/// constants, turning it into extends, adds, shifts, or logic ops.
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);

  if (!VT.isInteger())
    return SDValue();

  auto *C1 = dyn_cast<ConstantSDNode>(N1);
  auto *C2 = dyn_cast<ConstantSDNode>(N2);
  if (!C1 || !C2)
    return SDValue();

  // Only do this before legalization to avoid conflicting with target-specific
  // transforms in the other direction (create a select from a zext/sext). There
  // is also a target-independent combine here in DAGCombiner in the other
  // direction for (select Cond, -1, 0) when the condition is not i1.
  if (CondVT == MVT::i1 && !LegalOperations) {
    if (C1->isNullValue() && C2->isOne()) {
      // select Cond, 0, 1 --> zext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
      if (VT != MVT::i1)
        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
      return NotCond;
    }
    if (C1->isNullValue() && C2->isAllOnesValue()) {
      // select Cond, 0, -1 --> sext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
      if (VT != MVT::i1)
        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
      return NotCond;
    }
    if (C1->isOne() && C2->isNullValue()) {
      // select Cond, 1, 0 --> zext (Cond)
      if (VT != MVT::i1)
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
      return Cond;
    }
    if (C1->isAllOnesValue() && C2->isNullValue()) {
      // select Cond, -1, 0 --> sext (Cond)
      if (VT != MVT::i1)
        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
      return Cond;
    }

    // Use a target hook because some targets may prefer to transform in the
    // other direction.
    if (TLI.convertSelectOfConstantsToMath(VT)) {
      // For any constants that differ by 1, we can transform the select into an
      // extend and add.
      const APInt &C1Val = C1->getAPIntValue();
      const APInt &C2Val = C2->getAPIntValue();
      if (C1Val - 1 == C2Val) {
        // select Cond, C1, C1-1 --> add (zext Cond), C1-1
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      }
      if (C1Val + 1 == C2Val) {
        // select Cond, C1, C1+1 --> add (sext Cond), C1+1
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      }

      // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
      if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
        if (VT != MVT::i1)
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
        SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
        return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
      }

      if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
        return V;
    }

    return SDValue();
  }

  // fold (select Cond, 0, 1) -> (xor Cond, 1)
  // We can't do this reliably if integer based booleans have different contents
  // to floating point based booleans. This is because we can't tell whether we
  // have an integer-based boolean or a floating-point-based boolean unless we
  // can find the SETCC that produced it and inspect its operands. This is
  // fairly easy if C is the SETCC node, but it can potentially be
  // undiscoverable (or not reasonably discoverable). For example, it could be
  // in another basic block or it could require searching a complicated
  // expression.
  if (CondVT.isInteger() &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      C1->isNullValue() && C2->isOne()) {
    SDValue NotCond =
        DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
    if (VT.bitsEq(CondVT))
      return NotCond;
    return DAG.getZExtOrTrunc(NotCond, DL, VT);
  }

  return SDValue();
}

SDValue DAGCombiner::visitSELECT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT VT0 = N0.getValueType();
  SDLoc DL(N);
  SDNodeFlags Flags = N->getFlags();

  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
    return V;

  // fold (select X, X, Y) -> (or X, Y)
  // fold (select X, 1, Y) -> (or X, Y)
  if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
    return DAG.getNode(ISD::OR, DL, VT, N0, N2);

  if (SDValue V = foldSelectOfConstants(N))
    return V;

  // fold (select C, 0, X) -> (and (not C), X)
  if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
    AddToWorklist(NOTNode.getNode());
    return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
  }
  // fold (select C, X, 1) -> (or (not C), X)
  if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
    AddToWorklist(NOTNode.getNode());
    return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
  }
  // fold (select X, Y, X) -> (and X, Y)
  // fold (select X, Y, 0) -> (and X, Y)
  if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
    return DAG.getNode(ISD::AND, DL, VT, N0, N1);

  // If we can fold this based on the true/false value, do so.
  if (SimplifySelectOps(N, N1, N2))
    return SDValue(N, 0); // Don't revisit N.

  if (VT0 == MVT::i1) {
    // The code in this block deals with the following 2 equivalences:
    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
    // The target can specify its preferred form with the
    // shouldNormalizeToSelectSequence() callback. However we always transform
    // to the right anyway if we find the inner select exists in the DAG anyway
    // and we always transform to the left side if we know that we can further
    // optimize the combination of the conditions.
    bool normalizeToSequence =
        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
    // select (and Cond0, Cond1), X, Y
    //   -> select Cond0, (select Cond1, X, Y), Y
    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
      SDValue Cond0 = N0->getOperand(0);
      SDValue Cond1 = N0->getOperand(1);
      SDValue InnerSelect =
          DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
      if (normalizeToSequence || !InnerSelect.use_empty())
        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
                           InnerSelect, N2, Flags);
      // Cleanup on failure.
      if (InnerSelect.use_empty())
        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
    }
    // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
      SDValue Cond0 = N0->getOperand(0);
      SDValue Cond1 = N0->getOperand(1);
      SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
                                        Cond1, N1, N2, Flags);
      if (normalizeToSequence || !InnerSelect.use_empty())
        return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
                           InnerSelect, Flags);
      // Cleanup on failure.
      if (InnerSelect.use_empty())
        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
    }

    // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
    if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
      SDValue N1_0 = N1->getOperand(0);
      SDValue N1_1 = N1->getOperand(1);
      SDValue N1_2 = N1->getOperand(2);
      if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
        // Create the actual and node if we can generate good code for it.
        if (!normalizeToSequence) {
          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
                             N2, Flags);
        }
        // Otherwise see if we can optimize the "and" to a better pattern.
        if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
                             N2, Flags);
        }
      }
    }
    // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
    if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
      SDValue N2_0 = N2->getOperand(0);
      SDValue N2_1 = N2->getOperand(1);
      SDValue N2_2 = N2->getOperand(2);
      if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
        // Create the actual or node if we can generate good code for it.
        if (!normalizeToSequence) {
          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
                             N2_2, Flags);
        }
        // Otherwise see if we can optimize to a better pattern.
        if (SDValue Combined = visitORLike(N0, N2_0, N))
          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
                             N2_2, Flags);
      }
    }
  }

  // select (not Cond), N1, N2 -> select Cond, N2, N1
  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
    SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
    SelectOp->setFlags(Flags);
    return SelectOp;
  }

  // Fold selects based on a setcc into other things, such as min/max/abs.
  if (N0.getOpcode() == ISD::SETCC) {
    SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();

    // select (fcmp lt x, y), x, y -> fminnum x, y
    // select (fcmp gt x, y), x, y -> fmaxnum x, y
    //
    // This is OK if we don't care what happens if either operand is a NaN.
    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
      if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2,
                                                CC, TLI, DAG))
        return FMinMax;

    // Use 'unsigned add with overflow' to optimize an unsigned saturating add.
    // This is conservatively limited to pre-legal-operations to give targets
    // a chance to reverse the transform if they want to do that. Also, it is
    // unlikely that the pattern would be formed late, so it's probably not
    // worth going through the other checks.
    if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
        CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
        N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
      auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
      auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
      if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
        // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
        // uaddo Cond0, C; select uaddo.1, -1, uaddo.0
        //
        // The IR equivalent of this transform would have this form:
        //   %a = add %x, C
        //   %c = icmp ugt %x, ~C
        //   %r = select %c, -1, %a
        //   =>
        //   %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
        //   %u0 = extractvalue %u, 0
        //   %u1 = extractvalue %u, 1
        //   %r = select %u1, -1, %u0
        SDVTList VTs = DAG.getVTList(VT, VT0);
        SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
        return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
      }
    }

    if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
        (!LegalOperations &&
         TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
      // Any flags available in a select/setcc fold will be on the setcc as they
      // migrated from fcmp
      Flags = N0.getNode()->getFlags();
      SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
                                       N2, N0.getOperand(2));
      SelectNode->setFlags(Flags);
      return SelectNode;
    }

    return SimplifySelect(DL, N0, N1, N2);
  }

  return SDValue();
}

// This function assumes all the vselect's arguments are CONCAT_VECTOR
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = N->getValueType(0);
  int NumElems = VT.getVectorNumElements();
  assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
         RHS.getOpcode() == ISD::CONCAT_VECTORS &&
         Cond.getOpcode() == ISD::BUILD_VECTOR);

  // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
  // binary ones here.
  if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
    return SDValue();

  // We're sure we have an even number of elements due to the
  // concat_vectors we have as arguments to vselect.
  // Skip BV elements until we find one that's not an UNDEF
  // After we find an UNDEF element, keep looping until we get to half the
  // length of the BV and see if all the non-undef nodes are the same.
  ConstantSDNode *BottomHalf = nullptr;
  for (int i = 0; i < NumElems / 2; ++i) {
    if (Cond->getOperand(i)->isUndef())
      continue;

    if (BottomHalf == nullptr)
      BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
    else if (Cond->getOperand(i).getNode() != BottomHalf)
      return SDValue();
  }

  // Do the same for the second half of the BuildVector
  ConstantSDNode *TopHalf = nullptr;
  for (int i = NumElems / 2; i < NumElems; ++i) {
    if (Cond->getOperand(i)->isUndef())
      continue;

    if (TopHalf == nullptr)
      TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
    else if (Cond->getOperand(i).getNode() != TopHalf)
      return SDValue();
  }

  assert(TopHalf && BottomHalf &&
         "One half of the selector was all UNDEFs and the other was all the "
         "same value. This should have been addressed before this function.");
  // Each half of the condition selects wholesale from LHS or RHS, so the
  // vselect is just a concat of the chosen halves.
  return DAG.getNode(
      ISD::CONCAT_VECTORS, DL, VT,
      BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
      TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
}

SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
  SDValue Mask = MSC->getMask();
  SDValue Chain = MSC->getChain();
  SDLoc DL(N);

  // Zap scatters with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  return SDValue();
}

SDValue DAGCombiner::visitMSTORE(SDNode *N) {
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Mask = MST->getMask();
  SDValue Chain = MST->getChain();
  SDLoc DL(N);

  // Zap masked stores with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  // Try transforming N to an indexed store.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}

SDValue DAGCombiner::visitMGATHER(SDNode *N) {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
  SDValue Mask = MGT->getMask();
  SDLoc DL(N);

  // Zap gathers with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MGT->getPassThru(), MGT->getChain());

  return SDValue();
}

SDValue DAGCombiner::visitMLOAD(SDNode *N) {
  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
  SDValue Mask = MLD->getMask();
  SDLoc DL(N);

  // Zap masked loads with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MLD->getPassThru(), MLD->getChain());

  // Try transforming N to an indexed load.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}

/// A vector select of 2 constant vectors can be simplified to math/logic to
/// avoid a variable select instruction and possibly avoid constant loads.
SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
      !TLI.convertSelectOfConstantsToMath(VT) ||
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
      !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
    return SDValue();

  // Check if we can use the condition value to increment/decrement a single
  // constant value. This simplifies a select to an add and removes a constant
  // load/materialization from the general case.
  bool AllAddOne = true;
  bool AllSubOne = true;
  unsigned Elts = VT.getVectorNumElements();
  for (unsigned i = 0; i != Elts; ++i) {
    SDValue N1Elt = N1.getOperand(i);
    SDValue N2Elt = N2.getOperand(i);
    if (N1Elt.isUndef() || N2Elt.isUndef())
      continue;
    if (N1Elt.getValueType() != N2Elt.getValueType())
      continue;

    const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
    const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
    if (C1 != C2 + 1)
      AllAddOne = false;
    if (C1 != C2 - 1)
      AllSubOne = false;
  }

  // Further simplifications for the extra-special cases where the constants are
  // all 0 or all -1 should be implemented as folds of these patterns.
  SDLoc DL(N);
  if (AllAddOne || AllSubOne) {
    // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
    // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
    auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
    SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
    return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
  }

  // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
  APInt Pow2C;
  if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
      isNullOrNullSplat(N2)) {
    SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
    SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
    return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
  }

  if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
    return V;

  // The general case for select-of-constants:
  // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
  // ...but that only makes sense if a vselect is slower than 2 logic ops, so
  // leave that to a machine-specific pass.
  return SDValue();
}

SDValue DAGCombiner::visitVSELECT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (SDValue V = DAG.simplifySelect(N0, N1, N2))
    return V;

  // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
    return DAG.getSelect(DL, VT, F, N2, N1);

  // Canonicalize integer abs.
  // vselect (setg[te] X,  0),  X, -X ->
  // vselect (setgt    X, -1),  X, -X ->
  // vselect (setl[te] X,  0), -X,  X ->
  // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
  if (N0.getOpcode() == ISD::SETCC) {
    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
    bool isAbs = false;
    bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
         (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
        N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
      isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
    else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
             N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
      isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());

    if (isAbs) {
      if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
        return DAG.getNode(ISD::ABS, DL, VT, LHS);

      // Expand abs manually: smear the sign bit, then add-and-xor with it.
      SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
                                  DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                                  DL, getShiftAmountTy(VT)));
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
      AddToWorklist(Shift.getNode());
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
    }

    // vselect x, y (fcmp lt x, y) -> fminnum x, y
    // vselect x, y (fcmp gt x, y) -> fmaxnum x, y
    //
    // This is OK if we don't care about what happens if either operand is a
    // NaN.
    //
    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
      if (SDValue FMinMax =
              combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
        return FMinMax;
    }

    // If this select has a condition (setcc) with narrower operands than the
    // select, try to widen the compare to match the select width.
    // TODO: This should be extended to handle any constant.
    // TODO: This could be extended to handle non-loading patterns, but that
    //       requires thorough testing to avoid regressions.
    if (isNullOrNullSplat(RHS)) {
      EVT NarrowVT = LHS.getValueType();
      EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
      EVT SetCCVT = getSetCCResultType(LHS.getValueType());
      unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
      unsigned WideWidth = WideVT.getScalarSizeInBits();
      bool IsSigned = isSignedIntSetCC(CC);
      auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
          SetCCWidth != 1 && SetCCWidth < WideWidth &&
          TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
          TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
        // Both compare operands can be widened for free. The LHS can use an
        // extended load, and the RHS is a constant:
        //   vselect (ext (setcc load(X), C)), N1, N2 -->
        //   vselect (setcc extload(X), C'), N1, N2
        auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
        SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
        SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
        EVT WideSetCCVT = getSetCCResultType(WideVT);
        SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
        return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
      }
    }
  }

  if (SimplifySelectOps(N, N1, N2))
    return SDValue(N, 0);  // Don't revisit N.

  // Fold (vselect (build_vector all_ones), N1, N2) -> N1
  if (ISD::isBuildVectorAllOnes(N0.getNode()))
    return N1;
  // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return N2;

  // The ConvertSelectToConcatVector function is assuming both the above
  // checks for (vselect (build_vector all{ones,zeros) ...)
have been made 9381 // and addressed. 9382 if (N1.getOpcode() == ISD::CONCAT_VECTORS && 9383 N2.getOpcode() == ISD::CONCAT_VECTORS && 9384 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { 9385 if (SDValue CV = ConvertSelectToConcatVector(N, DAG)) 9386 return CV; 9387 } 9388 9389 if (SDValue V = foldVSelectOfConstants(N)) 9390 return V; 9391 9392 return SDValue(); 9393 } 9394 9395 SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { 9396 SDValue N0 = N->getOperand(0); 9397 SDValue N1 = N->getOperand(1); 9398 SDValue N2 = N->getOperand(2); 9399 SDValue N3 = N->getOperand(3); 9400 SDValue N4 = N->getOperand(4); 9401 ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get(); 9402 9403 // fold select_cc lhs, rhs, x, x, cc -> x 9404 if (N2 == N3) 9405 return N2; 9406 9407 // Determine if the condition we're dealing with is constant 9408 if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, 9409 CC, SDLoc(N), false)) { 9410 AddToWorklist(SCC.getNode()); 9411 9412 if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) { 9413 if (!SCCC->isNullValue()) 9414 return N2; // cond always true -> true val 9415 else 9416 return N3; // cond always false -> false val 9417 } else if (SCC->isUndef()) { 9418 // When the condition is UNDEF, just return the first operand. This is 9419 // coherent the DAG creation, no setcc node is created in this case 9420 return N2; 9421 } else if (SCC.getOpcode() == ISD::SETCC) { 9422 // Fold to a simpler select_cc 9423 SDValue SelectOp = DAG.getNode( 9424 ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), 9425 SCC.getOperand(1), N2, N3, SCC.getOperand(2)); 9426 SelectOp->setFlags(SCC->getFlags()); 9427 return SelectOp; 9428 } 9429 } 9430 9431 // If we can fold this based on the true/false value, do so. 9432 if (SimplifySelectOps(N, N2, N3)) 9433 return SDValue(N, 0); // Don't revisit N. 
9434 9435 // fold select_cc into other things, such as min/max/abs 9436 return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC); 9437 } 9438 9439 SDValue DAGCombiner::visitSETCC(SDNode *N) { 9440 // setcc is very commonly used as an argument to brcond. This pattern 9441 // also lend itself to numerous combines and, as a result, it is desired 9442 // we keep the argument to a brcond as a setcc as much as possible. 9443 bool PreferSetCC = 9444 N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND; 9445 9446 SDValue Combined = SimplifySetCC( 9447 N->getValueType(0), N->getOperand(0), N->getOperand(1), 9448 cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC); 9449 9450 if (!Combined) 9451 return SDValue(); 9452 9453 // If we prefer to have a setcc, and we don't, we'll try our best to 9454 // recreate one using rebuildSetCC. 9455 if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) { 9456 SDValue NewSetCC = rebuildSetCC(Combined); 9457 9458 // We don't have anything interesting to combine to. 9459 if (NewSetCC.getNode() == N) 9460 return SDValue(); 9461 9462 if (NewSetCC) 9463 return NewSetCC; 9464 } 9465 9466 return Combined; 9467 } 9468 9469 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { 9470 SDValue LHS = N->getOperand(0); 9471 SDValue RHS = N->getOperand(1); 9472 SDValue Carry = N->getOperand(2); 9473 SDValue Cond = N->getOperand(3); 9474 9475 // If Carry is false, fold to a regular SETCC. 9476 if (isNullConstant(Carry)) 9477 return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); 9478 9479 return SDValue(); 9480 } 9481 9482 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or 9483 /// a build_vector of constants. 9484 /// This function is called by the DAGCombiner when visiting sext/zext/aext 9485 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). 9486 /// Vector extends are not folded if operations are legal; this is to 9487 /// avoid introducing illegal build_vector dag nodes. 
9488 static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, 9489 SelectionDAG &DAG, bool LegalTypes) { 9490 unsigned Opcode = N->getOpcode(); 9491 SDValue N0 = N->getOperand(0); 9492 EVT VT = N->getValueType(0); 9493 SDLoc DL(N); 9494 9495 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || 9496 Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || 9497 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) 9498 && "Expected EXTEND dag node in input!"); 9499 9500 // fold (sext c1) -> c1 9501 // fold (zext c1) -> c1 9502 // fold (aext c1) -> c1 9503 if (isa<ConstantSDNode>(N0)) 9504 return DAG.getNode(Opcode, DL, VT, N0); 9505 9506 // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) 9507 // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2) 9508 // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) 9509 if (N0->getOpcode() == ISD::SELECT) { 9510 SDValue Op1 = N0->getOperand(1); 9511 SDValue Op2 = N0->getOperand(2); 9512 if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) && 9513 (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) { 9514 // For any_extend, choose sign extension of the constants to allow a 9515 // possible further transform to sign_extend_inreg.i.e. 
9516 // 9517 // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0> 9518 // t2: i64 = any_extend t1 9519 // --> 9520 // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0> 9521 // --> 9522 // t4: i64 = sign_extend_inreg t3 9523 unsigned FoldOpc = Opcode; 9524 if (FoldOpc == ISD::ANY_EXTEND) 9525 FoldOpc = ISD::SIGN_EXTEND; 9526 return DAG.getSelect(DL, VT, N0->getOperand(0), 9527 DAG.getNode(FoldOpc, DL, VT, Op1), 9528 DAG.getNode(FoldOpc, DL, VT, Op2)); 9529 } 9530 } 9531 9532 // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) 9533 // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) 9534 // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) 9535 EVT SVT = VT.getScalarType(); 9536 if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) && 9537 ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) 9538 return SDValue(); 9539 9540 // We can fold this node into a build_vector. 9541 unsigned VTBits = SVT.getSizeInBits(); 9542 unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits(); 9543 SmallVector<SDValue, 8> Elts; 9544 unsigned NumElts = VT.getVectorNumElements(); 9545 9546 // For zero-extensions, UNDEF elements still guarantee to have the upper 9547 // bits set to zero. 9548 bool IsZext = 9549 Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG; 9550 9551 for (unsigned i = 0; i != NumElts; ++i) { 9552 SDValue Op = N0.getOperand(i); 9553 if (Op.isUndef()) { 9554 Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT)); 9555 continue; 9556 } 9557 9558 SDLoc DL(Op); 9559 // Get the constant value and if needed trunc it to the size of the type. 9560 // Nodes like build_vector might have constants wider than the scalar type. 
9561 APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits); 9562 if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) 9563 Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT)); 9564 else 9565 Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT)); 9566 } 9567 9568 return DAG.getBuildVector(VT, DL, Elts); 9569 } 9570 9571 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: 9572 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))" 9573 // transformation. Returns true if extension are possible and the above 9574 // mentioned transformation is profitable. 9575 static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0, 9576 unsigned ExtOpc, 9577 SmallVectorImpl<SDNode *> &ExtendNodes, 9578 const TargetLowering &TLI) { 9579 bool HasCopyToRegUses = false; 9580 bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType()); 9581 for (SDNode::use_iterator UI = N0.getNode()->use_begin(), 9582 UE = N0.getNode()->use_end(); 9583 UI != UE; ++UI) { 9584 SDNode *User = *UI; 9585 if (User == N) 9586 continue; 9587 if (UI.getUse().getResNo() != N0.getResNo()) 9588 continue; 9589 // FIXME: Only extend SETCC N, N and SETCC N, c for now. 9590 if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) { 9591 ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get(); 9592 if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC)) 9593 // Sign bits will be lost after a zext. 9594 return false; 9595 bool Add = false; 9596 for (unsigned i = 0; i != 2; ++i) { 9597 SDValue UseOp = User->getOperand(i); 9598 if (UseOp == N0) 9599 continue; 9600 if (!isa<ConstantSDNode>(UseOp)) 9601 return false; 9602 Add = true; 9603 } 9604 if (Add) 9605 ExtendNodes.push_back(User); 9606 continue; 9607 } 9608 // If truncates aren't free and there are users we can't 9609 // extend, it isn't worthwhile. 9610 if (!isTruncFree) 9611 return false; 9612 // Remember if this value is live-out. 
9613 if (User->getOpcode() == ISD::CopyToReg) 9614 HasCopyToRegUses = true; 9615 } 9616 9617 if (HasCopyToRegUses) { 9618 bool BothLiveOut = false; 9619 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 9620 UI != UE; ++UI) { 9621 SDUse &Use = UI.getUse(); 9622 if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) { 9623 BothLiveOut = true; 9624 break; 9625 } 9626 } 9627 if (BothLiveOut) 9628 // Both unextended and extended values are live out. There had better be 9629 // a good reason for the transformation. 9630 return ExtendNodes.size(); 9631 } 9632 return true; 9633 } 9634 9635 void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, 9636 SDValue OrigLoad, SDValue ExtLoad, 9637 ISD::NodeType ExtType) { 9638 // Extend SetCC uses if necessary. 9639 SDLoc DL(ExtLoad); 9640 for (SDNode *SetCC : SetCCs) { 9641 SmallVector<SDValue, 4> Ops; 9642 9643 for (unsigned j = 0; j != 2; ++j) { 9644 SDValue SOp = SetCC->getOperand(j); 9645 if (SOp == OrigLoad) 9646 Ops.push_back(ExtLoad); 9647 else 9648 Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); 9649 } 9650 9651 Ops.push_back(SetCC->getOperand(2)); 9652 CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); 9653 } 9654 } 9655 9656 // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?). 9657 SDValue DAGCombiner::CombineExtLoad(SDNode *N) { 9658 SDValue N0 = N->getOperand(0); 9659 EVT DstVT = N->getValueType(0); 9660 EVT SrcVT = N0.getValueType(); 9661 9662 assert((N->getOpcode() == ISD::SIGN_EXTEND || 9663 N->getOpcode() == ISD::ZERO_EXTEND) && 9664 "Unexpected node type (not an extend)!"); 9665 9666 // fold (sext (load x)) to multiple smaller sextloads; same for zext. 
9667 // For example, on a target with legal v4i32, but illegal v8i32, turn: 9668 // (v8i32 (sext (v8i16 (load x)))) 9669 // into: 9670 // (v8i32 (concat_vectors (v4i32 (sextload x)), 9671 // (v4i32 (sextload (x + 16))))) 9672 // Where uses of the original load, i.e.: 9673 // (v8i16 (load x)) 9674 // are replaced with: 9675 // (v8i16 (truncate 9676 // (v8i32 (concat_vectors (v4i32 (sextload x)), 9677 // (v4i32 (sextload (x + 16))))))) 9678 // 9679 // This combine is only applicable to illegal, but splittable, vectors. 9680 // All legal types, and illegal non-vector types, are handled elsewhere. 9681 // This combine is controlled by TargetLowering::isVectorLoadExtDesirable. 9682 // 9683 if (N0->getOpcode() != ISD::LOAD) 9684 return SDValue(); 9685 9686 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 9687 9688 if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || 9689 !N0.hasOneUse() || !LN0->isSimple() || 9690 !DstVT.isVector() || !DstVT.isPow2VectorType() || 9691 !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) 9692 return SDValue(); 9693 9694 SmallVector<SDNode *, 4> SetCCs; 9695 if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI)) 9696 return SDValue(); 9697 9698 ISD::LoadExtType ExtType = 9699 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 9700 9701 // Try to split the vector types to get down to legal types. 
9702 EVT SplitSrcVT = SrcVT; 9703 EVT SplitDstVT = DstVT; 9704 while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && 9705 SplitSrcVT.getVectorNumElements() > 1) { 9706 SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; 9707 SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; 9708 } 9709 9710 if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) 9711 return SDValue(); 9712 9713 assert(!DstVT.isScalableVector() && "Unexpected scalable vector type"); 9714 9715 SDLoc DL(N); 9716 const unsigned NumSplits = 9717 DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); 9718 const unsigned Stride = SplitSrcVT.getStoreSize(); 9719 SmallVector<SDValue, 4> Loads; 9720 SmallVector<SDValue, 4> Chains; 9721 9722 SDValue BasePtr = LN0->getBasePtr(); 9723 for (unsigned Idx = 0; Idx < NumSplits; Idx++) { 9724 const unsigned Offset = Idx * Stride; 9725 const unsigned Align = MinAlign(LN0->getAlignment(), Offset); 9726 9727 SDValue SplitLoad = DAG.getExtLoad( 9728 ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr, 9729 LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, 9730 LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); 9731 9732 BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL); 9733 9734 Loads.push_back(SplitLoad.getValue(0)); 9735 Chains.push_back(SplitLoad.getValue(1)); 9736 } 9737 9738 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 9739 SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); 9740 9741 // Simplify TF. 9742 AddToWorklist(NewChain.getNode()); 9743 9744 CombineTo(N, NewValue); 9745 9746 // Replace uses of the original load (before extension) 9747 // with a truncate of the concatenated sextloaded vectors. 
9748 SDValue Trunc = 9749 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue); 9750 ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode()); 9751 CombineTo(N0.getNode(), Trunc, NewChain); 9752 return SDValue(N, 0); // Return N so it doesn't get rechecked! 9753 } 9754 9755 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) -> 9756 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst)) 9757 SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { 9758 assert(N->getOpcode() == ISD::ZERO_EXTEND); 9759 EVT VT = N->getValueType(0); 9760 EVT OrigVT = N->getOperand(0).getValueType(); 9761 if (TLI.isZExtFree(OrigVT, VT)) 9762 return SDValue(); 9763 9764 // and/or/xor 9765 SDValue N0 = N->getOperand(0); 9766 if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || 9767 N0.getOpcode() == ISD::XOR) || 9768 N0.getOperand(1).getOpcode() != ISD::Constant || 9769 (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))) 9770 return SDValue(); 9771 9772 // shl/shr 9773 SDValue N1 = N0->getOperand(0); 9774 if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) || 9775 N1.getOperand(1).getOpcode() != ISD::Constant || 9776 (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT))) 9777 return SDValue(); 9778 9779 // load 9780 if (!isa<LoadSDNode>(N1.getOperand(0))) 9781 return SDValue(); 9782 LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0)); 9783 EVT MemVT = Load->getMemoryVT(); 9784 if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) || 9785 Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed()) 9786 return SDValue(); 9787 9788 9789 // If the shift op is SHL, the logic op must be AND, otherwise the result 9790 // will be wrong. 
9791 if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND) 9792 return SDValue(); 9793 9794 if (!N0.hasOneUse() || !N1.hasOneUse()) 9795 return SDValue(); 9796 9797 SmallVector<SDNode*, 4> SetCCs; 9798 if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0), 9799 ISD::ZERO_EXTEND, SetCCs, TLI)) 9800 return SDValue(); 9801 9802 // Actually do the transformation. 9803 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT, 9804 Load->getChain(), Load->getBasePtr(), 9805 Load->getMemoryVT(), Load->getMemOperand()); 9806 9807 SDLoc DL1(N1); 9808 SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad, 9809 N1.getOperand(1)); 9810 9811 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); 9812 SDLoc DL0(N0); 9813 SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift, 9814 DAG.getConstant(Mask, DL0, VT)); 9815 9816 ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); 9817 CombineTo(N, And); 9818 if (SDValue(Load, 0).hasOneUse()) { 9819 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1)); 9820 } else { 9821 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load), 9822 Load->getValueType(0), ExtLoad); 9823 CombineTo(Load, Trunc, ExtLoad.getValue(1)); 9824 } 9825 9826 // N0 is dead at this point. 9827 recursivelyDeleteUnusedNodes(N0.getNode()); 9828 9829 return SDValue(N,0); // Return N so it doesn't get rechecked! 9830 } 9831 9832 /// If we're narrowing or widening the result of a vector select and the final 9833 /// size is the same size as a setcc (compare) feeding the select, then try to 9834 /// apply the cast operation to the select's operands because matching vector 9835 /// sizes for a select condition and other operands should be more efficient. 
SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
  unsigned CastOpcode = Cast->getOpcode();
  assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
          CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
          CastOpcode == ISD::FP_ROUND) &&
         "Unexpected opcode for vector select narrowing/widening");

  // We only do this transform before legal ops because the pattern may be
  // obfuscated by target-specific operations after legalization. Do not create
  // an illegal select op, however, because that may be difficult to lower.
  EVT VT = Cast->getValueType(0);
  if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();

  SDValue VSel = Cast->getOperand(0);
  if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
      VSel.getOperand(0).getOpcode() != ISD::SETCC)
    return SDValue();

  // Does the setcc have the same vector size as the casted select?
  SDValue SetCC = VSel.getOperand(0);
  EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
  if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
    return SDValue();

  // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
  SDValue A = VSel.getOperand(1);
  SDValue B = VSel.getOperand(2);
  SDValue CastA, CastB;
  SDLoc DL(Cast);
  if (CastOpcode == ISD::FP_ROUND) {
    // FP_ROUND (fptrunc) has an extra flag operand to pass along.
    CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
    CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
  } else {
    CastA = DAG.getNode(CastOpcode, DL, VT, A);
    CastB = DAG.getNode(CastOpcode, DL, VT, B);
  }
  return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
}

// fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// fold ([s|z]ext (     extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
                                     const TargetLowering &TLI, EVT VT,
                                     bool LegalOperations, SDNode *N,
                                     SDValue N0, ISD::LoadExtType ExtLoadType) {
  SDNode *N0Node = N0.getNode();
  bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
                                                   : ISD::isZEXTLoad(N0Node);
  if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
      !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  EVT MemVT = LN0->getMemoryVT();
  if ((LegalOperations || !LN0->isSimple() ||
       VT.isVector()) &&
      !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
    return SDValue();

  SDValue ExtLoad =
      DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
                     LN0->getBasePtr(), MemVT, LN0->getMemOperand());
  Combiner.CombineTo(N, ExtLoad);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
  if (LN0->use_empty())
    Combiner.recursivelyDeleteUnusedNodes(LN0);
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

// fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// Only generate vector extloads when 1) they're legal, and 2) they are
// deemed desirable by the target.
static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
                                  const TargetLowering &TLI, EVT VT,
                                  bool LegalOperations, SDNode *N, SDValue N0,
                                  ISD::LoadExtType ExtLoadType,
                                  ISD::NodeType ExtOpc) {
  if (!ISD::isNON_EXTLoad(N0.getNode()) ||
      !ISD::isUNINDEXEDLoad(N0.getNode()) ||
      ((LegalOperations || VT.isVector() ||
        !cast<LoadSDNode>(N0)->isSimple()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
    return {};

  bool DoXform = true;
  SmallVector<SDNode *, 4> SetCCs;
  if (!N0.hasOneUse())
    DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
  if (VT.isVector())
    DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
  if (!DoXform)
    return {};

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
                                   LN0->getBasePtr(), N0.getValueType(),
                                   LN0->getMemOperand());
  Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
  // If the load value is used only by N, replace it via CombineTo N.
  bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
  Combiner.CombineTo(N, ExtLoad);
  if (NoReplaceTrunc) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
    Combiner.recursivelyDeleteUnusedNodes(LN0);
  } else {
    SDValue Trunc =
        DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
    Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
  }
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

// fold ([s|z]ext (masked_load x)) -> ([s|z]ext-masked_load x) when the
// extending load is legal and the target deems the vector extload desirable.
static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
                                        const TargetLowering &TLI, EVT VT,
                                        SDNode *N, SDValue N0,
                                        ISD::LoadExtType ExtLoadType,
                                        ISD::NodeType ExtOpc) {
  if (!N0.hasOneUse())
    return SDValue();

  MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
  if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();

  if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
    return SDValue();

  if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
    return SDValue();

  SDLoc dl(Ld);
  // The pass-through value must be extended the same way as the loaded value.
  SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
      PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
      ExtLoadType, Ld->isExpandingLoad());
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
  return NewLoad;
}

// Fold extends of i1 sign-bit tests into a not+shift of the tested value:
// [s|z]ext i1 (setgt iN X, -1) --> shift (not X) by N-1 (see below).
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
                                       bool LegalOperations) {
  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");

  SDValue SetCC = N->getOperand(0);
  if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
      !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
    return SDValue();

  SDValue X = SetCC.getOperand(0);
  SDValue Ones = SetCC.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  EVT VT = N->getValueType(0);
  EVT XVT = X.getValueType();
  // setge X, C is canonicalized to setgt, so we do not need to match that
  // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
  // not require the 'not' op.
  if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
    // Invert and smear/shift the sign bit:
    // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
    // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
    SDLoc DL(N);
    unsigned ShCt = VT.getSizeInBits() - 1;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
      SDValue NotX = DAG.getNOT(DL, X, VT);
      SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
      auto ShiftOpcode =
        N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
      return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
    }
  }
  return SDValue();
}

/// Main combine entry point for ISD::SIGN_EXTEND nodes: tries constant
/// folding, extend-of-extend collapsing, trunc/sext elimination, sextload
/// formation, setcc-based folds, and sext->zext when the sign bit is zero.
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (sext (sext x)) -> (sext x)
  // fold (sext (aext x)) -> (sext x)
  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));

  if (N0.getOpcode() == ISD::TRUNCATE) {
    // fold (sext (truncate (load x))) -> (sext (smaller load x))
    // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
    }

    // See if the value being truncated is already sign extended.  If so, just
    // eliminate the trunc/sext pair.
    SDValue Op = N0.getOperand(0);
    unsigned OpBits   = Op.getScalarValueSizeInBits();
    unsigned MidBits  = N0.getScalarValueSizeInBits();
    unsigned DestBits = VT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op);

    if (OpBits == DestBits) {
      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
      // bits, it is already ready.
      if (NumSignBits > DestBits-MidBits)
        return Op;
    } else if (OpBits < DestBits) {
      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
      // bits, just sext from i32.
      if (NumSignBits > OpBits-MidBits)
        return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
    } else {
      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
      // bits, just truncate to i32.
      if (NumSignBits > OpBits-MidBits)
        return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }

    // fold (sext (truncate x)) -> (sextinreg x).
    if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
                                                 N0.getValueType())) {
      if (OpBits < DestBits)
        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
      else if (OpBits > DestBits)
        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                         DAG.getValueType(N0.getValueType()));
    }
  }

  // Try to simplify (sext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::SEXTLOAD, ISD::SIGN_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
                                   ISD::SIGN_EXTEND))
    return foldedExt;

  // fold (sext (load x)) to multiple smaller sextloads.
  // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;

  // Try to simplify (sext (sextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
    return foldedExt;

  // fold (sext (and/or/xor (load x), cst)) ->
  //      (and/or/xor (sextload x), (sext cst))
  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
       N0.getOpcode() == ISD::XOR) &&
      isa<LoadSDNode>(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
    LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
    EVT MemVT = LN00->getMemoryVT();
    if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
        LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
      SmallVector<SDNode*, 4> SetCCs;
      bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                             ISD::SIGN_EXTEND, SetCCs, TLI);
      if (DoXform) {
        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
                                         LN00->getChain(), LN00->getBasePtr(),
                                         LN00->getMemoryVT(),
                                         LN00->getMemOperand());
        APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
        SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                  ExtLoad, DAG.getConstant(Mask, DL, VT));
        ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
        bool NoReplaceTruncAnd = !N0.hasOneUse();
        bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        CombineTo(N, And);
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N,0); // Return N so it doesn't get rechecked!
      }
    }
  }

  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;

  if (N0.getOpcode() == ISD::SETCC) {
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
    EVT N00VT = N0.getOperand(0).getValueType();

    // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
    // Only do this before legalize for now.
    if (VT.isVector() && !LegalOperations &&
        TLI.getBooleanContents(N00VT) ==
            TargetLowering::ZeroOrNegativeOneBooleanContent) {
      // On some architectures (such as SSE/NEON/etc) the SETCC result type is
      // of the same size as the compared operands. Only optimize sext(setcc())
      // if this is the case.
      EVT SVT = getSetCCResultType(N00VT);

      // If we already have the desired type, don't change it.
      if (SVT != N0.getValueType()) {
        // We know that the # elements of the results is the same as the
        // # elements of the compare (and the # elements of the compare result
        // for that matter). Check to see that they are the same size. If so,
        // we know that the element size of the sext'd result matches the
        // element size of the compare operands.
        if (VT.getSizeInBits() == SVT.getSizeInBits())
          return DAG.getSetCC(DL, VT, N00, N01, CC);

        // If the desired elements are smaller or larger than the source
        // elements, we can use a matching integer vector type and then
        // truncate/sign extend.
        EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
        if (SVT == MatchingVecType) {
          SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
          return DAG.getSExtOrTrunc(VsetCC, DL, VT);
        }
      }
    }

    // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
    // Here, T can be 1 or -1, depending on the type of the setcc and
    // getBooleanContents().
    unsigned SetCCWidth = N0.getScalarValueSizeInBits();

    // To determine the "true" side of the select, we need to know the high bit
    // of the value returned by the setcc if it evaluates to true.
    // If the type of the setcc is i1, then the true case of the select is just
    // sext(i1 1), that is, -1.
    // If the type of the setcc is larger (say, i8) then the value of the high
    // bit depends on getBooleanContents(), so ask TLI for a real "true" value
    // of the appropriate width.
    SDValue ExtTrueVal = (SetCCWidth == 1)
                             ? DAG.getAllOnesConstant(DL, VT)
                             : DAG.getBoolConstant(true, DL, VT, N00VT);
    SDValue Zero = DAG.getConstant(0, DL, VT);
    if (SDValue SCC =
            SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
      return SCC;

    if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
      EVT SetCCVT = getSetCCResultType(N00VT);
      // Don't do this transform for i1 because there's a select transform
      // that would reverse it.
      // TODO: We should not do this transform at all without a target hook
      // because a sext is likely cheaper than a select?
      if (SetCCVT.getScalarSizeInBits() != 1 &&
          (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
        SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
      }
    }
  }

  // fold (sext x) -> (zext x) if the sign bit is known zero.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
      DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  // Eliminate this sign extend by doing a negation in the destination type:
  // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
      isNullOrNullSplat(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
      TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
    SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
  }
  // Eliminate this sign extend by doing a decrement in the destination type:
  // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
      isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
      TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
    SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
    return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
  }

  return SDValue();
}

// isTruncateOf - If N is a truncate of some other value, return true, record
// the value being truncated in Op and which of Op's bits are zero/one in Known.
10248 // This function computes KnownBits to avoid a duplicated call to 10249 // computeKnownBits in the caller. 10250 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, 10251 KnownBits &Known) { 10252 if (N->getOpcode() == ISD::TRUNCATE) { 10253 Op = N->getOperand(0); 10254 Known = DAG.computeKnownBits(Op); 10255 return true; 10256 } 10257 10258 if (N.getOpcode() != ISD::SETCC || 10259 N.getValueType().getScalarType() != MVT::i1 || 10260 cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE) 10261 return false; 10262 10263 SDValue Op0 = N->getOperand(0); 10264 SDValue Op1 = N->getOperand(1); 10265 assert(Op0.getValueType() == Op1.getValueType()); 10266 10267 if (isNullOrNullSplat(Op0)) 10268 Op = Op1; 10269 else if (isNullOrNullSplat(Op1)) 10270 Op = Op0; 10271 else 10272 return false; 10273 10274 Known = DAG.computeKnownBits(Op); 10275 10276 return (Known.Zero | 1).isAllOnesValue(); 10277 } 10278 10279 /// Given an extending node with a pop-count operand, if the target does not 10280 /// support a pop-count in the narrow source type but does support it in the 10281 /// destination type, widen the pop-count to the destination type. 
static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
  assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
          Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");

  SDValue CtPop = Extend->getOperand(0);
  if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
    return SDValue();

  // Only profitable when the narrow CTPOP is NOT supported but the wide one
  // is.
  EVT VT = Extend->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
      !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
    return SDValue();

  // zext (ctpop X) --> ctpop (zext X)
  SDLoc DL(Extend);
  SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
  return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
}

/// Try to simplify a ZERO_EXTEND node, e.g. by merging it with a nested
/// extend or truncate, turning it into a zextload, or folding it through a
/// setcc/shift operand. Returns the replacement value, or a null SDValue if
/// no combine applies.
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (zext (zext x)) -> (zext x)
  // fold (zext (aext x)) -> (zext x)
  if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
                       N0.getOperand(0));

  // fold (zext (truncate x)) -> (zext x) or
  // (zext (truncate x)) -> (truncate x)
  // This is valid when the truncated bits of x are already zero.
  SDValue Op;
  KnownBits Known;
  if (isTruncateOf(DAG, N0, Op, Known)) {
    // TruncatedBits = the bits that the truncate discards (and that a
    // subsequent zext would have to zero); empty if trunc is a no-op width.
    APInt TruncatedBits =
      (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
      APInt(Op.getScalarValueSizeInBits(), 0) :
      APInt::getBitsSet(Op.getScalarValueSizeInBits(),
                        N0.getScalarValueSizeInBits(),
                        std::min(Op.getScalarValueSizeInBits(),
                                 VT.getScalarSizeInBits()));
    if (TruncatedBits.isSubsetOf(Known.Zero))
      return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
  }

  // fold (zext (truncate x)) -> (and x, mask)
  if (N0.getOpcode() == ISD::TRUNCATE) {
    // fold (zext (truncate (load x))) -> (zext (smaller load x))
    // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }

    EVT SrcVT = N0.getOperand(0).getValueType();
    EVT MinVT = N0.getValueType();

    // Try to mask before the extension to avoid having to generate a larger mask,
    // possibly over several sub-vectors.
    if (SrcVT.bitsLT(VT) && VT.isVector()) {
      if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
                               TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
        SDValue Op = N0.getOperand(0);
        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
        AddToWorklist(Op.getNode());
        SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
        // Transfer the debug info; the new node is equivalent to N0.
        DAG.transferDbgValues(N0, ZExtOrTrunc);
        return ZExtOrTrunc;
      }
    }

    if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
      SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
      AddToWorklist(Op.getNode());
      SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
      // We may safely transfer the debug info describing the truncate node over
      // to the equivalent and operation.
      DAG.transferDbgValues(N0, And);
      return And;
    }
  }

  // Fold (zext (and (trunc x), cst)) -> (and x, cst),
  // if either of the casts is not free.
  if (N0.getOpcode() == ISD::AND &&
      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                           N0.getValueType()) ||
       !TLI.isZExtFree(N0.getValueType(), VT))) {
    SDValue X = N0.getOperand(0).getOperand(0);
    X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
    APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
    SDLoc DL(N);
    return DAG.getNode(ISD::AND, DL, VT,
                       X, DAG.getConstant(Mask, DL, VT));
  }

  // Try to simplify (zext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
                                   ISD::ZERO_EXTEND))
    return foldedExt;

  // fold (zext (load x)) to multiple smaller zextloads.
  // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;

  // fold (zext (and/or/xor (load x), cst)) ->
  // (and/or/xor (zextload x), (zext cst))
  // Unless (and (load x) cst) will match as a zextload already and has
  // additional users.
  // NOTE(review): the '&&' below requires both "pre-legalization" and "logic
  // op legal in VT"; it reads like it could have been intended as '||' —
  // confirm against upstream history before changing.
  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
       N0.getOpcode() == ISD::XOR) &&
      isa<LoadSDNode>(N0.getOperand(0)) &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
    LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
    EVT MemVT = LN00->getMemoryVT();
    if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
        LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
      bool DoXform = true;
      SmallVector<SDNode*, 4> SetCCs;
      if (!N0.hasOneUse()) {
        // If the AND alone would already match as a zextload, don't widen it.
        if (N0.getOpcode() == ISD::AND) {
          auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
          EVT LoadResultTy = AndC->getValueType(0);
          EVT ExtVT;
          if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
            DoXform = false;
        }
      }
      if (DoXform)
        DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                          ISD::ZERO_EXTEND, SetCCs, TLI);
      if (DoXform) {
        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
                                         LN00->getChain(), LN00->getBasePtr(),
                                         LN00->getMemoryVT(),
                                         LN00->getMemOperand());
        APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
        SDLoc DL(N);
        SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                  ExtLoad, DAG.getConstant(Mask, DL, VT));
        ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
        // Capture use-counts before CombineTo mutates the graph.
        bool NoReplaceTruncAnd = !N0.hasOneUse();
        bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        CombineTo(N, And);
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          // The load value has other uses: keep a truncated copy of the
          // extended load for them.
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N,0); // Return N so it doesn't get rechecked!
      }
    }
  }

  // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
  // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
  if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
    return ZExtLoad;

  // Try to simplify (zext (zextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
    return foldedExt;

  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;

  if (N0.getOpcode() == ISD::SETCC) {
    // Only do this before legalize for now.
    if (!LegalOperations && VT.isVector() &&
        N0.getValueType().getVectorElementType() == MVT::i1) {
      EVT N00VT = N0.getOperand(0).getValueType();
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();

      // We know that the # elements of the results is the same as the #
      // elements of the compare (and the # elements of the compare result for
      // that matter). Check to see that they are the same size. If so, we know
      // that the element size of the sext'd result matches the element size of
      // the compare operands.
      SDLoc DL(N);
      if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
        // zext(setcc) -> zext_in_reg(vsetcc) for vectors.
        SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
                                     N0.getOperand(1), N0.getOperand(2));
        return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
      }

      // If the desired elements are smaller or larger than the source
      // elements we can use a matching integer vector type and then
      // truncate/any extend followed by zext_in_reg.
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
                      N0.getOperand(1), N0.getOperand(2));
      return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
                                    N0.getValueType());
    }

    // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
    SDLoc DL(N);
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
            DAG.getConstant(0, DL, VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return SCC;
  }

  // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
      isa<ConstantSDNode>(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
      N0.hasOneUse()) {
    SDValue ShAmt = N0.getOperand(1);
    if (N0.getOpcode() == ISD::SHL) {
      SDValue InnerZExt = N0.getOperand(0);
      // If the original shl may be shifting out bits, do not perform this
      // transformation.
      // KnownZeroBits = width of the high zero region produced by the inner
      // zext; a shift amount larger than that loses bits.
      unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
                               InnerZExt.getOperand(0).getValueSizeInBits();
      if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
        return SDValue();
    }

    SDLoc DL(N);

    // Ensure that the shift amount is wide enough for the shifted value.
    if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
      ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);

    return DAG.getNode(N0.getOpcode(), DL, VT,
                       DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
                       ShAmt);
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  return SDValue();
}

/// Try to simplify an ANY_EXTEND node by folding it into a nested extend or
/// truncate, forming an extload, or folding through a setcc. Returns the
/// replacement value, or a null SDValue if no combine applies.
SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  // fold (aext (aext x)) -> (aext x)
  // fold (aext (zext x)) -> (zext x)
  // fold (aext (sext x)) -> (sext x)
  if (N0.getOpcode() == ISD::ANY_EXTEND ||
      N0.getOpcode() == ISD::ZERO_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND)
    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));

  // fold (aext (truncate (load x))) -> (aext (smaller load x))
  // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
      SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  // fold (aext (truncate x))
  if (N0.getOpcode() == ISD::TRUNCATE)
    return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);

  // Fold (aext (and (trunc x), cst)) -> (and x, cst)
  // if the trunc is not free.
  if (N0.getOpcode() == ISD::AND &&
      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(1).getOpcode() == ISD::Constant &&
      !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                          N0.getValueType())) {
    SDLoc DL(N);
    SDValue X = N0.getOperand(0).getOperand(0);
    X = DAG.getAnyExtOrTrunc(X, DL, VT);
    APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
    return DAG.getNode(ISD::AND, DL, VT,
                       X, DAG.getConstant(Mask, DL, VT));
  }

  // fold (aext (load x)) -> (aext (truncate (extload x)))
  // None of the supported targets knows how to perform load and any_ext
  // on vectors in one instruction. We only perform this transformation on
  // scalars.
  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
      ISD::isUNINDEXEDLoad(N0.getNode()) &&
      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
    bool DoXform = true;
    SmallVector<SDNode*, 4> SetCCs;
    if (!N0.hasOneUse())
      DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
                                        TLI);
    if (DoXform) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                       LN0->getChain(),
                                       LN0->getBasePtr(), N0.getValueType(),
                                       LN0->getMemOperand());
      ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
      // If the load value is used only by N, replace it via CombineTo N.
      bool NoReplaceTrunc = N0.hasOneUse();
      CombineTo(N, ExtLoad);
      if (NoReplaceTrunc) {
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
        recursivelyDeleteUnusedNodes(LN0);
      } else {
        // Other users of the load remain: feed them a truncated copy of the
        // extended load instead.
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                    N0.getValueType(), ExtLoad);
        CombineTo(LN0, Trunc, ExtLoad.getValue(1));
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
  // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
  // fold (aext ( extload x)) -> (aext (truncate (extload x)))
  if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    ISD::LoadExtType ExtType = LN0->getExtensionType();
    EVT MemVT = LN0->getMemoryVT();
    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
      // Re-issue the existing extload at the wider result type; the original
      // extension kind (zext/sext/any) is preserved.
      SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
                                       VT, LN0->getChain(), LN0->getBasePtr(),
                                       MemVT, LN0->getMemOperand());
      CombineTo(N, ExtLoad);
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
      recursivelyDeleteUnusedNodes(LN0);
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }

  if (N0.getOpcode() == ISD::SETCC) {
    // For vectors:
    // aext(setcc) -> vsetcc
    // aext(setcc) -> truncate(vsetcc)
    // aext(setcc) -> aext(vsetcc)
    // Only do this before legalize for now.
    if (VT.isVector() && !LegalOperations) {
      EVT N00VT = N0.getOperand(0).getValueType();
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();

      // We know that the # elements of the results is the same as the
      // # elements of the compare (and the # elements of the compare result
      // for that matter). Check to see that they are the same size. If so,
      // we know that the element size of the sext'd result matches the
      // element size of the compare operands.
      if (VT.getSizeInBits() == N00VT.getSizeInBits())
        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                            N0.getOperand(1),
                            cast<CondCodeSDNode>(N0.getOperand(2))->get());

      // If the desired elements are smaller or larger than the source
      // elements we can use a matching integer vector type and then
      // truncate/any extend
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                       N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
      return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
    }

    // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
    SDLoc DL(N);
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
            DAG.getConstant(0, DL, VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return SCC;
  }

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  return SDValue();
}

/// Combine AssertZext/AssertSext nodes: merge identical back-to-back asserts
/// and strengthen/merge asserts that are separated by a truncate.
SDValue DAGCombiner::visitAssertExt(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT AssertVT = cast<VTSDNode>(N1)->getVT();

  // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
  if (N0.getOpcode() == Opcode &&
      AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
    return N0;

  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == Opcode) {
    // We have an assert, truncate, assert sandwich. Make one stronger assert
    // by asserting on the smallest asserted type to the larger source type.
    // This eliminates the later assert:
    // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
    // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
    SDValue BigA = N0.getOperand(0);
    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
           "Asserting zero/sign-extended bits to a type larger than the "
           "truncated destination does not provide information");

    SDLoc DL(N);
    // Keep the smaller (stronger) of the two asserted types.
    EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
    SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
    SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
                                    BigA.getOperand(0), MinAssertVTVal);
    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
  }

  // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
  // than X. Just move the AssertZext in front of the truncate and drop the
  // AssertSExt.
  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::AssertSext &&
      Opcode == ISD::AssertZext) {
    SDValue BigA = N0.getOperand(0);
    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
           "Asserting zero/sign-extended bits to a type larger than the "
           "truncated destination does not provide information");

    if (AssertVT.bitsLT(BigA_AssertVT)) {
      SDLoc DL(N);
      SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
                                      BigA.getOperand(0), N1);
      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
    }
  }

  return SDValue();
}

/// Combine AssertAlign nodes: merge nested alignment asserts and sink the
/// assert into ADD/SUB operands so the alignment is visible to further
/// combines.
SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
  SDLoc DL(N);

  Align AL = cast<AssertAlignSDNode>(N)->getAlign();
  SDValue N0 = N->getOperand(0);

  // Fold (assertalign (assertalign x, AL0), AL1) ->
  // (assertalign x, max(AL0, AL1))
  if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
    return DAG.getAssertAlign(DL, N0.getOperand(0),
                              std::max(AL, AAN->getAlign()));

  // In rare cases, there are trivial arithmetic ops in source operands. Sink
  // this assert down to source operands so that those arithmetic ops could be
  // exposed to the DAG combining.
  switch (N0.getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::SUB: {
    unsigned AlignShift = Log2(AL);
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);
    unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
    unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
    // Only rewrite when at least one operand is already known to satisfy the
    // alignment; attach the assert to whichever operand still needs it.
    if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
      if (LHSAlignShift < AlignShift)
        LHS = DAG.getAssertAlign(DL, LHS, AL);
      if (RHSAlignShift < AlignShift)
        RHS = DAG.getAssertAlign(DL, RHS, AL);
      return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
    }
    break;
  }
  }

  return SDValue();
}

/// If the result of a wider load is shifted to right of N bits and then
/// truncated to a narrower type and where N is a multiple of number of bits of
/// the narrower type, transform it to a narrower load from address + N / num of
/// bits of new type. Also narrow the load if the result is masked with an AND
/// to effectively produce a smaller type. If the result is to be extended, also
/// fold the extension to form a extending load.
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
  unsigned Opc = N->getOpcode();

  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT ExtVT = VT;

  // This transformation isn't valid for vector loads.
  if (VT.isVector())
    return SDValue();

  unsigned ShAmt = 0;
  bool HasShiftedOffset = false;
  // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
  // extended to VT.
  if (Opc == ISD::SIGN_EXTEND_INREG) {
    ExtType = ISD::SEXTLOAD;
    ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  } else if (Opc == ISD::SRL) {
    // Another special-case: SRL is basically zero-extending a narrower value,
    // or it maybe shifting a higher subword, half or byte into the lowest
    // bits.
    ExtType = ISD::ZEXTLOAD;
    N0 = SDValue(N, 0);

    auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
    auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!N01 || !LN0)
      return SDValue();

    uint64_t ShiftAmt = N01->getZExtValue();
    uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
    if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
    else
      ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                                VT.getSizeInBits() - ShiftAmt);
  } else if (Opc == ISD::AND) {
    // An AND with a constant mask is the same as a truncate + zero-extend.
    auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!AndC)
      return SDValue();

    const APInt &Mask = AndC->getAPIntValue();
    unsigned ActiveBits = 0;
    if (Mask.isMask()) {
      ActiveBits = Mask.countTrailingOnes();
    } else if (Mask.isShiftedMask()) {
      // A shifted mask selects a field above bit 0; remember the offset so the
      // narrowed result can be shifted back into position later.
      ShAmt = Mask.countTrailingZeros();
      APInt ShiftedMask = Mask.lshr(ShAmt);
      ActiveBits = ShiftedMask.countTrailingOnes();
      HasShiftedOffset = true;
    } else
      return SDValue();

    ExtType = ISD::ZEXTLOAD;
    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
  }

  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
    SDValue SRL = N0;
    if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
      ShAmt = ConstShift->getZExtValue();
      unsigned EVTBits = ExtVT.getSizeInBits();
      // Is the shift amount a multiple of size of VT?
      // NOTE(review): the bit-mask test below assumes EVTBits is a power of
      // two — confirm.
      if ((ShAmt & (EVTBits-1)) == 0) {
        N0 = N0.getOperand(0);
        // Is the load width a multiple of size of VT?
        if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
          return SDValue();
      }

      // At this point, we must have a load or else we can't do the transform.
      auto *LN0 = dyn_cast<LoadSDNode>(N0);
      if (!LN0) return SDValue();

      // Because a SRL must be assumed to *need* to zero-extend the high bits
      // (as opposed to anyext the high bits), we can't combine the zextload
      // lowering of SRL and an sextload.
      if (LN0->getExtensionType() == ISD::SEXTLOAD)
        return SDValue();

      // If the shift amount is larger than the input type then we're not
      // accessing any of the loaded bytes. If the load was a zextload/extload
      // then the result of the shift+trunc is zero/undef (handled elsewhere).
      if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
        return SDValue();

      // If the SRL is only used by a masking AND, we may be able to adjust
      // the ExtVT to make the AND redundant.
      SDNode *Mask = *(SRL->use_begin());
      if (Mask->getOpcode() == ISD::AND &&
          isa<ConstantSDNode>(Mask->getOperand(1))) {
        const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
        if (ShiftMask.isMask()) {
          EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                           ShiftMask.countTrailingOnes());
          // If the mask is smaller, recompute the type.
          if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
              TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
            ExtVT = MaskedVT;
        }
      }
    }
  }

  // If the load is shifted left (and the result isn't shifted back right),
  // we can fold the truncate through the shift.
10912 unsigned ShLeftAmt = 0; 10913 if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && 10914 ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { 10915 if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { 10916 ShLeftAmt = N01->getZExtValue(); 10917 N0 = N0.getOperand(0); 10918 } 10919 } 10920 10921 // If we haven't found a load, we can't narrow it. 10922 if (!isa<LoadSDNode>(N0)) 10923 return SDValue(); 10924 10925 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 10926 // Reducing the width of a volatile load is illegal. For atomics, we may be 10927 // able to reduce the width provided we never widen again. (see D66309) 10928 if (!LN0->isSimple() || 10929 !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt)) 10930 return SDValue(); 10931 10932 auto AdjustBigEndianShift = [&](unsigned ShAmt) { 10933 unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits(); 10934 unsigned EVTStoreBits = ExtVT.getStoreSizeInBits(); 10935 return LVTStoreBits - EVTStoreBits - ShAmt; 10936 }; 10937 10938 // For big endian targets, we need to adjust the offset to the pointer to 10939 // load the correct bytes. 10940 if (DAG.getDataLayout().isBigEndian()) 10941 ShAmt = AdjustBigEndianShift(ShAmt); 10942 10943 uint64_t PtrOff = ShAmt / 8; 10944 unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); 10945 SDLoc DL(LN0); 10946 // The original load itself didn't wrap, so an offset within it doesn't. 
  // The original load itself didn't wrap, so an offset within it doesn't.
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(true);
  // Address of the narrowed load: the original base pointer plus the byte
  // offset (PtrOff) derived from the endian-adjusted shift amount.
  SDValue NewPtr =
      DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags);
  AddToWorklist(NewPtr.getNode());

  // Emit the narrower load: a plain load when no extension is required,
  // otherwise an extending load of ExtVT widened to VT.
  SDValue Load;
  if (ExtType == ISD::NON_EXTLOAD)
    Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
                       LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
  else
    Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
                          LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
                          NewAlign, LN0->getMemOperand()->getFlags(),
                          LN0->getAAInfo());

  // Replace the old load's chain with the new load's chain.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));

  // Shift the result left, if we've swallowed a left shift.
  SDValue Result = Load;
  if (ShLeftAmt != 0) {
    EVT ShImmTy = getShiftAmountTy(Result.getValueType());
    if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
      ShImmTy = VT;
    // If the shift amount is as large as the result size (but, presumably,
    // no larger than the source) then the useful bits of the result are
    // zero; we can't simply return the shortened shift, because the result
    // of that operation is undefined.
    if (ShLeftAmt >= VT.getSizeInBits())
      Result = DAG.getConstant(0, DL, VT);
    else
      Result = DAG.getNode(ISD::SHL, DL, VT,
                           Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
  }

  if (HasShiftedOffset) {
    // Recalculate the shift amount after it has been altered to calculate
    // the offset.
    if (DAG.getDataLayout().isBigEndian())
      ShAmt = AdjustBigEndianShift(ShAmt);

    // We're using a shifted mask, so the load now has an offset. This means
    // that data has been loaded into the lower bytes than it would have been
    // before, so we need to shl the loaded data into the correct position in
    // the register.
    SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
    Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
  }

  // Return the new loaded value.
  return Result;
}

/// Combine a SIGN_EXTEND_INREG node: the value in the low ExtVT bits of the
/// operand is sign-extended into the full width VT. N1 is the VTSDNode
/// carrying ExtVT.
SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned ExtVTBits = ExtVT.getScalarSizeInBits();

  // sext_in_reg(undef) = 0 because the top bits will all be the same.
  if (N0.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (sext_in_reg c1) -> c1
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);

  // If the input is already sign extended, just drop the extension.
  if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))
    return N0;

  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
      ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
                       N1);

  // fold (sext_in_reg (sext x)) -> (sext x)
  // fold (sext_in_reg (aext x)) -> (sext x)
  // if x is small enough or if we know that x has more than 1 sign bit and the
  // sign_extend_inreg is extending from one of them.
11034 if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { 11035 SDValue N00 = N0.getOperand(0); 11036 unsigned N00Bits = N00.getScalarValueSizeInBits(); 11037 if ((N00Bits <= ExtVTBits || 11038 (N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) && 11039 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) 11040 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); 11041 } 11042 11043 // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) 11044 if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || 11045 N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || 11046 N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && 11047 N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) { 11048 if (!LegalOperations || 11049 TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) 11050 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, 11051 N0.getOperand(0)); 11052 } 11053 11054 // fold (sext_in_reg (zext x)) -> (sext x) 11055 // iff we are extending the source sign bit. 11056 if (N0.getOpcode() == ISD::ZERO_EXTEND) { 11057 SDValue N00 = N0.getOperand(0); 11058 if (N00.getScalarValueSizeInBits() == ExtVTBits && 11059 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) 11060 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); 11061 } 11062 11063 // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. 11064 if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1))) 11065 return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT); 11066 11067 // fold operands of sext_in_reg based on knowledge that the top bits are not 11068 // demanded. 
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (sext_in_reg (load x)) -> (smaller sextload x)
  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
  if (SDValue NarrowLoad = ReduceLoadWidth(N))
    return NarrowLoad;

  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
  if (N0.getOpcode() == ISD::SRL) {
    if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
      if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
        // We can turn this into an SRA iff the input to the SRL is already sign
        // extended enough.
        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
        if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
          return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
                             N0.getOperand(1));
      }
  }

  // fold (sext_inreg (extload x)) -> (sextload x)
  // If sextload is not supported by target, we can only do the combine when
  // load has one use. Doing otherwise can block folding the extload with other
  // extends that the target does support.
  if (ISD::isEXTLoad(N0.getNode()) &&
      ISD::isUNINDEXEDLoad(N0.getNode()) &&
      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
        N0.hasOneUse()) ||
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), ExtVT,
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    AddToWorklist(ExtLoad.getNode());
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }
  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), ExtVT,
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
  if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                           N0.getOperand(1), false))
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
  }

  return SDValue();
}

/// Combine a SIGN_EXTEND_VECTOR_INREG node.
SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // sext_vector_inreg(undef) = 0 because the top bit will all be the same.
  if (N0.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

/// Combine a ZERO_EXTEND_VECTOR_INREG node.
SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // zext_vector_inreg(undef) = 0 because the top bits will be zero.
  if (N0.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;

  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

/// Combine a TRUNCATE node.
SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();
  bool isLE = DAG.getDataLayout().isLittleEndian();

  // noop truncate
  if (SrcVT == VT)
    return N0;

  // fold (truncate (truncate x)) -> (truncate x)
  if (N0.getOpcode() == ISD::TRUNCATE)
    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));

  // fold (truncate c1) -> c1
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
    SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
    if (C.getNode() != N)
      return C;
  }

  // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND ||
      N0.getOpcode() == ISD::ANY_EXTEND) {
    // if the source is smaller than the dest, we still need an extend.
    if (N0.getOperand(0).getValueType().bitsLT(VT))
      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
    // if the source is larger than the dest, then we just need the truncate.
    if (N0.getOperand(0).getValueType().bitsGT(VT))
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
    // if the source and dest are the same type, we can drop both the extend
    // and the truncate.
    return N0.getOperand(0);
  }

  // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
    return SDValue();

  // Fold extract-and-trunc into a narrow extract. For example:
  //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
  //   i32 y = TRUNCATE(i64 x)
  //        -- becomes --
  //   v16i8 b = BITCAST (v2i64 val)
  //   i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
  //
  // Note: We only run this optimization after type legalization (which often
  // creates this pattern) and before operation legalization after which
  // we need to be more careful about the vector instructions that we generate.
  if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
    EVT VecTy = N0.getOperand(0).getValueType();
    EVT ExTy = N0.getValueType();
    EVT TrTy = N->getValueType(0);

    unsigned NumElem = VecTy.getVectorNumElements();
    unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();

    // Re-type the source vector with narrower (truncated-size) elements.
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
    assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");

    SDValue EltNo = N0->getOperand(1);
    if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
      int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
      // On big-endian targets the low (kept) bits live in the last of the
      // SizeRatio narrow elements covering the original element.
      int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));

      SDLoc DL(N);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
                         DAG.getBitcast(NVT, N0.getOperand(0)),
                         DAG.getVectorIdxConstant(Index, DL));
    }
  }

  // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
        TLI.isTruncateFree(SrcVT, VT)) {
      SDLoc SL(N0);
      SDValue Cond = N0.getOperand(0);
      SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
      SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
    }
  }

  // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
  if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
      TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
    SDValue Amt = N0.getOperand(1);
    KnownBits Known = DAG.computeKnownBits(Amt);
    unsigned Size = VT.getScalarSizeInBits();
    // Only safe when the shift amount is provably < the narrow bit width.
    if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
      SDLoc SL(N);
      EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
      if (AmtVT != Amt.getValueType()) {
        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
        AddToWorklist(Amt.getNode());
      }
      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
    }
  }

  // Attempt to pre-truncate BUILD_VECTOR sources.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
      TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
      // Avoid creating illegal types if running after type legalizer.
      (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
    SDLoc DL(N);
    EVT SVT = VT.getScalarType();
    SmallVector<SDValue, 8> TruncOps;
    for (const SDValue &Op : N0->op_values()) {
      SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
      TruncOps.push_back(TruncOp);
    }
    return DAG.getBuildVector(VT, DL, TruncOps);
  }

  // Fold a series of buildvector, bitcast, and truncate if possible.
  // For example fold
  //   (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
  //   (2xi32 (buildvector x, y)).
  if (Level == AfterLegalizeVectorOps && VT.isVector() &&
      N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
      N0.getOperand(0).hasOneUse()) {
    SDValue BuildVect = N0.getOperand(0);
    EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
    EVT TruncVecEltTy = VT.getVectorElementType();

    // Check that the element types match.
    if (BuildVectEltTy == TruncVecEltTy) {
      // Now we only need to compute the offset of the truncated elements.
      unsigned BuildVecNumElts = BuildVect.getNumOperands();
      unsigned TruncVecNumElts = VT.getVectorNumElements();
      unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;

      assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
             "Invalid number of elements");

      // Keep every TruncEltOffset-th source operand; those carry the bits
      // that survive the truncate.
      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
        Opnds.push_back(BuildVect.getOperand(i));

      return DAG.getBuildVector(VT, SDLoc(N), Opnds);
    }
  }

  // See if we can simplify the input to this truncate through knowledge that
  // only the low bits are being used.
  // For example "trunc (or (shl x, 8), y)" -> trunc y
  // Currently we only perform this optimization on scalars because vectors
  // may have different active low bits.
  if (!VT.isVector()) {
    APInt Mask =
        APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
    if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
  }

  // fold (truncate (load x)) -> (smaller load x)
  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
    if (SDValue Reduced = ReduceLoadWidth(N))
      return Reduced;

    // Handle the case where the load remains an extending load even
    // after truncation.
    if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      if (LN0->isSimple() &&
          LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
        SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
                                         VT, LN0->getChain(), LN0->getBasePtr(),
                                         LN0->getMemoryVT(),
                                         LN0->getMemOperand());
        DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
        return NewLoad;
      }
    }
  }

  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
  // where ... are all 'undef'.
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
    SmallVector<EVT, 8> VTs;
    SDValue V;
    unsigned Idx = 0;
    unsigned NumDefs = 0;

    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
      SDValue X = N0.getOperand(i);
      if (!X.isUndef()) {
        V = X;
        Idx = i;
        NumDefs++;
      }
      // Stop if more than one members are non-undef.
      if (NumDefs > 1)
        break;

      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
                                     VT.getVectorElementType(),
                                     X.getValueType().getVectorElementCount()));
    }

    if (NumDefs == 0)
      return DAG.getUNDEF(VT);

    if (NumDefs == 1) {
      assert(V.getNode() && "The single defined operand is empty!");
      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
        if (i != Idx) {
          Opnds.push_back(DAG.getUNDEF(VTs[i]));
          continue;
        }
        SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
        AddToWorklist(NV.getNode());
        Opnds.push_back(NV);
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
    }
  }

  // Fold truncate of a bitcast of a vector to an extract of the low vector
  // element.
  //
  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue VecSrc = N0.getOperand(0);
    EVT VecSrcVT = VecSrc.getValueType();
    if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
        (!LegalOperations ||
         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
      SDLoc SL(N);

      // The low-order element is at index 0 on little-endian targets and at
      // the last index on big-endian targets.
      unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
                         DAG.getVectorIdxConstant(Idx, SL));
    }
  }

  // Simplify the operands using demanded-bits information.
  if (!VT.isVector() &&
      SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
  // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
  // When the adde's carry is not used.
  if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
      N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
      // We only do for addcarry before legalize operation
      ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
       TLI.isOperationLegal(N0.getOpcode(), VT))) {
    SDLoc SL(N);
    auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
    auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
    auto VTs = DAG.getVTList(VT, N0->getValueType(1));
    return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
  }

  // fold (truncate (extract_subvector(ext x))) ->
  //      (extract_subvector x)
  // TODO: This can be generalized to cover cases where the truncate and extract
  // do not fully cancel each other out.
  if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == ISD::SIGN_EXTEND ||
        N00.getOpcode() == ISD::ZERO_EXTEND ||
        N00.getOpcode() == ISD::ANY_EXTEND) {
      if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
          VT.getVectorElementType())
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
                           N00.getOperand(0), N0.getOperand(1));
    }
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  // Narrow a suitable binary operation with a non-opaque constant operand by
  // moving it ahead of the truncate. This is limited to pre-legalization
  // because targets may prefer a wider type during later combines and invert
  // this transform.
  switch (N0.getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (!LegalOperations && N0.hasOneUse() &&
        (isConstantOrConstantVector(N0.getOperand(0), true) ||
         isConstantOrConstantVector(N0.getOperand(1), true))) {
      // TODO: We already restricted this to pre-legalization, but for vectors
      // we are extra cautious to not create an unsupported operation.
      // Target-specific changes are likely needed to avoid regressions here.
      if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
        SDLoc DL(N);
        SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
        SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
      }
    }
  }

  return SDValue();
}

/// Return operand i of the BUILD_PAIR, looking through a MERGE_VALUES node
/// to the underlying defining node if necessary.
static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
  SDValue Elt = N->getOperand(i);
  if (Elt.getOpcode() != ISD::MERGE_VALUES)
    return Elt.getNode();
  return Elt.getOperand(Elt.getResNo()).getNode();
}

/// build_pair (load, load) -> load
/// if load locations are consecutive.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
  assert(N->getOpcode() == ISD::BUILD_PAIR);

  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

  // A BUILD_PAIR is always having the least significant part in elt 0 and the
  // most significant part in elt 1. So when combining into one large load, we
  // need to consider the endianness.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(LD1, LD2);

  // After the possible swap, LD1 must be the lower-addressed load.
  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
      LD1->getAddressSpace() != LD2->getAddressSpace())
    return SDValue();
  EVT LD1VT = LD1->getValueType(0);
  unsigned LD1Bytes = LD1VT.getStoreSize();
  if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
    Align Alignment = LD1->getAlign();
    Align NewAlign = DAG.getDataLayout().getABITypeAlign(
        VT.getTypeForEVT(*DAG.getContext()));

    // Only combine when the first load is already aligned well enough for
    // the wide type and the wide load is (or may be) legal.
    if (NewAlign <= Alignment &&
        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
                         LD1->getPointerInfo(), Alignment);
  }

  return SDValue();
}

static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
  // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
  // and Lo parts; on big-endian machines it doesn't.
  return DAG.getDataLayout().isBigEndian() ? 1 : 0;
}

/// Try to turn an integer logic op wrapped in bitcasts into the equivalent
/// FP sign-bit operation (fabs/fneg/fneg(fabs)) on the original FP value.
static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
                                    const TargetLowering &TLI) {
  // If this is not a bitcast to an FP type or if the target doesn't have
  // IEEE754-compliant FP logic, we're done.
  EVT VT = N->getValueType(0);
  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
    return SDValue();

  // TODO: Handle cases where the integer constant is a different scalar
  // bitwidth to the FP.
  SDValue N0 = N->getOperand(0);
  EVT SourceVT = N0.getValueType();
  if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
    return SDValue();

  unsigned FPOpcode;
  APInt SignMask;
  switch (N0.getOpcode()) {
  case ISD::AND:
    FPOpcode = ISD::FABS;
    SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  case ISD::XOR:
    FPOpcode = ISD::FNEG;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  case ISD::OR:
    FPOpcode = ISD::FABS;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
    break;
  default:
    return SDValue();
  }

  // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
  // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
  // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
  //   fneg (fabs X)
  SDValue LogicOp0 = N0.getOperand(0);
  ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
  if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
      LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.getOperand(0).getValueType() == VT) {
    SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
    NumFPLogicOpsConv++;
    if (N0.getOpcode() == ISD::OR)
      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
    return FPOp;
  }

  return SDValue();
}

/// Combine a BITCAST node.
SDValue DAGCombiner::visitBITCAST(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // If the input is a BUILD_VECTOR with all constant elements, fold this now.
  // Only do this before legalize types, unless both types are integer and the
  // scalar type is legal. Only do this before legalize ops, since the target
  // maybe depending on the bitcast.
  // First check to see if this is all constant.
  // TODO: Support FP bitcasts after legalize types.
  if (VT.isVector() &&
      (!LegalTypes ||
       (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
        TLI.isTypeLegal(VT.getVectorElementType()))) &&
      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
      cast<BuildVectorSDNode>(N0)->isConstant())
    return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
                                             VT.getVectorElementType());

  // If the input is a constant, let getNode fold it.
  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
    // If we can't allow illegal operations, we need to check that this is just
    // a fp -> int or int -> fp conversion and that the resulting operation
    // will be legal.
    if (!LegalOperations ||
        (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
         TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
        (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
         TLI.isOperationLegal(ISD::Constant, VT))) {
      SDValue C = DAG.getBitcast(VT, N0);
      if (C.getNode() != N)
        return C;
    }
  }

  // (conv (conv x, t1), t2) -> (conv x, t2)
  if (N0.getOpcode() == ISD::BITCAST)
    return DAG.getBitcast(VT, N0.getOperand(0));

  // fold (conv (load x)) -> (load (conv*)x)
  // If the resultant load doesn't need a higher alignment than the original!
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      // Do not remove the cast if the types differ in endian layout.
      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
          TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
      // If the load is volatile, we only want to change the load type if the
      // resulting load is legal. Otherwise we might increase the number of
      // memory accesses. We don't care if the original type was legal or not
      // as we assume software couldn't rely on the number of accesses of an
      // illegal type.
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
       TLI.isOperationLegal(ISD::LOAD, VT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);

    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
                                    *LN0->getMemOperand())) {
      SDValue Load =
          DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                      LN0->getPointerInfo(), LN0->getAlignment(),
                      LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
      return Load;
    }
  }

  if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
    return V;

  // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
  // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
  //
  // For ppc_fp128:
  // fold (bitcast (fneg x)) ->
  //     flipbit = signbit
  //     (xor (bitcast x) (build_pair flipbit, flipbit))
  //
  // fold (bitcast (fabs x)) ->
  //     flipbit = (and (extract_element (bitcast x), 0), signbit)
  //     (xor (bitcast x) (build_pair flipbit, flipbit))
  // This often reduces constant pool loads.
11669 if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || 11670 (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && 11671 N0.getNode()->hasOneUse() && VT.isInteger() && 11672 !VT.isVector() && !N0.getValueType().isVector()) { 11673 SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); 11674 AddToWorklist(NewConv.getNode()); 11675 11676 SDLoc DL(N); 11677 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { 11678 assert(VT.getSizeInBits() == 128); 11679 SDValue SignBit = DAG.getConstant( 11680 APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); 11681 SDValue FlipBit; 11682 if (N0.getOpcode() == ISD::FNEG) { 11683 FlipBit = SignBit; 11684 AddToWorklist(FlipBit.getNode()); 11685 } else { 11686 assert(N0.getOpcode() == ISD::FABS); 11687 SDValue Hi = 11688 DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, 11689 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), 11690 SDLoc(NewConv))); 11691 AddToWorklist(Hi.getNode()); 11692 FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); 11693 AddToWorklist(FlipBit.getNode()); 11694 } 11695 SDValue FlipBits = 11696 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); 11697 AddToWorklist(FlipBits.getNode()); 11698 return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); 11699 } 11700 APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); 11701 if (N0.getOpcode() == ISD::FNEG) 11702 return DAG.getNode(ISD::XOR, DL, VT, 11703 NewConv, DAG.getConstant(SignBit, DL, VT)); 11704 assert(N0.getOpcode() == ISD::FABS); 11705 return DAG.getNode(ISD::AND, DL, VT, 11706 NewConv, DAG.getConstant(~SignBit, DL, VT)); 11707 } 11708 11709 // fold (bitconvert (fcopysign cst, x)) -> 11710 // (or (and (bitconvert x), sign), (and cst, (not sign))) 11711 // Note that we don't handle (copysign x, cst) because this can always be 11712 // folded to an fneg or fabs. 
11713 // 11714 // For ppc_fp128: 11715 // fold (bitcast (fcopysign cst, x)) -> 11716 // flipbit = (and (extract_element 11717 // (xor (bitcast cst), (bitcast x)), 0), 11718 // signbit) 11719 // (xor (bitcast cst) (build_pair flipbit, flipbit)) 11720 if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && 11721 isa<ConstantFPSDNode>(N0.getOperand(0)) && 11722 VT.isInteger() && !VT.isVector()) { 11723 unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); 11724 EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); 11725 if (isTypeLegal(IntXVT)) { 11726 SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1)); 11727 AddToWorklist(X.getNode()); 11728 11729 // If X has a different width than the result/lhs, sext it or truncate it. 11730 unsigned VTWidth = VT.getSizeInBits(); 11731 if (OrigXWidth < VTWidth) { 11732 X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); 11733 AddToWorklist(X.getNode()); 11734 } else if (OrigXWidth > VTWidth) { 11735 // To get the sign bit in the right place, we have to shift it right 11736 // before truncating. 
11737 SDLoc DL(X); 11738 X = DAG.getNode(ISD::SRL, DL, 11739 X.getValueType(), X, 11740 DAG.getConstant(OrigXWidth-VTWidth, DL, 11741 X.getValueType())); 11742 AddToWorklist(X.getNode()); 11743 X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); 11744 AddToWorklist(X.getNode()); 11745 } 11746 11747 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { 11748 APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); 11749 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); 11750 AddToWorklist(Cst.getNode()); 11751 SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); 11752 AddToWorklist(X.getNode()); 11753 SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); 11754 AddToWorklist(XorResult.getNode()); 11755 SDValue XorResult64 = DAG.getNode( 11756 ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, 11757 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), 11758 SDLoc(XorResult))); 11759 AddToWorklist(XorResult64.getNode()); 11760 SDValue FlipBit = 11761 DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, 11762 DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); 11763 AddToWorklist(FlipBit.getNode()); 11764 SDValue FlipBits = 11765 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); 11766 AddToWorklist(FlipBits.getNode()); 11767 return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); 11768 } 11769 APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); 11770 X = DAG.getNode(ISD::AND, SDLoc(X), VT, 11771 X, DAG.getConstant(SignBit, SDLoc(X), VT)); 11772 AddToWorklist(X.getNode()); 11773 11774 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); 11775 Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, 11776 Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT)); 11777 AddToWorklist(Cst.getNode()); 11778 11779 return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); 11780 } 11781 } 11782 11783 // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. 
11784 if (N0.getOpcode() == ISD::BUILD_PAIR) 11785 if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) 11786 return CombineLD; 11787 11788 // Remove double bitcasts from shuffles - this is often a legacy of 11789 // XformToShuffleWithZero being used to combine bitmaskings (of 11790 // float vectors bitcast to integer vectors) into shuffles. 11791 // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1) 11792 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() && 11793 N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() && 11794 VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() && 11795 !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) { 11796 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0); 11797 11798 // If operands are a bitcast, peek through if it casts the original VT. 11799 // If operands are a constant, just bitcast back to original VT. 11800 auto PeekThroughBitcast = [&](SDValue Op) { 11801 if (Op.getOpcode() == ISD::BITCAST && 11802 Op.getOperand(0).getValueType() == VT) 11803 return SDValue(Op.getOperand(0)); 11804 if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || 11805 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) 11806 return DAG.getBitcast(VT, Op); 11807 return SDValue(); 11808 }; 11809 11810 // FIXME: If either input vector is bitcast, try to convert the shuffle to 11811 // the result type of this bitcast. This would eliminate at least one 11812 // bitcast. See the transform in InstCombine. 11813 SDValue SV0 = PeekThroughBitcast(N0->getOperand(0)); 11814 SDValue SV1 = PeekThroughBitcast(N0->getOperand(1)); 11815 if (!(SV0 && SV1)) 11816 return SDValue(); 11817 11818 int MaskScale = 11819 VT.getVectorNumElements() / N0.getValueType().getVectorNumElements(); 11820 SmallVector<int, 8> NewMask; 11821 for (int M : SVN->getMask()) 11822 for (int i = 0; i != MaskScale; ++i) 11823 NewMask.push_back(M < 0 ? 
-1 : M * MaskScale + i); 11824 11825 SDValue LegalShuffle = 11826 TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG); 11827 if (LegalShuffle) 11828 return LegalShuffle; 11829 } 11830 11831 return SDValue(); 11832 } 11833 11834 SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { 11835 EVT VT = N->getValueType(0); 11836 return CombineConsecutiveLoads(N, VT); 11837 } 11838 11839 SDValue DAGCombiner::visitFREEZE(SDNode *N) { 11840 SDValue N0 = N->getOperand(0); 11841 11842 // (freeze (freeze x)) -> (freeze x) 11843 if (N0.getOpcode() == ISD::FREEZE) 11844 return N0; 11845 11846 // If the input is a constant, return it. 11847 if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) 11848 return N0; 11849 11850 return SDValue(); 11851 } 11852 11853 /// We know that BV is a build_vector node with Constant, ConstantFP or Undef 11854 /// operands. DstEltVT indicates the destination element value type. 11855 SDValue DAGCombiner:: 11856 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { 11857 EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); 11858 11859 // If this is already the right type, we're done. 11860 if (SrcEltVT == DstEltVT) return SDValue(BV, 0); 11861 11862 unsigned SrcBitSize = SrcEltVT.getSizeInBits(); 11863 unsigned DstBitSize = DstEltVT.getSizeInBits(); 11864 11865 // If this is a conversion of N elements of one type to N elements of another 11866 // type, convert each element. This handles FP<->INT cases. 11867 if (SrcBitSize == DstBitSize) { 11868 SmallVector<SDValue, 8> Ops; 11869 for (SDValue Op : BV->op_values()) { 11870 // If the vector element type is not legal, the BUILD_VECTOR operands 11871 // are promoted and implicitly truncated. Make that explicit here. 
11872 if (Op.getValueType() != SrcEltVT) 11873 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); 11874 Ops.push_back(DAG.getBitcast(DstEltVT, Op)); 11875 AddToWorklist(Ops.back().getNode()); 11876 } 11877 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, 11878 BV->getValueType(0).getVectorNumElements()); 11879 return DAG.getBuildVector(VT, SDLoc(BV), Ops); 11880 } 11881 11882 // Otherwise, we're growing or shrinking the elements. To avoid having to 11883 // handle annoying details of growing/shrinking FP values, we convert them to 11884 // int first. 11885 if (SrcEltVT.isFloatingPoint()) { 11886 // Convert the input float vector to a int vector where the elements are the 11887 // same sizes. 11888 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); 11889 BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); 11890 SrcEltVT = IntVT; 11891 } 11892 11893 // Now we know the input is an integer vector. If the output is a FP type, 11894 // convert to integer first, then to FP of the right size. 11895 if (DstEltVT.isFloatingPoint()) { 11896 EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); 11897 SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); 11898 11899 // Next, convert to FP elements of the same size. 11900 return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT); 11901 } 11902 11903 SDLoc DL(BV); 11904 11905 // Okay, we know the src/dst types are both integers of differing types. 11906 // Handling growing first. 
11907 assert(SrcEltVT.isInteger() && DstEltVT.isInteger()); 11908 if (SrcBitSize < DstBitSize) { 11909 unsigned NumInputsPerOutput = DstBitSize/SrcBitSize; 11910 11911 SmallVector<SDValue, 8> Ops; 11912 for (unsigned i = 0, e = BV->getNumOperands(); i != e; 11913 i += NumInputsPerOutput) { 11914 bool isLE = DAG.getDataLayout().isLittleEndian(); 11915 APInt NewBits = APInt(DstBitSize, 0); 11916 bool EltIsUndef = true; 11917 for (unsigned j = 0; j != NumInputsPerOutput; ++j) { 11918 // Shift the previously computed bits over. 11919 NewBits <<= SrcBitSize; 11920 SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j)); 11921 if (Op.isUndef()) continue; 11922 EltIsUndef = false; 11923 11924 NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue(). 11925 zextOrTrunc(SrcBitSize).zext(DstBitSize); 11926 } 11927 11928 if (EltIsUndef) 11929 Ops.push_back(DAG.getUNDEF(DstEltVT)); 11930 else 11931 Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT)); 11932 } 11933 11934 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); 11935 return DAG.getBuildVector(VT, DL, Ops); 11936 } 11937 11938 // Finally, this must be the case where we are shrinking elements: each input 11939 // turns into multiple outputs. 
11940 unsigned NumOutputsPerInput = SrcBitSize/DstBitSize; 11941 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, 11942 NumOutputsPerInput*BV->getNumOperands()); 11943 SmallVector<SDValue, 8> Ops; 11944 11945 for (const SDValue &Op : BV->op_values()) { 11946 if (Op.isUndef()) { 11947 Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT)); 11948 continue; 11949 } 11950 11951 APInt OpVal = cast<ConstantSDNode>(Op)-> 11952 getAPIntValue().zextOrTrunc(SrcBitSize); 11953 11954 for (unsigned j = 0; j != NumOutputsPerInput; ++j) { 11955 APInt ThisVal = OpVal.trunc(DstBitSize); 11956 Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); 11957 OpVal.lshrInPlace(DstBitSize); 11958 } 11959 11960 // For big endian targets, swap the order of the pieces of each element. 11961 if (DAG.getDataLayout().isBigEndian()) 11962 std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); 11963 } 11964 11965 return DAG.getBuildVector(VT, DL, Ops); 11966 } 11967 11968 static bool isContractable(SDNode *N) { 11969 SDNodeFlags F = N->getFlags(); 11970 return F.hasAllowContract() || F.hasAllowReassociation(); 11971 } 11972 11973 /// Try to perform FMA combining on a given FADD node. 11974 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { 11975 SDValue N0 = N->getOperand(0); 11976 SDValue N1 = N->getOperand(1); 11977 EVT VT = N->getValueType(0); 11978 SDLoc SL(N); 11979 11980 const TargetOptions &Options = DAG.getTarget().Options; 11981 11982 // Floating-point multiply-add with intermediate rounding. 11983 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); 11984 11985 // Floating-point multiply-add without intermediate rounding. 11986 bool HasFMA = 11987 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && 11988 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); 11989 11990 // No valid opcode, do not combine. 
11991 if (!HasFMAD && !HasFMA) 11992 return SDValue(); 11993 11994 SDNodeFlags Flags = N->getFlags(); 11995 bool CanFuse = Options.UnsafeFPMath || isContractable(N); 11996 bool CanReassociate = 11997 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); 11998 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || 11999 CanFuse || HasFMAD); 12000 // If the addition is not contractable, do not combine. 12001 if (!AllowFusionGlobally && !isContractable(N)) 12002 return SDValue(); 12003 12004 if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) 12005 return SDValue(); 12006 12007 // Always prefer FMAD to FMA for precision. 12008 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; 12009 bool Aggressive = TLI.enableAggressiveFMAFusion(VT); 12010 12011 // Is the node an FMUL and contractable either due to global flags or 12012 // SDNodeFlags. 12013 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { 12014 if (N.getOpcode() != ISD::FMUL) 12015 return false; 12016 return AllowFusionGlobally || isContractable(N.getNode()); 12017 }; 12018 // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), 12019 // prefer to fold the multiply with fewer uses. 12020 if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { 12021 if (N0.getNode()->use_size() > N1.getNode()->use_size()) 12022 std::swap(N0, N1); 12023 } 12024 12025 // fold (fadd (fmul x, y), z) -> (fma x, y, z) 12026 if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { 12027 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12028 N0.getOperand(0), N0.getOperand(1), N1, Flags); 12029 } 12030 12031 // fold (fadd x, (fmul y, z)) -> (fma y, z, x) 12032 // Note: Commutes FADD operands. 
12033 if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { 12034 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12035 N1.getOperand(0), N1.getOperand(1), N0, Flags); 12036 } 12037 12038 // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) 12039 // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E) 12040 // This requires reassociation because it changes the order of operations. 12041 SDValue FMA, E; 12042 if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode && 12043 N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() && 12044 N0.getOperand(2).hasOneUse()) { 12045 FMA = N0; 12046 E = N1; 12047 } else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode && 12048 N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() && 12049 N1.getOperand(2).hasOneUse()) { 12050 FMA = N1; 12051 E = N0; 12052 } 12053 if (FMA && E) { 12054 SDValue A = FMA.getOperand(0); 12055 SDValue B = FMA.getOperand(1); 12056 SDValue C = FMA.getOperand(2).getOperand(0); 12057 SDValue D = FMA.getOperand(2).getOperand(1); 12058 SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags); 12059 return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags); 12060 } 12061 12062 // Look through FP_EXTEND nodes to do more combining. 12063 12064 // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) 12065 if (N0.getOpcode() == ISD::FP_EXTEND) { 12066 SDValue N00 = N0.getOperand(0); 12067 if (isContractableFMUL(N00) && 12068 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12069 N00.getValueType())) { 12070 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12071 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12072 N00.getOperand(0)), 12073 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12074 N00.getOperand(1)), N1, Flags); 12075 } 12076 } 12077 12078 // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) 12079 // Note: Commutes FADD operands. 
12080 if (N1.getOpcode() == ISD::FP_EXTEND) { 12081 SDValue N10 = N1.getOperand(0); 12082 if (isContractableFMUL(N10) && 12083 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12084 N10.getValueType())) { 12085 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12086 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12087 N10.getOperand(0)), 12088 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12089 N10.getOperand(1)), N0, Flags); 12090 } 12091 } 12092 12093 // More folding opportunities when target permits. 12094 if (Aggressive) { 12095 // fold (fadd (fma x, y, (fpext (fmul u, v))), z) 12096 // -> (fma x, y, (fma (fpext u), (fpext v), z)) 12097 auto FoldFAddFMAFPExtFMul = [&] ( 12098 SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, 12099 SDNodeFlags Flags) { 12100 return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, 12101 DAG.getNode(PreferredFusedOpcode, SL, VT, 12102 DAG.getNode(ISD::FP_EXTEND, SL, VT, U), 12103 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), 12104 Z, Flags), Flags); 12105 }; 12106 if (N0.getOpcode() == PreferredFusedOpcode) { 12107 SDValue N02 = N0.getOperand(2); 12108 if (N02.getOpcode() == ISD::FP_EXTEND) { 12109 SDValue N020 = N02.getOperand(0); 12110 if (isContractableFMUL(N020) && 12111 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12112 N020.getValueType())) { 12113 return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), 12114 N020.getOperand(0), N020.getOperand(1), 12115 N1, Flags); 12116 } 12117 } 12118 } 12119 12120 // fold (fadd (fpext (fma x, y, (fmul u, v))), z) 12121 // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) 12122 // FIXME: This turns two single-precision and one double-precision 12123 // operation into two double-precision operations, which might not be 12124 // interesting for all targets, especially GPUs. 
12125 auto FoldFAddFPExtFMAFMul = [&] ( 12126 SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, 12127 SDNodeFlags Flags) { 12128 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12129 DAG.getNode(ISD::FP_EXTEND, SL, VT, X), 12130 DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), 12131 DAG.getNode(PreferredFusedOpcode, SL, VT, 12132 DAG.getNode(ISD::FP_EXTEND, SL, VT, U), 12133 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), 12134 Z, Flags), Flags); 12135 }; 12136 if (N0.getOpcode() == ISD::FP_EXTEND) { 12137 SDValue N00 = N0.getOperand(0); 12138 if (N00.getOpcode() == PreferredFusedOpcode) { 12139 SDValue N002 = N00.getOperand(2); 12140 if (isContractableFMUL(N002) && 12141 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12142 N00.getValueType())) { 12143 return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), 12144 N002.getOperand(0), N002.getOperand(1), 12145 N1, Flags); 12146 } 12147 } 12148 } 12149 12150 // fold (fadd x, (fma y, z, (fpext (fmul u, v))) 12151 // -> (fma y, z, (fma (fpext u), (fpext v), x)) 12152 if (N1.getOpcode() == PreferredFusedOpcode) { 12153 SDValue N12 = N1.getOperand(2); 12154 if (N12.getOpcode() == ISD::FP_EXTEND) { 12155 SDValue N120 = N12.getOperand(0); 12156 if (isContractableFMUL(N120) && 12157 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12158 N120.getValueType())) { 12159 return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), 12160 N120.getOperand(0), N120.getOperand(1), 12161 N0, Flags); 12162 } 12163 } 12164 } 12165 12166 // fold (fadd x, (fpext (fma y, z, (fmul u, v))) 12167 // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) 12168 // FIXME: This turns two single-precision and one double-precision 12169 // operation into two double-precision operations, which might not be 12170 // interesting for all targets, especially GPUs. 
12171 if (N1.getOpcode() == ISD::FP_EXTEND) { 12172 SDValue N10 = N1.getOperand(0); 12173 if (N10.getOpcode() == PreferredFusedOpcode) { 12174 SDValue N102 = N10.getOperand(2); 12175 if (isContractableFMUL(N102) && 12176 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12177 N10.getValueType())) { 12178 return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), 12179 N102.getOperand(0), N102.getOperand(1), 12180 N0, Flags); 12181 } 12182 } 12183 } 12184 } 12185 12186 return SDValue(); 12187 } 12188 12189 /// Try to perform FMA combining on a given FSUB node. 12190 SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { 12191 SDValue N0 = N->getOperand(0); 12192 SDValue N1 = N->getOperand(1); 12193 EVT VT = N->getValueType(0); 12194 SDLoc SL(N); 12195 12196 const TargetOptions &Options = DAG.getTarget().Options; 12197 // Floating-point multiply-add with intermediate rounding. 12198 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); 12199 12200 // Floating-point multiply-add without intermediate rounding. 12201 bool HasFMA = 12202 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && 12203 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); 12204 12205 // No valid opcode, do not combine. 12206 if (!HasFMAD && !HasFMA) 12207 return SDValue(); 12208 12209 const SDNodeFlags Flags = N->getFlags(); 12210 bool CanFuse = Options.UnsafeFPMath || isContractable(N); 12211 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || 12212 CanFuse || HasFMAD); 12213 12214 // If the subtraction is not contractable, do not combine. 12215 if (!AllowFusionGlobally && !isContractable(N)) 12216 return SDValue(); 12217 12218 if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) 12219 return SDValue(); 12220 12221 // Always prefer FMAD to FMA for precision. 12222 unsigned PreferredFusedOpcode = HasFMAD ? 
ISD::FMAD : ISD::FMA; 12223 bool Aggressive = TLI.enableAggressiveFMAFusion(VT); 12224 bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); 12225 12226 // Is the node an FMUL and contractable either due to global flags or 12227 // SDNodeFlags. 12228 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { 12229 if (N.getOpcode() != ISD::FMUL) 12230 return false; 12231 return AllowFusionGlobally || isContractable(N.getNode()); 12232 }; 12233 12234 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) 12235 auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { 12236 if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { 12237 return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), 12238 XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z), 12239 Flags); 12240 } 12241 return SDValue(); 12242 }; 12243 12244 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) 12245 // Note: Commutes FSUB operands. 12246 auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { 12247 if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { 12248 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12249 DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), 12250 YZ.getOperand(1), X, Flags); 12251 } 12252 return SDValue(); 12253 }; 12254 12255 // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)), 12256 // prefer to fold the multiply with fewer uses. 
12257 if (isContractableFMUL(N0) && isContractableFMUL(N1) && 12258 (N0.getNode()->use_size() > N1.getNode()->use_size())) { 12259 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b)) 12260 if (SDValue V = tryToFoldXSubYZ(N0, N1)) 12261 return V; 12262 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d))) 12263 if (SDValue V = tryToFoldXYSubZ(N0, N1)) 12264 return V; 12265 } else { 12266 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) 12267 if (SDValue V = tryToFoldXYSubZ(N0, N1)) 12268 return V; 12269 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) 12270 if (SDValue V = tryToFoldXSubYZ(N0, N1)) 12271 return V; 12272 } 12273 12274 // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) 12275 if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && 12276 (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { 12277 SDValue N00 = N0.getOperand(0).getOperand(0); 12278 SDValue N01 = N0.getOperand(0).getOperand(1); 12279 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12280 DAG.getNode(ISD::FNEG, SL, VT, N00), N01, 12281 DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); 12282 } 12283 12284 // Look through FP_EXTEND nodes to do more combining. 12285 12286 // fold (fsub (fpext (fmul x, y)), z) 12287 // -> (fma (fpext x), (fpext y), (fneg z)) 12288 if (N0.getOpcode() == ISD::FP_EXTEND) { 12289 SDValue N00 = N0.getOperand(0); 12290 if (isContractableFMUL(N00) && 12291 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12292 N00.getValueType())) { 12293 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12294 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12295 N00.getOperand(0)), 12296 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12297 N00.getOperand(1)), 12298 DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); 12299 } 12300 } 12301 12302 // fold (fsub x, (fpext (fmul y, z))) 12303 // -> (fma (fneg (fpext y)), (fpext z), x) 12304 // Note: Commutes FSUB operands. 
12305 if (N1.getOpcode() == ISD::FP_EXTEND) { 12306 SDValue N10 = N1.getOperand(0); 12307 if (isContractableFMUL(N10) && 12308 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12309 N10.getValueType())) { 12310 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12311 DAG.getNode(ISD::FNEG, SL, VT, 12312 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12313 N10.getOperand(0))), 12314 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12315 N10.getOperand(1)), 12316 N0, Flags); 12317 } 12318 } 12319 12320 // fold (fsub (fpext (fneg (fmul, x, y))), z) 12321 // -> (fneg (fma (fpext x), (fpext y), z)) 12322 // Note: This could be removed with appropriate canonicalization of the 12323 // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the 12324 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent 12325 // from implementing the canonicalization in visitFSUB. 12326 if (N0.getOpcode() == ISD::FP_EXTEND) { 12327 SDValue N00 = N0.getOperand(0); 12328 if (N00.getOpcode() == ISD::FNEG) { 12329 SDValue N000 = N00.getOperand(0); 12330 if (isContractableFMUL(N000) && 12331 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12332 N00.getValueType())) { 12333 return DAG.getNode(ISD::FNEG, SL, VT, 12334 DAG.getNode(PreferredFusedOpcode, SL, VT, 12335 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12336 N000.getOperand(0)), 12337 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12338 N000.getOperand(1)), 12339 N1, Flags)); 12340 } 12341 } 12342 } 12343 12344 // fold (fsub (fneg (fpext (fmul, x, y))), z) 12345 // -> (fneg (fma (fpext x)), (fpext y), z) 12346 // Note: This could be removed with appropriate canonicalization of the 12347 // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the 12348 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent 12349 // from implementing the canonicalization in visitFSUB. 
12350 if (N0.getOpcode() == ISD::FNEG) { 12351 SDValue N00 = N0.getOperand(0); 12352 if (N00.getOpcode() == ISD::FP_EXTEND) { 12353 SDValue N000 = N00.getOperand(0); 12354 if (isContractableFMUL(N000) && 12355 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12356 N000.getValueType())) { 12357 return DAG.getNode(ISD::FNEG, SL, VT, 12358 DAG.getNode(PreferredFusedOpcode, SL, VT, 12359 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12360 N000.getOperand(0)), 12361 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12362 N000.getOperand(1)), 12363 N1, Flags)); 12364 } 12365 } 12366 } 12367 12368 // More folding opportunities when target permits. 12369 if (Aggressive) { 12370 // fold (fsub (fma x, y, (fmul u, v)), z) 12371 // -> (fma x, y (fma u, v, (fneg z))) 12372 if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && 12373 isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && 12374 N0.getOperand(2)->hasOneUse()) { 12375 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12376 N0.getOperand(0), N0.getOperand(1), 12377 DAG.getNode(PreferredFusedOpcode, SL, VT, 12378 N0.getOperand(2).getOperand(0), 12379 N0.getOperand(2).getOperand(1), 12380 DAG.getNode(ISD::FNEG, SL, VT, 12381 N1), Flags), Flags); 12382 } 12383 12384 // fold (fsub x, (fma y, z, (fmul u, v))) 12385 // -> (fma (fneg y), z, (fma (fneg u), v, x)) 12386 if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && 12387 isContractableFMUL(N1.getOperand(2)) && 12388 N1->hasOneUse() && NoSignedZero) { 12389 SDValue N20 = N1.getOperand(2).getOperand(0); 12390 SDValue N21 = N1.getOperand(2).getOperand(1); 12391 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12392 DAG.getNode(ISD::FNEG, SL, VT, 12393 N1.getOperand(0)), 12394 N1.getOperand(1), 12395 DAG.getNode(PreferredFusedOpcode, SL, VT, 12396 DAG.getNode(ISD::FNEG, SL, VT, N20), 12397 N21, N0, Flags), Flags); 12398 } 12399 12400 12401 // fold (fsub (fma x, y, (fpext (fmul u, v))), z) 12402 // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) 12403 if (N0.getOpcode() == 
PreferredFusedOpcode && 12404 N0->hasOneUse()) { 12405 SDValue N02 = N0.getOperand(2); 12406 if (N02.getOpcode() == ISD::FP_EXTEND) { 12407 SDValue N020 = N02.getOperand(0); 12408 if (isContractableFMUL(N020) && 12409 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12410 N020.getValueType())) { 12411 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12412 N0.getOperand(0), N0.getOperand(1), 12413 DAG.getNode(PreferredFusedOpcode, SL, VT, 12414 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12415 N020.getOperand(0)), 12416 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12417 N020.getOperand(1)), 12418 DAG.getNode(ISD::FNEG, SL, VT, 12419 N1), Flags), Flags); 12420 } 12421 } 12422 } 12423 12424 // fold (fsub (fpext (fma x, y, (fmul u, v))), z) 12425 // -> (fma (fpext x), (fpext y), 12426 // (fma (fpext u), (fpext v), (fneg z))) 12427 // FIXME: This turns two single-precision and one double-precision 12428 // operation into two double-precision operations, which might not be 12429 // interesting for all targets, especially GPUs. 
12430 if (N0.getOpcode() == ISD::FP_EXTEND) { 12431 SDValue N00 = N0.getOperand(0); 12432 if (N00.getOpcode() == PreferredFusedOpcode) { 12433 SDValue N002 = N00.getOperand(2); 12434 if (isContractableFMUL(N002) && 12435 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12436 N00.getValueType())) { 12437 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12438 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12439 N00.getOperand(0)), 12440 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12441 N00.getOperand(1)), 12442 DAG.getNode(PreferredFusedOpcode, SL, VT, 12443 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12444 N002.getOperand(0)), 12445 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12446 N002.getOperand(1)), 12447 DAG.getNode(ISD::FNEG, SL, VT, 12448 N1), Flags), Flags); 12449 } 12450 } 12451 } 12452 12453 // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) 12454 // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) 12455 if (N1.getOpcode() == PreferredFusedOpcode && 12456 N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && 12457 N1->hasOneUse()) { 12458 SDValue N120 = N1.getOperand(2).getOperand(0); 12459 if (isContractableFMUL(N120) && 12460 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12461 N120.getValueType())) { 12462 SDValue N1200 = N120.getOperand(0); 12463 SDValue N1201 = N120.getOperand(1); 12464 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12465 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), 12466 N1.getOperand(1), 12467 DAG.getNode(PreferredFusedOpcode, SL, VT, 12468 DAG.getNode(ISD::FNEG, SL, VT, 12469 DAG.getNode(ISD::FP_EXTEND, SL, 12470 VT, N1200)), 12471 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12472 N1201), 12473 N0, Flags), Flags); 12474 } 12475 } 12476 12477 // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) 12478 // -> (fma (fneg (fpext y)), (fpext z), 12479 // (fma (fneg (fpext u)), (fpext v), x)) 12480 // FIXME: This turns two single-precision and one double-precision 12481 // operation into two double-precision operations, which might not be 12482 // interesting for all 
targets, especially GPUs. 12483 if (N1.getOpcode() == ISD::FP_EXTEND && 12484 N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { 12485 SDValue CvtSrc = N1.getOperand(0); 12486 SDValue N100 = CvtSrc.getOperand(0); 12487 SDValue N101 = CvtSrc.getOperand(1); 12488 SDValue N102 = CvtSrc.getOperand(2); 12489 if (isContractableFMUL(N102) && 12490 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, 12491 CvtSrc.getValueType())) { 12492 SDValue N1020 = N102.getOperand(0); 12493 SDValue N1021 = N102.getOperand(1); 12494 return DAG.getNode(PreferredFusedOpcode, SL, VT, 12495 DAG.getNode(ISD::FNEG, SL, VT, 12496 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12497 N100)), 12498 DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), 12499 DAG.getNode(PreferredFusedOpcode, SL, VT, 12500 DAG.getNode(ISD::FNEG, SL, VT, 12501 DAG.getNode(ISD::FP_EXTEND, SL, 12502 VT, N1020)), 12503 DAG.getNode(ISD::FP_EXTEND, SL, VT, 12504 N1021), 12505 N0, Flags), Flags); 12506 } 12507 } 12508 } 12509 12510 return SDValue(); 12511 } 12512 12513 /// Try to perform FMA combining on a given FMUL node based on the distributive 12514 /// law x * (y + 1) = x * y + x and variants thereof (commuted versions, 12515 /// subtraction instead of addition). 12516 SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { 12517 SDValue N0 = N->getOperand(0); 12518 SDValue N1 = N->getOperand(1); 12519 EVT VT = N->getValueType(0); 12520 SDLoc SL(N); 12521 const SDNodeFlags Flags = N->getFlags(); 12522 12523 assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); 12524 12525 const TargetOptions &Options = DAG.getTarget().Options; 12526 12527 // The transforms below are incorrect when x == 0 and y == inf, because the 12528 // intermediate multiplication produces a nan. 12529 if (!Options.NoInfsFPMath) 12530 return SDValue(); 12531 12532 // Floating-point multiply-add without intermediate rounding. 
/// Try to perform FMA combining on a given FMUL node based on the distributive
/// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
/// subtraction instead of addition).
///
/// Returns the fused node on success, or an empty SDValue() if no fold
/// applies. Requires no-infs FP math: with x == 0 and y == inf the
/// intermediate multiply would produce a NaN that the fused form avoids,
/// changing results.
SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  const SDNodeFlags Flags = N->getFlags();

  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");

  const TargetOptions &Options = DAG.getTarget().Options;

  // The transforms below are incorrect when x == 0 and y == inf, because the
  // intermediate multiplication produces a nan.
  if (!Options.NoInfsFPMath)
    return SDValue();

  // Floating-point multiply-add without intermediate rounding.
  bool HasFMA =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
      TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

  // Floating-point multiply-add with intermediate rounding. This can result
  // in a less precise result due to the changed rounding order.
  bool HasFMAD = Options.UnsafeFPMath &&
                 (LegalOperations && TLI.isFMADLegal(DAG, N));

  // No valid opcode, do not combine.
  if (!HasFMAD && !HasFMA)
    return SDValue();

  // Always prefer FMAD to FMA for precision.
  unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
  bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

  // Fold an FADD of X against a +/-1.0 constant into a fused multiply-add.
  // Without 'Aggressive' fusion, only fire when the add has a single use so
  // we do not duplicate work.
  // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
  // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
  auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
    if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
      if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
        if (C->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             Y, Flags);
        if (C->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
      }
    }
    return SDValue();
  };

  // FMUL is commutative: try both operand orders.
  if (SDValue FMA = FuseFADD(N0, N1, Flags))
    return FMA;
  if (SDValue FMA = FuseFADD(N1, N0, Flags))
    return FMA;

  // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
  // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
  // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
  // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
  auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
    if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
      // Constant on the LHS of the FSUB: the variable operand is negated.
      if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
        if (C0->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT,
                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
                             Y, Flags);
        if (C0->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT,
                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
      }
      // Constant on the RHS of the FSUB.
      if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
        if (C1->isExactlyValue(+1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
        if (C1->isExactlyValue(-1.0))
          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
                             Y, Flags);
      }
    }
    return SDValue();
  };

  if (SDValue FMA = FuseFSUB(N0, N1, Flags))
    return FMA;
  if (SDValue FMA = FuseFSUB(N1, N0, Flags))
    return FMA;

  return SDValue();
}
/// Combine an ISD::FADD node. Folds are attempted in order from cheapest /
/// always-legal to those gated on fast-math flags; the first applicable fold
/// wins. Returns the replacement value, or SDValue() if nothing applied.
SDValue DAGCombiner::visitFADD(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold (fadd c1, c2) -> c1 + c2
  // (getNode constant-folds when both operands are constants.)
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);

  // canonicalize constant to RHS
  if (N0CFP && !N1CFP)
    return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);

  // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
  // Adding -0.0 is always an identity; adding +0.0 is only an identity when
  // signed zeros do not matter (x = -0.0 would give +0.0 otherwise).
  ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
  if (N1C && N1C->isZero())
    if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
      return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (fadd A, (fneg B)) -> (fsub A, B)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
    if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
            N1, DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags);

  // fold (fadd (fneg A), B) -> (fsub B, A)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
    if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
            N0, DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags);

  // Matches a single-use (fmul B, -2.0) with a (splat) constant RHS.
  auto isFMulNegTwo = [](SDValue FMul) {
    if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
      return false;
    auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
    return C && C->isExactlyValue(-2.0);
  };

  // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N0)) {
    SDValue B = N0.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
    return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
  }
  // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N1)) {
    SDValue B = N1.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
    return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
  }

  // No FP constant should be created after legalization as Instruction
  // Selection pass has a hard time dealing with FP constants.
  bool AllowNewConst = (Level < AfterLegalizeDAG);

  // If nnan is enabled, fold lots of things.
  if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
    // If allowed, fold (fadd (fneg x), x) -> 0.0
    if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
      return DAG.getConstantFP(0.0, DL, VT);

    // If allowed, fold (fadd x, (fneg x)) -> 0.0
    if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
      return DAG.getConstantFP(0.0, DL, VT);
  }

  // If 'unsafe math' or reassoc and nsz, fold lots of things.
  // TODO: break out portions of the transformations below for which Unsafe is
  //       considered and which do not require both nsz and reassoc
  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      AllowNewConst) {
    // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
    if (N1CFP && N0.getOpcode() == ISD::FADD &&
        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags);
      return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
    }

    // We can fold chains of FADD's of the same value into multiplications.
    // This transform is not safe in general because we are reducing the number
    // of rounding steps.
    if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
      if (N0.getOpcode() == ISD::FMUL) {
        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

        // (fadd (fmul x, c), x) -> (fmul x, c+1)
        if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
        }

        // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
        if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
            N1.getOperand(0) == N1.getOperand(1) &&
            N0.getOperand(0) == N1.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
        }
      }

      if (N1.getOpcode() == ISD::FMUL) {
        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

        // (fadd x, (fmul x, c)) -> (fmul x, c+1)
        if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
        }

        // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
        if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
            N0.getOperand(0) == N0.getOperand(1) &&
            N1.getOperand(0) == N0.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT), Flags);
          return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
        }
      }

      if (N0.getOpcode() == ISD::FADD) {
        bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        // (fadd (fadd x, x), x) -> (fmul x, 3.0)
        if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
            (N0.getOperand(0) == N1)) {
          return DAG.getNode(ISD::FMUL, DL, VT,
                             N1, DAG.getConstantFP(3.0, DL, VT), Flags);
        }
      }

      if (N1.getOpcode() == ISD::FADD) {
        bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
        if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
            N1.getOperand(0) == N0) {
          return DAG.getNode(ISD::FMUL, DL, VT,
                             N0, DAG.getConstantFP(3.0, DL, VT), Flags);
        }
      }

      // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
      if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
          N0.getOperand(0) == N0.getOperand(1) &&
          N1.getOperand(0) == N1.getOperand(1) &&
          N0.getOperand(0) == N1.getOperand(0)) {
        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
                           DAG.getConstantFP(4.0, DL, VT), Flags);
      }
    }
  } // enable-unsafe-fp-math

  // FADD -> FMA combines:
  if (SDValue Fused = visitFADDForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }
  return SDValue();
}
/// Combine an ISD::FSUB node. Returns the replacement value, or SDValue()
/// if no fold applied. Folds that could change NaN sign bits or signed-zero
/// behavior are gated on the corresponding fast-math flags/options.
SDValue DAGCombiner::visitFSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold (fsub c1, c2) -> c1-c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // (fsub A, 0) -> A
  // Subtracting +0.0 is always an identity; subtracting -0.0 only when
  // signed zeros do not matter (A = -0.0 would give +0.0 otherwise).
  if (N1CFP && N1CFP->isZero()) {
    if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
        Flags.hasNoSignedZeros()) {
      return N0;
    }
  }

  if (N0 == N1) {
    // (fsub x, x) -> 0.0
    // Only valid with nnan: inf - inf and NaN - NaN are NaN, not 0.
    if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
      return DAG.getConstantFP(0.0f, DL, VT);
  }

  // (fsub -0.0, N1) -> -N1
  // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
  // FSUB does not specify the sign bit of a NaN. Also note that for
  // the same reason, the inverse transform is not safe, unless fast math
  // flags are in play.
  if (N0CFP && N0CFP->isZero()) {
    if (N0CFP->isNegative() ||
        (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
      // Prefer folding the negation into N1's expression; fall back to an
      // explicit FNEG node when that is legal (or we are pre-legalization).
      if (SDValue NegN1 =
              TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
        return NegN1;
      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
        return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
    }
  }

  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      N1.getOpcode() == ISD::FADD) {
    // X - (X + Y) -> -Y
    if (N0 == N1->getOperand(0))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags);
    // X - (Y + X) -> -Y
    if (N0 == N1->getOperand(1))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0), Flags);
  }

  // fold (fsub A, (fneg B)) -> (fadd A, B)
  if (SDValue NegN1 =
          TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
    return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags);

  // FSUB -> FMA combines:
  if (SDValue Fused = visitFSUBForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}
/// Combine an ISD::FMUL node. Returns the replacement value, or SDValue()
/// if no fold applied. Reassociation folds are gated on unsafe-math /
/// reassoc flags; sign-sensitive folds additionally require nsz/nnan.
SDValue DAGCombiner::visitFMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold vector ops
  if (VT.isVector()) {
    // This just handles C1 * C2 for vectors. Other vector folds are below.
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;
  }

  // fold (fmul c1, c2) -> c1*c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);

  // canonicalize constant to RHS
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // Requires nnan (NaN * 0 is NaN) and nsz (-x * 0 is -0.0).
  if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) ||
      (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
    // fold (fmul A, 0) -> 0
    if (N1CFP && N1CFP->isZero())
      return N1;
  }

  if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
    // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
    if (isConstantFPBuildVectorOrConstantFP(N1) &&
        N0.getOpcode() == ISD::FMUL) {
      SDValue N00 = N0.getOperand(0);
      SDValue N01 = N0.getOperand(1);
      // Avoid an infinite loop by making sure that N00 is not a constant
      // (the inner multiply has not been constant folded yet).
      if (isConstantFPBuildVectorOrConstantFP(N01) &&
          !isConstantFPBuildVectorOrConstantFP(N00)) {
        SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
        return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
      }
    }

    // Match a special-case: we convert X * 2.0 into fadd.
    // fmul (fadd X, X), C -> fmul X, 2.0 * C
    if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
        N0.getOperand(0) == N0.getOperand(1)) {
      const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
      SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
      return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
    }
  }

  // fold (fmul X, 2.0) -> (fadd X, X)
  if (N1CFP && N1CFP->isExactlyValue(+2.0))
    return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);

  // fold (fmul X, -1.0) -> (fneg X)
  if (N1CFP && N1CFP->isExactlyValue(-1.0))
    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
      return DAG.getNode(ISD::FNEG, DL, VT, N0);

  // -N0 * -N1 --> N0 * N1
  // Only profitable when at least one negation is strictly cheaper to strip.
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);

  // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
  // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
  if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
      (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
      TLI.isOperationLegal(ISD::FABS, VT)) {
    SDValue Select = N0, X = N1;
    if (Select.getOpcode() != ISD::SELECT)
      std::swap(Select, X);

    SDValue Cond = Select.getOperand(0);
    auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
    auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));

    if (TrueOpnd && FalseOpnd &&
        Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
        isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
        cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
      switch (CC) {
      default: break;
      // For "less than zero" conditions, swapping the select arms reduces
      // the match to the "greater than zero" cases handled below.
      case ISD::SETOLT:
      case ISD::SETULT:
      case ISD::SETOLE:
      case ISD::SETULE:
      case ISD::SETLT:
      case ISD::SETLE:
        std::swap(TrueOpnd, FalseOpnd);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETUGT:
      case ISD::SETOGE:
      case ISD::SETUGE:
      case ISD::SETGT:
      case ISD::SETGE:
        if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
            TLI.isOperationLegal(ISD::FNEG, VT))
          return DAG.getNode(ISD::FNEG, DL, VT,
                             DAG.getNode(ISD::FABS, DL, VT, X));
        if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
          return DAG.getNode(ISD::FABS, DL, VT, X);

        break;
      }
    }
  }

  // FMUL -> FMA combines:
  if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}
/// Combine an ISD::FMA node. Returns the replacement value, or SDValue()
/// if no fold applied. Folds that re-associate or drop the fused rounding
/// are gated on UnsafeFPMath or the node being contractable.
SDValue DAGCombiner::visitFMA(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;

  // FMA nodes have flags that propagate to the created nodes.
  const SDNodeFlags Flags = N->getFlags();
  bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);

  // Constant fold FMA.
  // (getNode performs the actual fold when all three operands are constant.)
  if (isa<ConstantFPSDNode>(N0) &&
      isa<ConstantFPSDNode>(N1) &&
      isa<ConstantFPSDNode>(N2)) {
    return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
  }

  // (-N0 * -N1) + N2 --> (N0 * N1) + N2
  // Only profitable when at least one negation is strictly cheaper to strip.
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);

  // (fma 0, y, z) / (fma x, 0, z) -> z; unsafe because 0 * inf would be NaN.
  if (UnsafeFPMath) {
    if (N0CFP && N0CFP->isZero())
      return N2;
    if (N1CFP && N1CFP->isZero())
      return N2;
  }
  // TODO: The FMA node should have flags that propagate to these nodes.
  if (N0CFP && N0CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
  if (N1CFP && N1CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);

  // Canonicalize (fma c, x, y) -> (fma x, c, y)
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);

  if (UnsafeFPMath) {
    // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
    if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
        isConstantFPBuildVectorOrConstantFP(N1) &&
        isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
                                     Flags), Flags);
    }

    // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
    if (N0.getOpcode() == ISD::FMUL &&
        isConstantFPBuildVectorOrConstantFP(N1) &&
        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      return DAG.getNode(ISD::FMA, DL, VT,
                         N0.getOperand(0),
                         DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
                                     Flags),
                         N2);
    }
  }

  // (fma x, 1, y) -> (fadd x, y)
  // (fma x, -1, y) -> (fadd (fneg x), y)
  if (N1CFP) {
    if (N1CFP->isExactlyValue(1.0))
      // TODO: The FMA node should have flags that propagate to this node.
      return DAG.getNode(ISD::FADD, DL, VT, N0, N2);

    if (N1CFP->isExactlyValue(-1.0) &&
        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
      SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
      AddToWorklist(RHSNeg.getNode());
      // TODO: The FMA node should have flags that propagate to this node.
      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
    }

    // fma (fneg x), K, y -> fma x, -K, y
    // Only when -K is as cheap as K: either FP constants are legal, or K's
    // only use is here and K was not a legal immediate anyway.
    if (N0.getOpcode() == ISD::FNEG &&
        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
                                              ForCodeSize)))) {
      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
    }
  }

  if (UnsafeFPMath) {
    // (fma x, c, x) -> (fmul x, (c+1))
    if (N1CFP && N0 == N2) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1,
                                     DAG.getConstantFP(1.0, DL, VT), Flags),
                         Flags);
    }

    // (fma x, c, (fneg x)) -> (fmul x, (c-1))
    if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1,
                                     DAG.getConstantFP(-1.0, DL, VT), Flags),
                         Flags);
    }
  }

  // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
  // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
  if (!TLI.isFNegFree(VT))
    if (SDValue Neg = TLI.getCheaperNegatedExpression(
            SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags);
  return SDValue();
}
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
// Notice that this is not always beneficial. One reason is different targets
// may have different costs for FDIV and FMUL, so sometimes the cost of two
// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
//
// On success this replaces N (and its sibling FDIV users of the divisor) via
// CombineTo and returns SDValue(N, 0) to signal the replacement; otherwise
// it returns an empty SDValue().
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
  // TODO: Limit this transform based on optsize/minsize - it always creates at
  //       least 1 extra instruction. But the perf win may be substantial enough
  //       that only minsize should restrict this.
  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
  const SDNodeFlags Flags = N->getFlags();
  if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
    return SDValue();

  // Skip if current node is a reciprocal/fneg-reciprocal.
  // (Transforming those would recreate the very reciprocal we build below.)
  SDValue N0 = N->getOperand(0);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
  if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
    return SDValue();

  // Exit early if the target does not want this transform or if there can't
  // possibly be enough uses of the divisor to make the transform worthwhile.
  SDValue N1 = N->getOperand(1);
  unsigned MinUses = TLI.combineRepeatedFPDivisors();

  // For splat vectors, scale the number of uses by the splat factor. If we can
  // convert the division into a scalar op, that will likely be much faster.
  unsigned NumElts = 1;
  EVT VT = N->getValueType(0);
  if (VT.isVector() && DAG.isSplatValue(N1))
    NumElts = VT.getVectorNumElements();

  if (!MinUses || (N1->use_size() * NumElts) < MinUses)
    return SDValue();

  // Find all FDIV users of the same divisor.
  // Use a set because duplicates may be present in the user list.
  SetVector<SDNode *> Users;
  for (auto *U : N1->uses()) {
    if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
      // This division is eligible for optimization only if global unsafe math
      // is enabled or if this division allows reciprocal formation.
      if (UnsafeMath || U->getFlags().hasAllowReciprocal())
        Users.insert(U);
    }
  }

  // Now that we have the actual number of divisor uses, make sure it meets
  // the minimum threshold specified by the target.
  if ((Users.size() * NumElts) < MinUses)
    return SDValue();

  SDLoc DL(N);
  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
  SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);

  // Dividend / Divisor -> Dividend * Reciprocal
  for (auto *U : Users) {
    SDValue Dividend = U->getOperand(0);
    if (Dividend != FPOne) {
      SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
                                    Reciprocal, Flags);
      CombineTo(U, NewNode);
    } else if (U != Reciprocal.getNode()) {
      // In the absence of fast-math-flags, this user node is always the
      // same node as Reciprocal, but with FMF they may be different nodes.
      CombineTo(U, Reciprocal);
    }
  }
  return SDValue(N, 0); // N was replaced.
}
/// Combine an ISD::FDIV node. Returns the replacement value, or SDValue()
/// if no fold applied. Reciprocal and rsqrt-estimate folds are gated on
/// UnsafeFPMath / arcp (and ninf for the division estimate).
SDValue DAGCombiner::visitFDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  SDNodeFlags Flags = N->getFlags();

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

  // fold (fdiv c1, c2) -> c1/c2
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (SDValue V = combineRepeatedFPDivisors(N))
    return V;

  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
    // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
    if (N1CFP) {
      // Compute the reciprocal 1.0 / c2.
      const APFloat &N1APF = N1CFP->getValueAPF();
      APFloat Recip(N1APF.getSemantics(), 1); // 1.0
      APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
      // Only do the transform if the reciprocal is a legal fp immediate that
      // isn't too nasty (eg NaN, denormal, ...).
      if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
          (!LegalOperations ||
           // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
           // backend)... we should handle this gracefully after Legalize.
           // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
           TLI.isOperationLegal(ISD::ConstantFP, VT) ||
           TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
        return DAG.getNode(ISD::FMUL, DL, VT, N0,
                           DAG.getConstantFP(Recip, DL, VT), Flags);
    }

    // If this FDIV is part of a reciprocal square root, it may be folded
    // into a target-specific square root estimate instruction.
    if (N1.getOpcode() == ISD::FSQRT) {
      if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
    } else if (N1.getOpcode() == ISD::FP_EXTEND &&
               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
      // X / fpext(sqrt(Y)): build the estimate in the narrow type, then
      // extend the estimate rather than the sqrt.
      if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
                                          Flags)) {
        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
        AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
      }
    } else if (N1.getOpcode() == ISD::FP_ROUND &&
               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
      // X / fpround(sqrt(Y)): same idea, rounding the estimate instead.
      if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
                                          Flags)) {
        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
        AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
      }
    } else if (N1.getOpcode() == ISD::FMUL) {
      // Look through an FMUL. Even though this won't remove the FDIV directly,
      // it's still worthwhile to get rid of the FSQRT if possible.
      SDValue Sqrt, Y;
      if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
        Sqrt = N1.getOperand(0);
        Y = N1.getOperand(1);
      } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
        Sqrt = N1.getOperand(1);
        Y = N1.getOperand(0);
      }
      if (Sqrt.getNode()) {
        // If the other multiply operand is known positive, pull it into the
        // sqrt. That will eliminate the division if we convert to an estimate:
        // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
        // TODO: Also fold the case where A == Z (fabs is missing).
        if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
            N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() &&
            Y.getOpcode() == ISD::FABS && Y.hasOneUse()) {
          SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0),
                                   Y.getOperand(0), Flags);
          SDValue AAZ =
              DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags);
          if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
            return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags);

          // Estimate creation failed. Clean up speculatively created nodes.
          recursivelyDeleteUnusedNodes(AAZ.getNode());
        }

        // We found a FSQRT, so try to make this fold:
        // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
        if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
          SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags);
          AddToWorklist(Div.getNode());
          return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags);
        }
      }
    }

    // Fold into a reciprocal estimate and multiply instead of a real divide.
    if (Options.NoInfsFPMath || Flags.hasNoInfs())
      if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
        return RV;
  }

  // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
  // Only profitable when at least one negation is strictly cheaper to strip.
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);

  return SDValue();
}
  // Estimates are only valid when infinities can be ignored (see the
  // NoInfs requirement on the guard below).
  if (Options.NoInfsFPMath || Flags.hasNoInfs())
    if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
      return RV;
  }

  // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
  // Only profitable when at least one of the negations folds away for free.
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);

  return SDValue();
}

/// Combine an FREM node: generic FP-binop simplification, constant folding
/// (delegated to getNode), and folding the binop into a select operand.
SDValue DAGCombiner::visitFREM(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
  EVT VT = N->getValueType(0);
  SDNodeFlags Flags = N->getFlags();

  // Handles undef/identity-style simplifications common to FP binops.
  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (frem c1, c2) -> fmod(c1,c2)
  // Re-creating the node with two constant operands lets getNode fold it.
  if (N0CFP && N1CFP)
    return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags());

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  return SDValue();
}

/// Combine an FSQRT node by replacing it with an estimate sequence when fast
/// math permits.
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
  SDNodeFlags Flags = N->getFlags();
  const TargetOptions &Options = DAG.getTarget().Options;

  // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
  // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
  if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) ||
      (!Options.NoInfsFPMath && !Flags.hasNoInfs()))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  // Don't expand to an estimate if the target's native sqrt is cheap anyway.
  if (TLI.isFsqrtCheap(N0, DAG))
    return SDValue();

  // FSQRT nodes have flags that propagate to the created nodes.
  return buildSqrtEstimate(N0, Flags);
}

/// copysign(x, fp_extend(y)) -> copysign(x, y)
/// copysign(x, fp_round(y)) -> copysign(x, y)
/// Returns true when it is safe to strip the conversion on operand 1.
static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
  SDValue N1 = N->getOperand(1);
  if ((N1.getOpcode() == ISD::FP_EXTEND ||
       N1.getOpcode() == ISD::FP_ROUND)) {
    // Do not optimize out type conversion of f128 type yet.
    // For some targets like x86_64, configuration is changed to keep one f128
    // value in one SSE register, but instruction selection cannot handle
    // FCOPYSIGN on SSE registers yet.
    EVT N1VT = N1->getValueType(0);
    EVT N1Op0VT = N1->getOperand(0).getValueType();
    return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);
  }
  return false;
}

/// Combine an FCOPYSIGN node (first half: constant folding and constant sign
/// operand; the remaining folds follow below).
SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
  EVT VT = N->getValueType(0);

  if (N0CFP && N1CFP) // Constant fold
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);

  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
    const APFloat &V = N1C->getValueAPF();
    // copysign(x, c1) -> fabs(x) iff ispos(c1)
    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
    if (!V.isNegative()) {
      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
        return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
    } else {
      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
        return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
                           DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
    }
  }

  // copysign(fabs(x), y) -> copysign(x, y)
  // copysign(fneg(x), y) -> copysign(x, y)
  // copysign(copysign(x,z), y) -> copysign(x, y)
  // The sign of operand 0 is overwritten anyway, so strip sign-only ops.
  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
      N0.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);

  // copysign(x, abs(y)) -> abs(x)
  if (N1.getOpcode() == ISD::FABS)
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

  // copysign(x, copysign(y,z)) -> copysign(x, z)
  if (N1.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));

  // copysign(x, fp_extend(y)) -> copysign(x, y)
  // copysign(x, fp_round(y)) -> copysign(x, y)
  if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));

  return SDValue();
}

/// Combine an FPOW node with a constant (or splat-constant) exponent into
/// cheaper cbrt/sqrt sequences when fast-math flags allow it.
SDValue DAGCombiner::visitFPOW(SDNode *N) {
  ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
  if (!ExponentC)
    return SDValue();

  // Try to convert x ** (1/3) into cube root.
  // TODO: Handle the various flavors of long double.
  // TODO: Since we're approximating, we don't need an exact 1/3 exponent.
  //       Some range near 1/3 should be fine.
  EVT VT = N->getValueType(0);
  if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
      (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
    // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
    // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
    // pow(-val, 1/3) = nan; cbrt(-val) = -num.
    // For regular numbers, rounding may cause the results to differ.
    // Therefore, we require { nsz ninf nnan afn } for this transform.
    // TODO: We could select out the special cases if we don't have nsz/ninf.
    SDNodeFlags Flags = N->getFlags();
    if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
        !Flags.hasApproximateFuncs())
      return SDValue();

    // Do not create a cbrt() libcall if the target does not have it, and do not
    // turn a pow that has lowering support into a cbrt() libcall.
    if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
        (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
         DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
      return SDValue();

    return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags);
  }

  // Try to convert x ** (1/4) and x ** (3/4) into square roots.
  // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
  // TODO: This could be extended (using a target hook) to handle smaller
  // power-of-2 fractional exponents.
  bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
  bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
  if (ExponentIs025 || ExponentIs075) {
    // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
    // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
    // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
    // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN.
    // For regular numbers, rounding may cause the results to differ.
    // Therefore, we require { nsz ninf afn } for this transform.
    // TODO: We could select out the special cases if we don't have nsz/ninf.
    SDNodeFlags Flags = N->getFlags();

    // We only need no signed zeros for the 0.25 case.
    if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
        !Flags.hasApproximateFuncs())
      return SDValue();

    // Don't double the number of libcalls. We are trying to inline fast code.
    if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
      return SDValue();

    // Assume that libcalls are the smallest code.
    // TODO: This restriction should probably be lifted for vectors.
    if (ForCodeSize)
      return SDValue();

    // pow(X, 0.25) --> sqrt(sqrt(X))
    SDLoc DL(N);
    SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags);
    SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
    if (ExponentIs025)
      return SqrtSqrt;
    // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
    return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags);
  }

  return SDValue();
}

/// Fold [us]itofp (fpto[us]i X) --> ftrunc X when the function attribute
/// permits assuming the intermediate conversion did not overflow.
static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
                               const TargetLowering &TLI) {
  // This optimization is guarded by a function attribute because it may produce
  // unexpected results. Ie, programs may be relying on the platform-specific
  // undefined behavior when the float-to-int conversion overflows.
  const Function &F = DAG.getMachineFunction().getFunction();
  Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow");
  if (StrictOverflow.getValueAsString().equals("false"))
    return SDValue();

  // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
  // replacing casts with a libcall. We also must be allowed to ignore -0.0
  // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
  // conversions would return +0.0.
  // FIXME: We should be able to use node-level FMF here.
  // TODO: If strict math, should we use FABS (+ range check for signed cast)?
  EVT VT = N->getValueType(0);
  if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
      !DAG.getTarget().Options.NoSignedZerosFPMath)
    return SDValue();

  // fptosi/fptoui round towards zero, so converting from FP to integer and
  // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
  SDValue N0 = N->getOperand(0);
  if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
      N0.getOperand(0).getValueType() == VT)
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));

  if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
      N0.getOperand(0).getValueType() == VT)
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));

  return SDValue();
}

/// Combine a SINT_TO_FP node: constant folding, conversion to UINT_TO_FP when
/// the sign bit is known zero, setcc-to-select folds, and the int->fp->int
/// round-trip fold.
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT OpVT = N0.getValueType();

  // [us]itofp(undef) = 0, because the result value is bounded.
  if (N0.isUndef())
    return DAG.getConstantFP(0.0, SDLoc(N), VT);

  // fold (sint_to_fp c1) -> c1fp
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      // ...but only if the target supports immediate floating-point values
      (!LegalOperations ||
       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);

  // If the input is a legal type, and SINT_TO_FP is not legal on this target,
  // but UINT_TO_FP is legal on this target, try to convert.
  if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
      hasOperation(ISD::UINT_TO_FP, OpVT)) {
    // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
  }

  // The next optimizations are desirable only if SELECT_CC can be lowered.
  // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
  if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
      !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  // fold (sint_to_fp (zext (setcc x, y, cc))) ->
  //      (select (setcc x, y, cc), 1.0, 0.0)
  if (N0.getOpcode() == ISD::ZERO_EXTEND &&
      N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0.getOperand(0),
                         DAG.getConstantFP(1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
    return FTrunc;

  return SDValue();
}

/// Combine a UINT_TO_FP node; mirrors visitSINT_TO_FP for the unsigned case.
SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT OpVT = N0.getValueType();

  // [us]itofp(undef) = 0, because the result value is bounded.
  if (N0.isUndef())
    return DAG.getConstantFP(0.0, SDLoc(N), VT);

  // fold (uint_to_fp c1) -> c1fp
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      // ...but only if the target supports immediate floating-point values
      (!LegalOperations ||
       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);

  // If the input is a legal type, and UINT_TO_FP is not legal on this target,
  // but SINT_TO_FP is legal on this target, try to convert.
  if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
      hasOperation(ISD::SINT_TO_FP, OpVT)) {
    // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
  }

  // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
  if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
    return FTrunc;

  return SDValue();
}

// Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x
static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
    return SDValue();

  SDValue Src = N0.getOperand(0);
  EVT SrcVT = Src.getValueType();
  bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
  bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;

  // We can safely assume the conversion won't overflow the output range,
  // because (for example) (uint8_t)18293.f is undefined behavior.

  // Since we can assume the conversion won't overflow, our decision as to
  // whether the input will fit in the float should depend on the minimum
  // of the input range and output range.

  // This means this is also safe for a signed input and unsigned output, since
  // a negative input would lead to undefined behavior.
  // Subtract one bit for the sign when the respective side is signed; this
  // yields the number of magnitude bits that must fit in the FP mantissa.
  unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
  unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
  unsigned ActualSize = std::min(InputSize, OutputSize);
  const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());

  // We can only fold away the float conversion if the input range can be
  // represented exactly in the float range.
  if (APFloat::semanticsPrecision(sem) >= ActualSize) {
    if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
      unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
                                                       : ISD::ZERO_EXTEND;
      return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
    }
    if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
    // Same width: the value passes through unchanged.
    return DAG.getBitcast(VT, Src);
  }
  return SDValue();
}

/// Combine an FP_TO_SINT node: undef/constant folding plus the
/// int->fp->int round-trip elimination.
SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fp_to_sint undef) -> undef
  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // fold (fp_to_sint c1fp) -> c1
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);

  return FoldIntToFPToInt(N, DAG);
}

/// Combine an FP_TO_UINT node; mirrors visitFP_TO_SINT.
SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fp_to_uint undef) -> undef
  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // fold (fp_to_uint c1fp) -> c1
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);

  return FoldIntToFPToInt(N, DAG);
}

/// Combine an FP_ROUND node. Operand 1 is the "is truncating" flag: 1 means
/// the round is known value-preserving.
SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  EVT VT = N->getValueType(0);

  // fold (fp_round c1fp) -> c1fp
  if (N0CFP)
    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);

  // fold (fp_round (fp_extend x)) -> x
  if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
    return N0.getOperand(0);

  // fold (fp_round (fp_round x)) -> (fp_round x)
  if (N0.getOpcode() == ISD::FP_ROUND) {
    const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
    const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;

    // Skip this folding if it results in an fp_round from f80 to f16.
    //
    // f80 to f16 always generates an expensive (and as yet, unimplemented)
    // libcall to __truncxfhf2 instead of selecting native f16 conversion
    // instructions from f32 or f64. Moreover, the first (value-preserving)
    // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
    // x86.
    if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
      return SDValue();

    // If the first fp_round isn't a value preserving truncation, it might
    // introduce a tie in the second fp_round, that wouldn't occur in the
    // single-step fp_round we want to fold to.
    // In other words, double rounding isn't the same as rounding.
    // Also, this is a value preserving truncation iff both fp_round's are.
    if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
      SDLoc DL(N);
      return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
                         DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
    }
  }

  // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                              N0.getOperand(0), N1);
    AddToWorklist(Tmp.getNode());
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                       Tmp, N0.getOperand(1));
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}

/// Combine an FP_EXTEND node (first half; the fp_round/extload folds follow).
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() &&
      N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  // fold (fp_extend c1fp) -> c1fp
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);

  // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
  if (N0.getOpcode() == ISD::FP16_TO_FP &&
      TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));

  // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
  // value of X.
  // The '1' truncating flag guarantees the inner fp_round was value
  // preserving, so the extend can bypass (or shrink) it.
  if (N0.getOpcode() == ISD::FP_ROUND
      && N0.getConstantOperandVal(1) == 1) {
    SDValue In = N0.getOperand(0);
    if (In.getValueType() == VT) return In;
    if (VT.bitsLT(In.getValueType()))
      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
                         In, N0.getOperand(1));
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
  }

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), N0.getValueType(),
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    // Other users of the load see a truncation of the extending load.
    CombineTo(N0.getNode(),
              DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
                          N0.getValueType(), ExtLoad,
                          DAG.getIntPtrConstant(1, SDLoc(N0))),
              ExtLoad.getValue(1));
    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}

/// Combine an FCEIL node: constant folding only.
SDValue DAGCombiner::visitFCEIL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fceil c1) -> fceil(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);

  return SDValue();
}

/// Combine an FTRUNC node: constant folding, and elimination when the input
/// is already a rounded integer value.
SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ftrunc c1) -> ftrunc(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);

  // fold ftrunc (known rounded int x) -> x
  // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
  // likely to be generated to extract integer from a rounded floating value.
  switch (N0.getOpcode()) {
  default: break;
  case ISD::FRINT:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FFLOOR:
  case ISD::FCEIL:
    return N0;
  }

  return SDValue();
}

/// Combine an FFLOOR node: constant folding only.
SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ffloor c1) -> ffloor(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);

  return SDValue();
}

// FIXME: FNEG and FABS have a lot in common; refactor.
SDValue DAGCombiner::visitFNEG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Constant fold FNEG.
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);

  if (SDValue NegN0 =
          TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
    return NegN0;

  // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
  // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
  // know it was called from a context with a nsz flag if the input fsub does
  // not.
  if (N0.getOpcode() == ISD::FSUB &&
      (DAG.getTarget().Options.NoSignedZerosFPMath ||
       N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
                       N0.getOperand(0), N->getFlags());
  }

  // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
  // constant pool values.
  if (!TLI.isFNegFree(VT) &&
      N0.getOpcode() == ISD::BITCAST &&
      N0.getNode()->hasOneUse()) {
    SDValue Int = N0.getOperand(0);
    EVT IntVT = Int.getValueType();
    if (IntVT.isInteger() && !IntVT.isVector()) {
      APInt SignMask;
      if (N0.getValueType().isVector()) {
        // For a vector, get a mask such as 0x80... per scalar element
        // and splat it.
        SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
      } else {
        // For a scalar, just generate 0x80...
        SignMask = APInt::getSignMask(IntVT.getSizeInBits());
      }
      SDLoc DL0(N0);
      Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
                        DAG.getConstant(SignMask, DL0, IntVT));
      AddToWorklist(Int.getNode());
      return DAG.getBitcast(VT, Int);
    }
  }

  // (fneg (fmul c, x)) -> (fmul -c, x)
  if (N0.getOpcode() == ISD::FMUL &&
      (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
    ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
    if (CFP1) {
      APFloat CVal = CFP1->getValueAPF();
      CVal.changeSign();
      // After DAG legalization, only do this when the negated constant is
      // itself legal (either as an FP immediate or as a ConstantFP node).
      if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) ||
                       TLI.isOperationLegal(ISD::ConstantFP, VT)))
        return DAG.getNode(
            ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
            DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
            N0->getFlags());
    }
  }

  return SDValue();
}

/// Shared combine for the FP min/max family; 'Op' performs the constant fold
/// for the specific operation (minnum/maxnum/minimum/maximum).
static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
                            APFloat (*Op)(const APFloat &, const APFloat &)) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
  const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);

  if (N0CFP && N1CFP) {
    const APFloat &C0 = N0CFP->getValueAPF();
    const APFloat &C1 =
        N1CFP->getValueAPF();
    return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
  }

  // Canonicalize to constant on RHS.
  if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);

  return SDValue();
}

SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
  return visitFMinMax(DAG, N, minnum);
}

SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
  return visitFMinMax(DAG, N, maxnum);
}

SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
  return visitFMinMax(DAG, N, minimum);
}

SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
  return visitFMinMax(DAG, N, maximum);
}

// See also visitFNEG: the two share the bitcast-and-mask trick below.
SDValue DAGCombiner::visitFABS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fabs c1) -> fabs(c1)
  if (isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

  // fold (fabs (fabs x)) -> (fabs x)
  if (N0.getOpcode() == ISD::FABS)
    return N->getOperand(0);

  // fold (fabs (fneg x)) -> (fabs x)
  // fold (fabs (fcopysign x, y)) -> (fabs x)
  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));

  // fabs(bitcast(x)) -> bitcast(x & ~sign) to avoid constant pool loads.
  if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
    SDValue Int = N0.getOperand(0);
    EVT IntVT = Int.getValueType();
    if (IntVT.isInteger() && !IntVT.isVector()) {
      APInt SignMask;
      if (N0.getValueType().isVector()) {
        // For a vector, get a mask such as 0x7f... per scalar element
        // and splat it.
        SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
      } else {
        // For a scalar, just generate 0x7f...
        SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
      }
      SDLoc DL(N0);
      Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
                        DAG.getConstant(SignMask, DL, IntVT));
      AddToWorklist(Int.getNode());
      return DAG.getBitcast(N->getValueType(0), Int);
    }
  }

  return SDValue();
}

/// Combine a BRCOND node: fold a setcc condition into BR_CC when legal, or
/// rebuild the condition into a setcc.
SDValue DAGCombiner::visitBRCOND(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);

  // If N is a constant we could fold this into a fallthrough or unconditional
  // branch. However that doesn't happen very often in normal code, because
  // Instcombine/SimplifyCFG should have handled the available opportunities.
  // If we did this folding here, it would be necessary to update the
  // MachineBasicBlock CFG, which is awkward.

  // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
  // on the target.
  if (N1.getOpcode() == ISD::SETCC &&
      TLI.isOperationLegalOrCustom(ISD::BR_CC,
                                   N1.getOperand(0).getValueType())) {
    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                       Chain, N1.getOperand(2),
                       N1.getOperand(0), N1.getOperand(1), N2);
  }

  if (N1.hasOneUse()) {
    // rebuildSetCC calls visitXor which may change the Chain when there is a
    // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
    HandleSDNode ChainHandle(Chain);
    if (SDValue NewN1 = rebuildSetCC(N1))
      return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
                         ChainHandle.getValue(), NewN1, N2);
  }

  return SDValue();
}

/// Rewrite a brcond condition into an explicit setcc (first half: the
/// (srl (and x, 2^k), k) pattern; the xor handling follows below).
SDValue DAGCombiner::rebuildSetCC(SDValue N) {
  if (N.getOpcode() == ISD::SRL ||
      (N.getOpcode() == ISD::TRUNCATE &&
       (N.getOperand(0).hasOneUse() &&
        N.getOperand(0).getOpcode() == ISD::SRL))) {
    // Look pass the truncate.
    if (N.getOpcode() == ISD::TRUNCATE)
      N = N.getOperand(0);

    // Match this pattern so that we can generate simpler code:
    //
    //   %a = ...
    //   %b = and i32 %a, 2
    //   %c = srl i32 %b, 1
    //   brcond i32 %c ...
    //
    // into
    //
    //   %a = ...
    //   %b = and i32 %a, 2
    //   %c = setcc eq %b, 0
    //   brcond %c ...
    //
    // This applies only when the AND constant value has one bit set and the
    // SRL constant is equal to the log2 of the AND constant. The back-end is
    // smart enough to convert the result into a TEST/JMP sequence.
14083 SDValue Op0 = N.getOperand(0); 14084 SDValue Op1 = N.getOperand(1); 14085 14086 if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) { 14087 SDValue AndOp1 = Op0.getOperand(1); 14088 14089 if (AndOp1.getOpcode() == ISD::Constant) { 14090 const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue(); 14091 14092 if (AndConst.isPowerOf2() && 14093 cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) { 14094 SDLoc DL(N); 14095 return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()), 14096 Op0, DAG.getConstant(0, DL, Op0.getValueType()), 14097 ISD::SETNE); 14098 } 14099 } 14100 } 14101 } 14102 14103 // Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne)) 14104 // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq)) 14105 if (N.getOpcode() == ISD::XOR) { 14106 // Because we may call this on a speculatively constructed 14107 // SimplifiedSetCC Node, we need to simplify this node first. 14108 // Ideally this should be folded into SimplifySetCC and not 14109 // here. For now, grab a handle to N so we don't lose it from 14110 // replacements interal to the visit. 14111 HandleSDNode XORHandle(N); 14112 while (N.getOpcode() == ISD::XOR) { 14113 SDValue Tmp = visitXOR(N.getNode()); 14114 // No simplification done. 14115 if (!Tmp.getNode()) 14116 break; 14117 // Returning N is form in-visit replacement that may invalidated 14118 // N. Grab value from Handle. 14119 if (Tmp.getNode() == N.getNode()) 14120 N = XORHandle.getValue(); 14121 else // Node simplified. Try simplifying again. 
14122 N = Tmp; 14123 } 14124 14125 if (N.getOpcode() != ISD::XOR) 14126 return N; 14127 14128 SDValue Op0 = N->getOperand(0); 14129 SDValue Op1 = N->getOperand(1); 14130 14131 if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { 14132 bool Equal = false; 14133 // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq)) 14134 if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR && 14135 Op0.getValueType() == MVT::i1) { 14136 N = Op0; 14137 Op0 = N->getOperand(0); 14138 Op1 = N->getOperand(1); 14139 Equal = true; 14140 } 14141 14142 EVT SetCCVT = N.getValueType(); 14143 if (LegalTypes) 14144 SetCCVT = getSetCCResultType(SetCCVT); 14145 // Replace the uses of XOR with SETCC 14146 return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1, 14147 Equal ? ISD::SETEQ : ISD::SETNE); 14148 } 14149 } 14150 14151 return SDValue(); 14152 } 14153 14154 // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. 14155 // 14156 SDValue DAGCombiner::visitBR_CC(SDNode *N) { 14157 CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1)); 14158 SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); 14159 14160 // If N is a constant we could fold this into a fallthrough or unconditional 14161 // branch. However that doesn't happen very often in normal code, because 14162 // Instcombine/SimplifyCFG should have handled the available opportunities. 14163 // If we did this folding here, it would be necessary to update the 14164 // MachineBasicBlock CFG, which is awkward. 14165 14166 // Use SimplifySetCC to simplify SETCC's. 
14167 SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), 14168 CondLHS, CondRHS, CC->get(), SDLoc(N), 14169 false); 14170 if (Simp.getNode()) AddToWorklist(Simp.getNode()); 14171 14172 // fold to a simpler setcc 14173 if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) 14174 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, 14175 N->getOperand(0), Simp.getOperand(2), 14176 Simp.getOperand(0), Simp.getOperand(1), 14177 N->getOperand(4)); 14178 14179 return SDValue(); 14180 } 14181 14182 /// Return true if 'Use' is a load or a store that uses N as its base pointer 14183 /// and that N may be folded in the load / store addressing mode. 14184 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, 14185 SelectionDAG &DAG, 14186 const TargetLowering &TLI) { 14187 EVT VT; 14188 unsigned AS; 14189 14190 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { 14191 if (LD->isIndexed() || LD->getBasePtr().getNode() != N) 14192 return false; 14193 VT = LD->getMemoryVT(); 14194 AS = LD->getAddressSpace(); 14195 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { 14196 if (ST->isIndexed() || ST->getBasePtr().getNode() != N) 14197 return false; 14198 VT = ST->getMemoryVT(); 14199 AS = ST->getAddressSpace(); 14200 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) { 14201 if (LD->isIndexed() || LD->getBasePtr().getNode() != N) 14202 return false; 14203 VT = LD->getMemoryVT(); 14204 AS = LD->getAddressSpace(); 14205 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) { 14206 if (ST->isIndexed() || ST->getBasePtr().getNode() != N) 14207 return false; 14208 VT = ST->getMemoryVT(); 14209 AS = ST->getAddressSpace(); 14210 } else 14211 return false; 14212 14213 TargetLowering::AddrMode AM; 14214 if (N->getOpcode() == ISD::ADD) { 14215 AM.HasBaseReg = true; 14216 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14217 if (Offset) 14218 // [reg +/- imm] 14219 AM.BaseOffs = Offset->getSExtValue(); 14220 
    else
      // [reg +/- reg]
      AM.Scale = 1;
  } else if (N->getOpcode() == ISD::SUB) {
    AM.HasBaseReg = true;
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (Offset)
      // [reg +/- imm]
      AM.BaseOffs = -Offset->getSExtValue();
    else
      // [reg +/- reg]
      AM.Scale = 1;
  } else
    return false;

  return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
                                   VT.getTypeForEVT(*DAG.getContext()), AS);
}

/// Common setup for pre-/post-indexed combining: if \p N is an unindexed
/// (possibly masked) load or store for which the target supports one of the
/// indexed modes \p Inc or \p Dec, return true and set the outputs:
/// \p IsLoad (false for stores), \p IsMasked (true for masked accesses) and
/// the access's base pointer \p Ptr.
static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
                                     bool &IsLoad, bool &IsMasked, SDValue &Ptr,
                                     const TargetLowering &TLI) {
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    if (LD->isIndexed())
      return false;
    EVT VT = LD->getMemoryVT();
    if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
      return false;
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    if (ST->isIndexed())
      return false;
    EVT VT = ST->getMemoryVT();
    if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
      return false;
    Ptr = ST->getBasePtr();
    IsLoad = false;
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    if (LD->isIndexed())
      return false;
    EVT VT = LD->getMemoryVT();
    if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
        !TLI.isIndexedMaskedLoadLegal(Dec, VT))
      return false;
    Ptr = LD->getBasePtr();
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    if (ST->isIndexed())
      return false;
    EVT VT = ST->getMemoryVT();
    if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
        !TLI.isIndexedMaskedStoreLegal(Dec, VT))
      return false;
    Ptr = ST->getBasePtr();
    IsLoad = false;
    IsMasked = true;
  } else {
    return false;
  }
  return true;
}

/// Try turning a load/store into a pre-indexed load/store when the base
/// pointer is an add or subtract and it has other uses besides the load/store.
/// After the transformation, the new indexed load/store has effectively folded
/// the add/subtract in and all of its other uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  bool IsLoad = true;
  bool IsMasked = false;
  SDValue Ptr;
  if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
                                Ptr, TLI))
    return false;

  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
  // out. There is no reason to make this a preinc/predec.
  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
      Ptr.getNode()->hasOneUse())
    return false;

  // Ask the target to do addressing mode selection.
  SDValue BasePtr;
  SDValue Offset;
  ISD::MemIndexedMode AM = ISD::UNINDEXED;
  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
    return false;

  // Backends without true r+i pre-indexed forms may need to pass a
  // constant base with a variable offset so that constant coercion
  // will work with the patterns in canonical form.
  bool Swapped = false;
  if (isa<ConstantSDNode>(BasePtr)) {
    std::swap(BasePtr, Offset);
    Swapped = true;
  }

  // Don't create an indexed load / store with zero offset.
  if (isNullConstant(Offset))
    return false;

  // Try turning it into a pre-indexed load / store except when:
  // 1) The new base ptr is a frame index.
  // 2) If N is a store and the new base ptr is either the same as or is a
  //    predecessor of the value being stored.
  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
  //    that would create a cycle.
  // 4) All uses are load / store ops that use it as old base ptr.

  // Check #1. Preinc'ing a frame index would require copying the stack pointer
  // (plus the implicit offset) to a register to preinc anyway.
  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
    return false;

  // Check #2.
  if (!IsLoad) {
    SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
                           : cast<StoreSDNode>(N)->getValue();

    // Would require a copy.
    if (Val == BasePtr)
      return false;

    // Would create a cycle.
    if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
      return false;
  }

  // Caches for hasPredecessorHelper.
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(N);

  // If the offset is a constant, there may be other adds of constants that
  // can be folded with this one. We should do this to avoid having to keep
  // a copy of the original base pointer.
  SmallVector<SDNode *, 16> OtherUses;
  if (isa<ConstantSDNode>(Offset))
    for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
                              UE = BasePtr.getNode()->use_end();
         UI != UE; ++UI) {
      SDUse &Use = UI.getUse();
      // Skip the use that is Ptr and uses of other results from BasePtr's
      // node (important for nodes that return multiple results).
      if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
        continue;

      if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
        continue;

      if (Use.getUser()->getOpcode() != ISD::ADD &&
          Use.getUser()->getOpcode() != ISD::SUB) {
        OtherUses.clear();
        break;
      }

      // The other (non-BasePtr) operand of the add/sub user.
      SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
      if (!isa<ConstantSDNode>(Op1)) {
        OtherUses.clear();
        break;
      }

      // FIXME: In some cases, we can be smarter about this.
      if (Op1.getValueType() != Offset.getValueType()) {
        OtherUses.clear();
        break;
      }

      OtherUses.push_back(Use.getUser());
    }

  if (Swapped)
    std::swap(BasePtr, Offset);

  // Now check for #3 and #4.
  bool RealUse = false;

  for (SDNode *Use : Ptr.getNode()->uses()) {
    if (Use == N)
      continue;
    if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
      return false;

    // If Ptr may be folded in addressing mode of other use, then it's
    // not profitable to do this transformation.
    if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
      RealUse = true;
  }

  if (!RealUse)
    return false;

  SDValue Result;
  if (!IsMasked) {
    if (IsLoad)
      Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
    else
      Result =
          DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
  } else {
    if (IsLoad)
      Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
                                        Offset, AM);
    else
      Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
                                         Offset, AM);
  }
  ++PreIndexedNodes;
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
             Result.getNode()->dump(&DAG); dbgs() << '\n');
  WorklistRemover DeadNodes(*this);
  // Rewire N's old results onto the indexed node: for a load, the loaded
  // value is Result 0 and the chain is Result 2; for a store, the chain is
  // Result 1 (Result 0/1 is the updated base pointer, used below).
  if (IsLoad) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
  } else {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
  }

  // Finally, since the node is now dead, remove it from the graph.
  deleteAndRecombine(N);

  if (Swapped)
    std::swap(BasePtr, Offset);

  // Replace other uses of BasePtr that can be updated to use Ptr
  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
    unsigned OffsetIdx = 1;
    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
      OffsetIdx = 0;
    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
           BasePtr.getNode() && "Expected BasePtr operand");

    // We need to replace ptr0 in the following expression:
    //   x0 * offset0 + y0 * ptr0 = t0
    // knowing that
    //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
    //
    // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
    // indexed load/store and the expression that needs to be re-written.
    //
    // Therefore, we have:
    //   t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1

    ConstantSDNode *CN =
        cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
    int X0, X1, Y0, Y1;
    const APInt &Offset0 = CN->getAPIntValue();
    APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();

    X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
    Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
    X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
    Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;

    unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;

    APInt CNV = Offset0;
    if (X0 < 0) CNV = -CNV;
    if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
    else CNV = CNV - Offset1;

    SDLoc DL(OtherUses[i]);

    // We can now generate the new expression.
    SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
    // The updated base pointer produced by the indexed node.
    SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);

    SDValue NewUse = DAG.getNode(Opcode,
                                 DL,
                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
    deleteAndRecombine(OtherUses[i]);
  }

  // Replace the uses of Ptr with uses of the updated base value.
  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
  deleteAndRecombine(Ptr.getNode());
  AddToWorklist(Result.getNode());

  return true;
}

/// Return true if it is profitable and safe to turn the unindexed access
/// \p N into a post-indexed access using the add/sub \p PtrUse of its base
/// pointer \p Ptr. On success the target-selected \p BasePtr, \p Offset and
/// addressing mode \p AM are filled in.
static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
                                   SDValue &BasePtr, SDValue &Offset,
                                   ISD::MemIndexedMode &AM,
                                   SelectionDAG &DAG,
                                   const TargetLowering &TLI) {
  if (PtrUse == N ||
      (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
    return false;

  if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
    return false;

  // Don't create an indexed load / store with zero offset.
  if (isNullConstant(Offset))
    return false;

  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
    return false;

  SmallPtrSet<const SDNode *, 32> Visited;
  for (SDNode *Use : BasePtr.getNode()->uses()) {
    if (Use == Ptr.getNode())
      continue;

    // No if there's a later user which could perform the index instead.
    if (isa<MemSDNode>(Use)) {
      bool IsLoad = true;
      bool IsMasked = false;
      SDValue OtherPtr;
      if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
                                   IsMasked, OtherPtr, TLI)) {
        SmallVector<const SDNode *, 2> Worklist;
        Worklist.push_back(Use);
        if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
          return false;
      }
    }

    // If all the uses are load / store addresses, then don't do the
    // transformation.
    if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
      for (SDNode *UseUse : Use->uses())
        if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
          return false;
    }
  }
  return true;
}

/// Find an add/sub user of \p N's base pointer that can be folded into \p N
/// as a post-indexed addressing update. Returns the chosen add/sub node (or
/// nullptr) and fills in the outputs used to build the indexed access.
static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
                                         bool &IsMasked, SDValue &Ptr,
                                         SDValue &BasePtr, SDValue &Offset,
                                         ISD::MemIndexedMode &AM,
                                         SelectionDAG &DAG,
                                         const TargetLowering &TLI) {
  if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
                                IsMasked, Ptr, TLI) ||
      Ptr.getNode()->hasOneUse())
    return nullptr;

  // Try turning it into a post-indexed load / store except when
  // 1) All uses are load / store ops that use it as base ptr (and
  //    it may be folded as addressing mode).
  // 2) Op must be independent of N, i.e. Op is neither a predecessor
  //    nor a successor of N. Otherwise, if Op is folded that would
  //    create a cycle.
  for (SDNode *Op : Ptr->uses()) {
    // Check for #1.
    if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
      continue;

    // Check for #2.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 8> Worklist;
    // Ptr is predecessor to both N and Op.
    Visited.insert(Ptr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(Op);
    if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
        !SDNode::hasPredecessorHelper(Op, Visited, Worklist))
      return Op;
  }
  return nullptr;
}

/// Try to combine a load/store with an add/sub of the base pointer node into a
/// post-indexed load/store. The transformation folded the add/subtract into the
/// new indexed load/store effectively and all of its uses are redirected to the
/// new load/store.
bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  bool IsLoad = true;
  bool IsMasked = false;
  SDValue Ptr;
  SDValue BasePtr;
  SDValue Offset;
  ISD::MemIndexedMode AM = ISD::UNINDEXED;
  SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
                                         Offset, AM, DAG, TLI);
  if (!Op)
    return false;

  SDValue Result;
  if (!IsMasked)
    Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
                                         Offset, AM)
                    : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
                                          BasePtr, Offset, AM);
  else
    Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
                                               BasePtr, Offset, AM)
                    : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
                                                BasePtr, Offset, AM);
  ++PostIndexedNodes;
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
             dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
             dbgs() << '\n');
  WorklistRemover DeadNodes(*this);
  // Rewire N's old results onto the indexed node (same value numbering as in
  // CombineToPreIndexedLoadStore above).
  if (IsLoad) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
  } else {
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
  }

  // Finally, since the node is now dead, remove it from the graph.
  deleteAndRecombine(N);

  // Replace the uses of Use with uses of the updated base value.
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
                                Result.getValue(IsLoad ? 1 : 0));
  deleteAndRecombine(Op);
  return true;
}

/// Return the base-pointer arithmetic from an indexed \p LD.
SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
  ISD::MemIndexedMode AM = LD->getAddressingMode();
  assert(AM != ISD::UNINDEXED);
  SDValue BP = LD->getOperand(1);
  SDValue Inc = LD->getOperand(2);

  // Some backends use TargetConstants for load offsets, but don't expect
  // TargetConstants in general ADD nodes. We can convert these constants into
  // regular Constants (if the constant is not opaque).
  assert((Inc.getOpcode() != ISD::TargetConstant ||
          !cast<ConstantSDNode>(Inc)->isOpaque()) &&
         "Cannot split out indexing using opaque target constants");
  if (Inc.getOpcode() == ISD::TargetConstant) {
    ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
    Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
                          ConstInc->getValueType(0));
  }

  // Pre/post-increment modes add the offset; pre/post-decrement subtract it.
  unsigned Opc =
      (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
  return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
}

/// Number of vector elements of \p T, or 0 when \p T is a scalar type.
static inline int numVectorEltsOrZero(EVT T) {
  return T.isVector() ? T.getVectorNumElements() : 0;
}

/// Reduce \p ST's stored value to the store's memory type and return it in
/// \p Val. Returns false when no suitable reduction (FP truncate, integer
/// truncate, or same-size bitcast) is available.
bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
  Val = ST->getValue();
  EVT STType = Val.getValueType();
  EVT STMemType = ST->getMemoryVT();
  if (STType == STMemType)
    return true;
  if (isTypeLegal(STMemType))
    return false; // fail.
  if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
      TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
    Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
    return true;
  }
  if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
      STType.isInteger() && STMemType.isInteger()) {
    Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
    return true;
  }
  if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
    Val = DAG.getBitcast(STMemType, Val);
    return true;
  }
  return false; // fail.
}

/// Extend \p Val, which has \p LD's memory type, to \p LD's result type
/// according to the load's extension kind. Returns false for non-integer
/// mismatches that cannot be modeled.
bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
  EVT LDMemType = LD->getMemoryVT();
  EVT LDType = LD->getValueType(0);
  assert(Val.getValueType() == LDMemType &&
         "Attempting to extend value of non-matching type");
  if (LDType == LDMemType)
    return true;
  if (LDMemType.isInteger() && LDType.isInteger()) {
    switch (LD->getExtensionType()) {
    case ISD::NON_EXTLOAD:
      Val = DAG.getBitcast(LDType, Val);
      return true;
    case ISD::EXTLOAD:
      Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    case ISD::SEXTLOAD:
      Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    case ISD::ZEXTLOAD:
      Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
      return true;
    }
  }
  return false;
}

SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
  if (OptLevel == CodeGenOpt::None || !LD->isSimple())
    return SDValue();
  // Only forward when the load's chain input is itself the store.
  SDValue Chain = LD->getOperand(0);
  StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
  // TODO: Relax this restriction for unordered atomics (see D66309)
  if (!ST || !ST->isSimple())
    return SDValue();

  EVT LDType = LD->getValueType(0);
  EVT LDMemType = LD->getMemoryVT();
  EVT STMemType = ST->getMemoryVT();
  EVT STType = ST->getValue().getValueType();

  BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
  BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
  int64_t Offset;
  if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
    return SDValue();

  // Normalize for Endianness. After this Offset=0 will denote that the least
  // significant bit in the loaded value maps to the least significant bit in
  // the stored value). With Offset=n (for n > 0) the loaded value starts at the
  // n:th least significant byte of the stored value.
  if (DAG.getDataLayout().isBigEndian())
    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;

  // Check that the stored value cover all bits that are loaded.
  bool STCoversLD =
      (Offset >= 0) &&
      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());

  auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
    if (LD->isIndexed()) {
      // Cannot handle opaque target constants and we must respect the user's
      // request not to split indexes from loads.
      if (!canSplitIdx(LD))
        return SDValue();
      SDValue Idx = SplitIndexingFromLoad(LD);
      SDValue Ops[] = {Val, Idx, Chain};
      return CombineTo(LD, Ops, 3);
    }
    return CombineTo(LD, Val, Chain);
  };

  if (!STCoversLD)
    return SDValue();

  // Memory as copy space (potentially masked).
  if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
    // Simple case: Direct non-truncating forwarding
    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
      return ReplaceLd(LD, ST->getValue(), Chain);
    // Can we model the truncate and extension with an and mask?
    if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
        !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
      // Mask to size of LDMemType
      auto Mask =
          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
                                               STMemType.getSizeInBits()),
                          SDLoc(ST), STType);
      auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
      return ReplaceLd(LD, Val, Chain);
    }
  }

  // TODO: Deal with nonzero offset.
  if (LD->getBasePtr().isUndef() || Offset != 0)
    return SDValue();
  // Model necessary truncations / extensions.
  SDValue Val;
  // Truncate Value To Stored Memory Size.
  // The one-shot 'do { } while (false)' lets each 'continue' act as a
  // structured jump to the failure cleanup below.
  do {
    if (!getTruncatedStoreValue(ST, Val))
      continue;
    if (!isTypeLegal(LDMemType))
      continue;
    if (STMemType != LDMemType) {
      // TODO: Support vectors? This requires extract_subvector/bitcast.
      if (!STMemType.isVector() && !LDMemType.isVector() &&
          STMemType.isInteger() && LDMemType.isInteger())
        Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
      else
        continue;
    }
    if (!extendLoadedValueToExtension(LD, Val))
      continue;
    return ReplaceLd(LD, Val, Chain);
  } while (false);

  // On failure, cleanup dead nodes we may have created.
  if (Val->use_empty())
    deleteAndRecombine(Val.getNode());
  return SDValue();
}

SDValue DAGCombiner::visitLOAD(SDNode *N) {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  SDValue Chain = LD->getChain();
  SDValue Ptr = LD->getBasePtr();

  // If load is not volatile and there are no uses of the loaded value (and
  // the updated indexed value in case of indexed loads), change uses of the
  // chain value into uses of the chain input (i.e. delete the dead load).
  // TODO: Allow this for unordered atomics (see D66309)
  if (LD->isSimple()) {
    if (N->getValueType(1) == MVT::Other) {
      // Unindexed loads.
      if (!N->hasAnyUseOfValue(0)) {
        // It's not safe to use the two value CombineTo variant here. e.g.
        // v1, chain2 = load chain1, loc
        // v2, chain3 = load chain2, loc
        // v3 = add v2, c
        // Now we replace use of chain2 with chain1. This makes the second load
        // isomorphic to the one we are deleting, and thus makes this load live.
        LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
                   dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
                   dbgs() << "\n");
        WorklistRemover DeadNodes(*this);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
        AddUsersToWorklist(Chain.getNode());
        if (N->use_empty())
          deleteAndRecombine(N);

        return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
    } else {
      // Indexed loads.
      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");

      // If this load has an opaque TargetConstant offset, then we cannot split
      // the indexing into an add/sub directly (that TargetConstant may not be
      // valid for a different type of node, and we cannot convert an opaque
      // target constant into a regular constant).
      bool CanSplitIdx = canSplitIdx(LD);

      if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
        SDValue Index;
        if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
          Index = SplitIndexingFromLoad(LD);
          // Try to fold the base pointer arithmetic into subsequent loads and
          // stores.
          AddUsersToWorklist(N);
        } else
          Index = DAG.getUNDEF(N->getValueType(1));
        LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
                   dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
                   dbgs() << " and 2 other values\n");
        WorklistRemover DeadNodes(*this);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
        deleteAndRecombine(N);
        return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
    }
  }

  // If this load is directly stored, replace the load value with the stored
  // value.
  if (auto V = ForwardStoreValueToDirectLoad(LD))
    return V;

  // Try to infer better alignment information than the load already has.
  if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
    if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
      if (*Alignment > LD->getAlign() &&
          isAligned(*Alignment, LD->getSrcValueOffset())) {
        SDValue NewLoad = DAG.getExtLoad(
            LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
            LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
            LD->getMemOperand()->getFlags(), LD->getAAInfo());
        // NewLoad will always be N as we are only refining the alignment
        assert(NewLoad.getNode() == N);
        (void)NewLoad;
      }
    }
  }

  if (LD->isUnindexed()) {
    // Walk up chain skipping non-aliasing memory nodes.
    SDValue BetterChain = FindBetterChain(LD, Chain);

    // If there is a better chain.
    if (Chain != BetterChain) {
      SDValue ReplLoad;

      // Replace the chain to void dependency.
      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
        ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
                               BetterChain, Ptr, LD->getMemOperand());
      } else {
        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
                                  LD->getValueType(0),
                                  BetterChain, Ptr, LD->getMemoryVT(),
                                  LD->getMemOperand());
      }

      // Create token factor to keep old chain connected.
      SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
                                  MVT::Other, Chain, ReplLoad.getValue(1));

      // Replace uses with load result and token factor
      return CombineTo(N, ReplLoad.getValue(0), Token);
    }
  }

  // Try transforming N to an indexed load.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  // Try to slice up N to more direct loads if the slices are mapped to
  // different register banks or pairing can take place.
  if (SliceUpLoad(N))
    return SDValue(N, 0);

  return SDValue();
}

namespace {

/// Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
/// Shift = srl Ty1 Origin, CstTy Amount
/// Inst = trunc Shift to Ty2
///
/// Then, it will be rewritten into:
/// Slice = load SliceTy, Base + SliceOffset
/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
///
/// SliceTy is deduced from the number of bits that are actually used to
/// build Inst.
struct LoadedSlice {
  /// Helper structure used to compute the cost of a slice.
  struct Cost {
    /// Are we optimizing for code size.
    bool ForCodeSize = false;

    /// Various cost.
    unsigned Loads = 0;
    unsigned Truncates = 0;
    unsigned CrossRegisterBanksCopies = 0;
    unsigned ZExts = 0;
    unsigned Shift = 0;

    explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}

    /// Get the cost of one isolated slice.
    Cost(const LoadedSlice &LS, bool ForCodeSize)
        : ForCodeSize(ForCodeSize), Loads(1) {
      EVT TruncType = LS.Inst->getValueType(0);
      EVT LoadedType = LS.getLoadedType();
      // A zext is only needed when the slice's loaded type differs from the
      // final truncated type and the target does not zext for free.
      if (TruncType != LoadedType &&
          !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
        ZExts = 1;
    }

    /// Account for slicing gain in the current cost.
    /// Slicing provides a few gains like removing a shift or a
    /// truncate. This method allows to grow the cost of the original
    /// load with the gain from this slice.
    void addSliceGain(const LoadedSlice &LS) {
      // Each slice saves a truncate.
      const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
      if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
                              LS.Inst->getValueType(0)))
        ++Truncates;
      // If there is a shift amount, this slice gets rid of it.
      if (LS.Shift)
        ++Shift;
      // If this slice can merge a cross register bank copy, account for it.
      if (LS.canMergeExpensiveCrossRegisterBankCopy())
        ++CrossRegisterBanksCopies;
    }

    Cost &operator+=(const Cost &RHS) {
      Loads += RHS.Loads;
      Truncates += RHS.Truncates;
      CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
      ZExts += RHS.ZExts;
      Shift += RHS.Shift;
      return *this;
    }

    bool operator==(const Cost &RHS) const {
      return Loads == RHS.Loads && Truncates == RHS.Truncates &&
             CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
             ZExts == RHS.ZExts && Shift == RHS.Shift;
    }

    bool operator!=(const Cost &RHS) const { return !(*this == RHS); }

    bool operator<(const Cost &RHS) const {
      // Assume cross register banks copies are as expensive as loads.
      // FIXME: Do we want some more target hooks?
      unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
      unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
      // Unless we are optimizing for code size, consider the
      // expensive operation first.
      if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
        return ExpensiveOpsLHS < ExpensiveOpsRHS;
      return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
             (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
    }

    // The remaining comparisons are all derived from operator< above.
    bool operator>(const Cost &RHS) const { return RHS < *this; }

    bool operator<=(const Cost &RHS) const { return !(RHS < *this); }

    bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
  };

  // The last instruction that represents the slice. This should be a
  // truncate instruction.
  SDNode *Inst;

  // The original load instruction.
  LoadSDNode *Origin;

  // The right shift amount in bits from the original load.
  unsigned Shift;

  // The DAG from which Origin came from.
  // This is used to get some contextual information about legal types, etc.
  SelectionDAG *DAG;

  LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
              unsigned Shift = 0, SelectionDAG *DAG = nullptr)
      : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}

  /// Get the bits used in a chunk of bits \p BitWidth large.
  /// \return Result is \p BitWidth and has used bits set to 1 and
  ///         not used bits set to 0.
  APInt getUsedBits() const {
    // Reproduce the trunc(lshr) sequence:
    // - Start from the truncated value.
    // - Zero extend to the desired bit width.
    // - Shift left.
    assert(Origin && "No original load to compare against.");
    unsigned BitWidth = Origin->getValueSizeInBits(0);
    assert(Inst && "This slice is not bound to an instruction");
    assert(Inst->getValueSizeInBits(0) <= BitWidth &&
           "Extracted slice is bigger than the whole type!");
    APInt UsedBits(Inst->getValueSizeInBits(0), 0);
    UsedBits.setAllBits();
    UsedBits = UsedBits.zext(BitWidth);
    UsedBits <<= Shift;
    return UsedBits;
  }

  /// Get the size of the slice to be loaded in bytes.
  unsigned getLoadedSize() const {
    unsigned SliceSize = getUsedBits().countPopulation();
    assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
    return SliceSize / 8;
  }

  /// Get the type that will be loaded for this slice.
  /// Note: This may not be the final type for the slice.
  EVT getLoadedType() const {
    assert(DAG && "Missing context");
    LLVMContext &Ctxt = *DAG->getContext();
    return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
  }

  /// Get the alignment of the load used for this slice.
  Align getAlign() const {
    Align Alignment = Origin->getAlign();
    uint64_t Offset = getOffsetFromBase();
    if (Offset != 0)
      Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
    return Alignment;
  }

  /// Check if this slice can be rewritten with legal operations.
  bool isLegal() const {
    // An invalid slice is not legal.
    if (!Origin || !Inst || !DAG)
      return false;

    // Offsets are for indexed load only, we do not handle that.
    if (!Origin->getOffset().isUndef())
      return false;

    const TargetLowering &TLI = DAG->getTargetLoweringInfo();

    // Check that the type is legal.
    EVT SliceType = getLoadedType();
    if (!TLI.isTypeLegal(SliceType))
      return false;

    // Check that the load is legal for this type.
    if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
      return false;

    // Check that the offset can be computed.
    // 1. Check its type.
    EVT PtrType = Origin->getBasePtr().getValueType();
    if (PtrType == MVT::Untyped || PtrType.isExtended())
      return false;

    // 2. Check that it fits in the immediate.
    if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
      return false;

    // 3. Check that the computation is legal.
    if (!TLI.isOperationLegal(ISD::ADD, PtrType))
      return false;

    // Check that the zext is legal if it needs one.
    EVT TruncateType = Inst->getValueType(0);
    if (TruncateType != SliceType &&
        !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
      return false;

    return true;
  }

  /// Get the offset in bytes of this slice in the original chunk of
  /// bits.
  /// \pre DAG != nullptr.
  uint64_t getOffsetFromBase() const {
    assert(DAG && "Missing context.");
    bool IsBigEndian = DAG->getDataLayout().isBigEndian();
    assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
    uint64_t Offset = Shift / 8;
    unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
    assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
           "The size of the original loaded type is not a multiple of a"
           " byte.");
    // If Offset is bigger than TySizeInBytes, it means we are loading all
    // zeros. This should have been optimized before in the process.
    assert(TySizeInBytes > Offset &&
           "Invalid shift amount for given loaded size");
    if (IsBigEndian)
      Offset = TySizeInBytes - Offset - getLoadedSize();
    return Offset;
  }

  /// Generate the sequence of instructions to load the slice
  /// represented by this object and redirect the uses of this slice to
  /// this new sequence of instructions.
  /// \pre this->Inst && this->Origin are valid Instructions and this
  /// object passed the legal check: LoadedSlice::isLegal returned true.
  /// \return The last instruction of the sequence used to load the slice.
  SDValue loadSlice() const {
    assert(Inst && Origin && "Unable to replace a non-existing slice.");
    const SDValue &OldBaseAddr = Origin->getBasePtr();
    SDValue BaseAddr = OldBaseAddr;
    // Get the offset in that chunk of bytes w.r.t. the endianness.
    int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
    assert(Offset >= 0 && "Offset too big to fit in int64_t!");
    if (Offset) {
      // BaseAddr = BaseAddr + Offset.
      EVT ArithType = BaseAddr.getValueType();
      SDLoc DL(Origin);
      BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
                              DAG->getConstant(Offset, DL, ArithType));
    }

    // Create the type of the loaded slice according to its size.
15187 EVT SliceType = getLoadedType(); 15188 15189 // Create the load for the slice. 15190 SDValue LastInst = 15191 DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, 15192 Origin->getPointerInfo().getWithOffset(Offset), getAlign(), 15193 Origin->getMemOperand()->getFlags()); 15194 // If the final type is not the same as the loaded type, this means that 15195 // we have to pad with zero. Create a zero extend for that. 15196 EVT FinalType = Inst->getValueType(0); 15197 if (SliceType != FinalType) 15198 LastInst = 15199 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); 15200 return LastInst; 15201 } 15202 15203 /// Check if this slice can be merged with an expensive cross register 15204 /// bank copy. E.g., 15205 /// i = load i32 15206 /// f = bitcast i32 i to float 15207 bool canMergeExpensiveCrossRegisterBankCopy() const { 15208 if (!Inst || !Inst->hasOneUse()) 15209 return false; 15210 SDNode *Use = *Inst->use_begin(); 15211 if (Use->getOpcode() != ISD::BITCAST) 15212 return false; 15213 assert(DAG && "Missing context"); 15214 const TargetLowering &TLI = DAG->getTargetLoweringInfo(); 15215 EVT ResVT = Use->getValueType(0); 15216 const TargetRegisterClass *ResRC = 15217 TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent()); 15218 const TargetRegisterClass *ArgRC = 15219 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(), 15220 Use->getOperand(0)->isDivergent()); 15221 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) 15222 return false; 15223 15224 // At this point, we know that we perform a cross-register-bank copy. 15225 // Check if it is expensive. 15226 const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); 15227 // Assume bitcasts are cheap, unless both register classes do not 15228 // explicitly share a common sub class. 15229 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) 15230 return false; 15231 15232 // Check if it will be merged with the load. 15233 // 1. 
Check the alignment constraint. 15234 Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign( 15235 ResVT.getTypeForEVT(*DAG->getContext())); 15236 15237 if (RequiredAlignment > getAlign()) 15238 return false; 15239 15240 // 2. Check that the load is a legal operation for that type. 15241 if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) 15242 return false; 15243 15244 // 3. Check that we do not have a zext in the way. 15245 if (Inst->getValueType(0) != getLoadedType()) 15246 return false; 15247 15248 return true; 15249 } 15250 }; 15251 15252 } // end anonymous namespace 15253 15254 /// Check that all bits set in \p UsedBits form a dense region, i.e., 15255 /// \p UsedBits looks like 0..0 1..1 0..0. 15256 static bool areUsedBitsDense(const APInt &UsedBits) { 15257 // If all the bits are one, this is dense! 15258 if (UsedBits.isAllOnesValue()) 15259 return true; 15260 15261 // Get rid of the unused bits on the right. 15262 APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); 15263 // Get rid of the unused bits on the left. 15264 if (NarrowedUsedBits.countLeadingZeros()) 15265 NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); 15266 // Check that the chunk of bits is completely used. 15267 return NarrowedUsedBits.isAllOnesValue(); 15268 } 15269 15270 /// Check whether or not \p First and \p Second are next to each other 15271 /// in memory. This means that there is no hole between the bits loaded 15272 /// by \p First and the bits loaded by \p Second. 
static bool areSlicesNextToEachOther(const LoadedSlice &First,
                                     const LoadedSlice &Second) {
  assert(First.Origin == Second.Origin && First.Origin &&
         "Unable to match different memory origins.");
  APInt UsedBits = First.getUsedBits();
  assert((UsedBits & Second.getUsedBits()) == 0 &&
         "Slices are not supposed to overlap.");
  // The two slices are adjacent iff the union of their used bits forms a
  // single dense run with no hole in the middle.
  UsedBits |= Second.getUsedBits();
  return areUsedBitsDense(UsedBits);
}

/// Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there is in the slices in \p LoadedSlices.
static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                 LoadedSlice::Cost &GlobalLSCost) {
  unsigned NumberOfSlices = LoadedSlices.size();
  // If there is less than 2 elements, no pairing is possible.
  if (NumberOfSlices < 2)
    return;

  // Sort the slices so that elements that are likely to be next to each
  // other in memory are next to each other in the list.
  llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
  });
  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
  // First (resp. Second) is the first (resp. Second) potentially candidate
  // to be placed in a paired load.
  const LoadedSlice *First = nullptr;
  const LoadedSlice *Second = nullptr;
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
                // Set the beginning of the pair.
                First = Second) {
    Second = &LoadedSlices[CurrSlice];

    // If First is NULL, it means we start a new pair.
    // Get to the next slice.
    if (!First)
      continue;

    EVT LoadedType = First->getLoadedType();

    // If the types of the slices are different, we cannot pair them.
    if (LoadedType != Second->getLoadedType())
      continue;

    // Check if the target supplies paired loads for this type.
    Align RequiredAlignment;
    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
      // move to the next pair, this type is hopeless.
      Second = nullptr;
      continue;
    }
    // Check if we meet the alignment requirement.
    if (First->getAlign() < RequiredAlignment)
      continue;

    // Check that both loads are next to each other in memory.
    if (!areSlicesNextToEachOther(*First, *Second))
      continue;

    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
    // A paired load replaces two loads with one, so give one load back.
    --GlobalLSCost.Loads;
    // Move to the next pair.
    Second = nullptr;
  }
}

/// Check the profitability of all involved LoadedSlice.
/// Currently, it is considered profitable if there is exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
///
/// Note: The order of the elements in \p LoadedSlices may be modified, but not
/// the elements themselves.
///
/// FIXME: When the cost model will be mature enough, we can relax
/// constraints (1) and (2).
static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                const APInt &UsedBits, bool ForCodeSize) {
  unsigned NumberOfSlices = LoadedSlices.size();
  // Under stress testing, slice whenever there is more than one slice,
  // regardless of cost.
  if (StressLoadSlicing)
    return NumberOfSlices > 1;

  // Check (1).
  if (NumberOfSlices != 2)
    return false;

  // Check (2).
  if (!areUsedBitsDense(UsedBits))
    return false;

  // Check (3).
  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
  // The original code has one big load.
  OrigCost.Loads = 1;
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
    const LoadedSlice &LS = LoadedSlices[CurrSlice];
    // Accumulate the cost of all the slices.
    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
    GlobalSlicingCost += SliceCost;

    // Account as cost in the original configuration the gain obtained
    // with the current slices.
    OrigCost.addSliceGain(LS);
  }

  // If the target supports paired load, adjust the cost accordingly.
  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
  return OrigCost > GlobalSlicingCost;
}

/// If the given load, \p LI, is used only by trunc or trunc(lshr)
/// operations, split it in the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
/// This slicing takes care not to insert overlapping loads.
/// \pre LI is a simple load (i.e., not an atomic or volatile load).
bool DAGCombiner::SliceUpLoad(SDNode *N) {
  // Slicing is only performed after the DAG has been fully legalized.
  if (Level < AfterLegalizeDAG)
    return false;

  LoadSDNode *LD = cast<LoadSDNode>(N);
  if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
      !LD->getValueType(0).isInteger())
    return false;

  // The algorithm to split up a load of a scalable vector into individual
  // elements currently requires knowing the length of the loaded type,
  // so will need adjusting to work on scalable vectors.
  if (LD->getValueType(0).isScalableVector())
    return false;

  // Keep track of already used bits to detect overlapping values.
  // In that case, we will just abort the transformation.
  APInt UsedBits(LD->getValueSizeInBits(0), 0);

  SmallVector<LoadedSlice, 4> LoadedSlices;

  // Check if this load is used as several smaller chunks of bits.
  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
  // of computation for each trunc.
  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
       UI != UIEnd; ++UI) {
    // Skip the uses of the chain.
    if (UI.getUse().getResNo() != 0)
      continue;

    SDNode *User = *UI;
    unsigned Shift = 0;

    // Check if this is a trunc(lshr).
    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
        isa<ConstantSDNode>(User->getOperand(1))) {
      Shift = User->getConstantOperandVal(1);
      User = *User->use_begin();
    }

    // At this point, User is a Truncate, iff we encountered, trunc or
    // trunc(lshr).
    if (User->getOpcode() != ISD::TRUNCATE)
      return false;

    // The width of the type must be a power of 2 and greater than 8-bits.
    // Otherwise the load cannot be represented in LLVM IR.
    // Moreover, if we shifted with a non-8-bits multiple, the slice
    // will be across several bytes. We do not support that.
    unsigned Width = User->getValueSizeInBits(0);
    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
      return false;

    // Build the slice for this chain of computations.
    LoadedSlice LS(User, LD, Shift, &DAG);
    APInt CurrentUsedBits = LS.getUsedBits();

    // Check if this slice overlaps with another.
    if ((CurrentUsedBits & UsedBits) != 0)
      return false;
    // Update the bits used globally.
    UsedBits |= CurrentUsedBits;

    // Check if the new slice would be legal.
    if (!LS.isLegal())
      return false;

    // Record the slice.
    LoadedSlices.push_back(LS);
  }

  // Abort slicing if it does not seem to be profitable.
  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
    return false;

  ++SlicedLoads;

  // Rewrite each chain to use an independent load.
  // By construction, each chain can be represented by a unique load.

  // Prepare the argument for the new token factor for all the slices.
  SmallVector<SDValue, 8> ArgChains;
  for (SmallVectorImpl<LoadedSlice>::const_iterator
           LSIt = LoadedSlices.begin(),
           LSItEnd = LoadedSlices.end();
       LSIt != LSItEnd; ++LSIt) {
    SDValue SliceInst = LSIt->loadSlice();
    CombineTo(LSIt->Inst, SliceInst, true);
    // loadSlice() may have wrapped the load in a zero extend; peel it off
    // to reach the load itself, whose chain result is collected below.
    if (SliceInst.getOpcode() != ISD::LOAD)
      SliceInst = SliceInst.getOperand(0);
    assert(SliceInst->getOpcode() == ISD::LOAD &&
           "It takes more than a zext to get to the loaded slice!!");
    ArgChains.push_back(SliceInst.getValue(1));
  }

  // Tie the chains of all the new loads together and redirect the users of
  // the original load's chain result to this token factor.
  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                              ArgChains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
  AddToWorklist(Chain.getNode());
  return true;
}

/// Check to see if V is (and load (ptr), imm), where the load is having
/// specific bytes cleared out.  If so, return the byte size being masked out
/// and the shift amount.
static std::pair<unsigned, unsigned>
CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
  // (0, 0) is the "no match" result.
  std::pair<unsigned, unsigned> Result(0, 0);

  // Check for the structure we're looking for.
  if (V->getOpcode() != ISD::AND ||
      !isa<ConstantSDNode>(V->getOperand(1)) ||
      !ISD::isNormalLoad(V->getOperand(0).getNode()))
    return Result;

  // Check the chain and pointer.
  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
  if (LD->getBasePtr() != Ptr) return Result;  // Not from same pointer.

  // This only handles simple types.
  if (V.getValueType() != MVT::i16 &&
      V.getValueType() != MVT::i32 &&
      V.getValueType() != MVT::i64)
    return Result;

  // Check the constant mask.  Invert it so that the bits being masked out are
  // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
  // follow the sign bit for uniformity.
  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
  unsigned NotMaskLZ = countLeadingZeros(NotMask);
  if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
  unsigned NotMaskTZ = countTrailingZeros(NotMask);
  if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
  if (NotMaskLZ == 64) return Result;  // All zero mask.

  // See if we have a continuous run of bits.  If so, we have 0*1+0*
  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
    return Result;

  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
  if (V.getValueType() != MVT::i64 && NotMaskLZ)
    NotMaskLZ -= 64-V.getValueSizeInBits();

  unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
  switch (MaskedBytes) {
  case 1:
  case 2:
  case 4: break;
  default: return Result;  // All one mask, or 5-byte mask.
  }

  // Verify that the first bit starts at a multiple of mask so that the access
  // is aligned the same as the access width.
  if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;

  // For narrowing to be valid, the load must be the memory operation
  // immediately preceding the store.
  if (LD == Chain.getNode())
    ; // ok.
  else if (Chain->getOpcode() == ISD::TokenFactor &&
           SDValue(LD, 1).hasOneUse()) {
    // LD has only 1 chain use so there are no indirect dependencies.
    if (!LD->isOperandOf(Chain.getNode()))
      return Result;
  } else
    return Result; // Fail.

  Result.first = MaskedBytes;
  Result.second = NotMaskTZ/8;
  return Result;
}

/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo.  If so, replace the specified store with a narrower store of
/// truncated IVal.
static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                SDValue IVal, StoreSDNode *St,
                                DAGCombiner *DC) {
  // MaskInfo is (number of masked bytes, byte shift) as computed by
  // CheckForMaskedLoad.
  unsigned NumBytes = MaskInfo.first;
  unsigned ByteShift = MaskInfo.second;
  SelectionDAG &DAG = DC->getDAG();

  // Check to see if IVal is all zeros in the part being masked in by the 'or'
  // that uses this.  If not, this is not a replacement.
  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
                                  ByteShift*8, (ByteShift+NumBytes)*8);
  if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();

  // Check that it is legal on the target to do this.  It is legal if the new
  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
  // legalization (and the target doesn't explicitly think this is a bad idea).
  MVT VT = MVT::getIntegerVT(NumBytes * 8);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DC->isTypeLegal(VT))
    return SDValue();
  if (St->getMemOperand() &&
      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                              *St->getMemOperand()))
    return SDValue();

  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
  // shifted by ByteShift and truncated down to NumBytes.
  if (ByteShift) {
    SDLoc DL(IVal);
    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
                       DAG.getConstant(ByteShift*8, DL,
                                    DC->getShiftAmountTy(IVal.getValueType())));
  }

  // Figure out the offset for the store and the alignment of the access.
  unsigned StOffset;
  unsigned NewAlign = St->getAlignment();

  if (DAG.getDataLayout().isLittleEndian())
    StOffset = ByteShift;
  else
    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

  SDValue Ptr = St->getBasePtr();
  if (StOffset) {
    SDLoc DL(IVal);
    Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL);
    NewAlign = MinAlign(NewAlign, StOffset);
  }

  // Truncate down to the new size.
  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

  ++OpsNarrowed;
  return DAG
      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                St->getPointerInfo().getWithOffset(StOffset), NewAlign);
}

/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
/// or code size.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  if (!ST->isSimple())
    return SDValue();

  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  EVT VT = Value.getValueType();

  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
    return SDValue();

  unsigned Opc = Value.getOpcode();

  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
  // is a byte mask indicating a consecutive number of bytes, check to see if
  // Y is known to provide just those bytes.
  // If so, we try to replace the load + replace + store sequence with a
  // single (narrower) store, which makes the load dead.
  if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
    std::pair<unsigned, unsigned> MaskedLoad;
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                  Value.getOperand(1), ST,this))
        return NewST;

    // Or is commutative, so try swapping X and Y.
    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
    if (MaskedLoad.first)
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                  Value.getOperand(0), ST,this))
        return NewST;
  }

  if (!EnableReduceLoadOpStoreWidth)
    return SDValue();

  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
      Value.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  SDValue N0 = Value.getOperand(0);
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      Chain == SDValue(N0.getNode(), 1)) {
    LoadSDNode *LD = cast<LoadSDNode>(N0);
    if (LD->getBasePtr() != Ptr ||
        LD->getPointerInfo().getAddrSpace() !=
        ST->getPointerInfo().getAddrSpace())
      return SDValue();

    // Find the type to narrow it the load / op / store to.
    SDValue N1 = Value.getOperand(1);
    unsigned BitWidth = N1.getValueSizeInBits();
    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
    // For AND, invert the constant so that, as for OR/XOR, the set bits of
    // Imm mark the bits being modified by the operation.
    if (Opc == ISD::AND)
      Imm ^= APInt::getAllOnesValue(BitWidth);
    // If no bit (or every bit) is modified there is nothing to narrow to.
    if (Imm == 0 || Imm.isAllOnesValue())
      return SDValue();
    unsigned ShAmt = Imm.countTrailingZeros();
    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
    // The narrowing should be profitable, the load/store operation should be
    // legal (or custom) and the store size should be equal to the NewVT width.
    while (NewBW < BitWidth &&
           (NewVT.getStoreSizeInBits() != NewBW ||
            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
            !TLI.isNarrowingProfitable(VT, NewVT))) {
      NewBW = NextPowerOf2(NewBW);
      NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
    }
    if (NewBW >= BitWidth)
      return SDValue();

    // If the lsb changed does not start at the type bitwidth boundary,
    // start at the previous one.
    if (ShAmt % NewBW)
      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
    // Only narrow if all the modified bits fit inside the NewBW-wide window
    // starting at ShAmt.
    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                   std::min(BitWidth, ShAmt + NewBW));
    if ((Imm & Mask) == Imm) {
      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
      if (Opc == ISD::AND)
        NewImm ^= APInt::getAllOnesValue(NewBW);
      uint64_t PtrOff = ShAmt / 8;
      // For big endian targets, we need to adjust the offset to the pointer to
      // load the correct bytes.
      if (DAG.getDataLayout().isBigEndian())
        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;

      Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
      Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
      if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))
        return SDValue();

      SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD));
      SDValue NewLD =
          DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
                      LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                      LD->getMemOperand()->getFlags(), LD->getAAInfo());
      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                   DAG.getConstant(NewImm, SDLoc(Value),
                                                   NewVT));
      SDValue NewST =
          DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
                       ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

      AddToWorklist(NewPtr.getNode());
      AddToWorklist(NewLD.getNode());
      AddToWorklist(NewVal.getNode());
      WorklistRemover DeadNodes(*this);
      // Redirect users of the old load's chain to the new, narrower load.
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
      ++OpsNarrowed;
      return NewST;
    }
  }

  return SDValue();
}

/// For a given floating point load / store pair, if the load value isn't used
/// by any other operations, then consider transforming the pair to integer
/// load / store operations if the target deems the transformation profitable.
15758 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { 15759 StoreSDNode *ST = cast<StoreSDNode>(N); 15760 SDValue Value = ST->getValue(); 15761 if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && 15762 Value.hasOneUse()) { 15763 LoadSDNode *LD = cast<LoadSDNode>(Value); 15764 EVT VT = LD->getMemoryVT(); 15765 if (!VT.isFloatingPoint() || 15766 VT != ST->getMemoryVT() || 15767 LD->isNonTemporal() || 15768 ST->isNonTemporal() || 15769 LD->getPointerInfo().getAddrSpace() != 0 || 15770 ST->getPointerInfo().getAddrSpace() != 0) 15771 return SDValue(); 15772 15773 TypeSize VTSize = VT.getSizeInBits(); 15774 15775 // We don't know the size of scalable types at compile time so we cannot 15776 // create an integer of the equivalent size. 15777 if (VTSize.isScalable()) 15778 return SDValue(); 15779 15780 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize()); 15781 if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || 15782 !TLI.isOperationLegal(ISD::STORE, IntVT) || 15783 !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || 15784 !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) 15785 return SDValue(); 15786 15787 Align LDAlign = LD->getAlign(); 15788 Align STAlign = ST->getAlign(); 15789 Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); 15790 Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy); 15791 if (LDAlign < ABIAlign || STAlign < ABIAlign) 15792 return SDValue(); 15793 15794 SDValue NewLD = 15795 DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), 15796 LD->getPointerInfo(), LDAlign); 15797 15798 SDValue NewST = 15799 DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(), 15800 ST->getPointerInfo(), STAlign); 15801 15802 AddToWorklist(NewLD.getNode()); 15803 AddToWorklist(NewST.getNode()); 15804 WorklistRemover DeadNodes(*this); 15805 DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); 15806 ++LdStFP2Int; 15807 return NewST; 15808 } 15809 15810 return SDValue(); 
15811 } 15812 15813 // This is a helper function for visitMUL to check the profitability 15814 // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). 15815 // MulNode is the original multiply, AddNode is (add x, c1), 15816 // and ConstNode is c2. 15817 // 15818 // If the (add x, c1) has multiple uses, we could increase 15819 // the number of adds if we make this transformation. 15820 // It would only be worth doing this if we can remove a 15821 // multiply in the process. Check for that here. 15822 // To illustrate: 15823 // (A + c1) * c3 15824 // (A + c2) * c3 15825 // We're checking for cases where we have common "c3 * A" expressions. 15826 bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, 15827 SDValue &AddNode, 15828 SDValue &ConstNode) { 15829 APInt Val; 15830 15831 // If the add only has one use, this would be OK to do. 15832 if (AddNode.getNode()->hasOneUse()) 15833 return true; 15834 15835 // Walk all the users of the constant with which we're multiplying. 15836 for (SDNode *Use : ConstNode->uses()) { 15837 if (Use == MulNode) // This use is the one we're on right now. Skip it. 15838 continue; 15839 15840 if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. 15841 SDNode *OtherOp; 15842 SDNode *MulVar = AddNode.getOperand(0).getNode(); 15843 15844 // OtherOp is what we're multiplying against the constant. 15845 if (Use->getOperand(0) == ConstNode) 15846 OtherOp = Use->getOperand(1).getNode(); 15847 else 15848 OtherOp = Use->getOperand(0).getNode(); 15849 15850 // Check to see if multiply is with the same operand of our "add". 15851 // 15852 // ConstNode = CONST 15853 // Use = ConstNode * A <-- visiting Use. OtherOp is A. 15854 // ... 15855 // AddNode = (A + c1) <-- MulVar is A. 15856 // = AddNode * ConstNode <-- current visiting instruction. 15857 // 15858 // If we make this transformation, we will have a common 15859 // multiply (ConstNode * A) that we can save. 
      if (OtherOp == MulVar)
        return true;

      // Now check to see if a future expansion will give us a common
      // multiply.
      //
      //     ConstNode  = CONST
      //     AddNode    = (A + c1)
      //     ...   = AddNode * ConstNode <-- current visiting instruction.
      //     ...
      //     OtherOp = (A + c2)
      //     Use     = OtherOp * ConstNode <-- visiting Use.
      //
      // If we make this transformation, we will have a common
      // multiply (CONST * A) after we also do the same transformation
      // to the "t2" instruction.
      if (OtherOp->getOpcode() == ISD::ADD &&
          DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
          OtherOp->getOperand(0).getNode() == MulVar)
        return true;
    }
  }

  // Didn't find a case where this would be profitable.
  return false;
}

/// Build a single TokenFactor tying together the chains of the first
/// \p NumStores stores in \p StoreNodes, skipping chains that are
/// themselves one of the merged stores or duplicates.
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                         unsigned NumStores) {
  SmallVector<SDValue, 8> Chains;
  SmallPtrSet<const SDNode *, 8> Visited;
  SDLoc StoreDL(StoreNodes[0].MemNode);

  // Seed the visited set with the stores being merged so their own chains
  // are filtered out below.
  for (unsigned i = 0; i < NumStores; ++i) {
    Visited.insert(StoreNodes[i].MemNode);
  }

  // don't include nodes that are children or repeated nodes.
  for (unsigned i = 0; i < NumStores; ++i) {
    if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
      Chains.push_back(StoreNodes[i].MemNode->getChain());
  }

  assert(Chains.size() > 0 && "Chain should have generated a chain");
  return DAG.getTokenFactor(StoreDL, Chains);
}

/// Merge the first \p NumStores stores in \p StoreNodes into a single wide
/// store of either one vector (\p UseVector) or one integer value, built
/// from constant sources (\p IsConstantSrc) or extracted vector elements.
bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
    SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
    bool IsConstantSrc, bool UseVector, bool UseTrunc) {
  // Make sure we have something to merge.
  if (NumStores < 2)
    return false;

  // The latest Node in the DAG.
  SDLoc DL(StoreNodes[0].MemNode);

  TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
  unsigned SizeInBits = NumStores * ElementSizeBits;
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

  EVT StoreTy;
  if (UseVector) {
    unsigned Elts = NumStores * NumMemElts;
    // Get the type for the merged vector store.
    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
  } else
    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

  SDValue StoredVal;
  if (UseVector) {
    if (IsConstantSrc) {
      SmallVector<SDValue, 8> BuildVector;
      for (unsigned I = 0; I != NumStores; ++I) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
        SDValue Val = St->getValue();
        // If constant is of the wrong type, convert it now.
        if (MemVT != Val.getValueType()) {
          Val = peekThroughBitcasts(Val);
          // Deal with constants of wrong size.
          if (ElementSizeBits != Val.getValueSizeInBits()) {
            EVT IntMemVT =
                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
            if (isa<ConstantFPSDNode>(Val)) {
              // Not clear how to truncate FP values.
              return false;
            } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
              Val = DAG.getConstant(C->getAPIntValue()
                                        .zextOrTrunc(Val.getValueSizeInBits())
                                        .zextOrTrunc(ElementSizeBits),
                                    SDLoc(C), IntMemVT);
          }
          // Make sure correctly size type is the correct type.
          Val = DAG.getBitcast(MemVT, Val);
        }
        BuildVector.push_back(Val);
      }
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, BuildVector);
    } else {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumStores; ++i) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
        SDValue Val = peekThroughBitcasts(St->getValue());
        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
        // type MemVT. If the underlying value is not the correct
        // type, but it is an extraction of an appropriate vector we
        // can recast Val to be of the correct type. This may require
        // converting between EXTRACT_VECTOR_ELT and
        // EXTRACT_SUBVECTOR.
        if ((MemVT != Val.getValueType()) &&
            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
          EVT MemVTScalarTy = MemVT.getScalarType();
          // We may need to add a bitcast here to get types to line up.
          if (MemVTScalarTy != Val.getValueType().getScalarType()) {
            Val = DAG.getBitcast(MemVT, Val);
          } else {
            unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
                                            : ISD::EXTRACT_VECTOR_ELT;
            SDValue Vec = Val.getOperand(0);
            SDValue Idx = Val.getOperand(1);
            Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
          }
        }
        Ops.push_back(Val);
      }

      // Build the extracted vector elements back into a vector.
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, Ops);
    }
  } else {
    // We should always use a vector store when merging extracted vector
    // elements, so this path implies a store of constants.
    assert(IsConstantSrc && "Merged vector elements should use vector store");

    APInt StoreInt(SizeInBits, 0);

    // Construct a single integer constant which is made of the smaller
    // constant inputs.
16003 bool IsLE = DAG.getDataLayout().isLittleEndian(); 16004 for (unsigned i = 0; i < NumStores; ++i) { 16005 unsigned Idx = IsLE ? (NumStores - 1 - i) : i; 16006 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); 16007 16008 SDValue Val = St->getValue(); 16009 Val = peekThroughBitcasts(Val); 16010 StoreInt <<= ElementSizeBits; 16011 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { 16012 StoreInt |= C->getAPIntValue() 16013 .zextOrTrunc(ElementSizeBits) 16014 .zextOrTrunc(SizeInBits); 16015 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { 16016 StoreInt |= C->getValueAPF() 16017 .bitcastToAPInt() 16018 .zextOrTrunc(ElementSizeBits) 16019 .zextOrTrunc(SizeInBits); 16020 // If fp truncation is necessary give up for now. 16021 if (MemVT.getSizeInBits() != ElementSizeBits) 16022 return false; 16023 } else { 16024 llvm_unreachable("Invalid constant element type"); 16025 } 16026 } 16027 16028 // Create the new Load and Store operations. 16029 StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); 16030 } 16031 16032 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; 16033 SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); 16034 16035 // make sure we use trunc store if it's necessary to be legal. 
16036 SDValue NewStore; 16037 if (!UseTrunc) { 16038 NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), 16039 FirstInChain->getPointerInfo(), 16040 FirstInChain->getAlignment()); 16041 } else { // Must be realized as a trunc store 16042 EVT LegalizedStoredValTy = 16043 TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); 16044 unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits(); 16045 ConstantSDNode *C = cast<ConstantSDNode>(StoredVal); 16046 SDValue ExtendedStoreVal = 16047 DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL, 16048 LegalizedStoredValTy); 16049 NewStore = DAG.getTruncStore( 16050 NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), 16051 FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, 16052 FirstInChain->getAlignment(), 16053 FirstInChain->getMemOperand()->getFlags()); 16054 } 16055 16056 // Replace all merged stores with the new store. 16057 for (unsigned i = 0; i < NumStores; ++i) 16058 CombineTo(StoreNodes[i].MemNode, NewStore); 16059 16060 AddToWorklist(NewChain.getNode()); 16061 return true; 16062 } 16063 16064 void DAGCombiner::getStoreMergeCandidates( 16065 StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes, 16066 SDNode *&RootNode) { 16067 // This holds the base pointer, index, and the offset in bytes from the base 16068 // pointer. 16069 BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); 16070 EVT MemVT = St->getMemoryVT(); 16071 16072 SDValue Val = peekThroughBitcasts(St->getValue()); 16073 // We must have a base and an offset. 16074 if (!BasePtr.getBase().getNode()) 16075 return; 16076 16077 // Do not handle stores to undef base pointers. 16078 if (BasePtr.getBase().isUndef()) 16079 return; 16080 16081 StoreSource StoreSrc = getStoreSource(Val); 16082 assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); 16083 BaseIndexOffset LBasePtr; 16084 // Match on loadbaseptr if relevant. 
16085 EVT LoadVT; 16086 if (StoreSrc == StoreSource::Load) { 16087 auto *Ld = cast<LoadSDNode>(Val); 16088 LBasePtr = BaseIndexOffset::match(Ld, DAG); 16089 LoadVT = Ld->getMemoryVT(); 16090 // Load and store should be the same type. 16091 if (MemVT != LoadVT) 16092 return; 16093 // Loads must only have one use. 16094 if (!Ld->hasNUsesOfValue(1, 0)) 16095 return; 16096 // The memory operands must not be volatile/indexed/atomic. 16097 // TODO: May be able to relax for unordered atomics (see D66309) 16098 if (!Ld->isSimple() || Ld->isIndexed()) 16099 return; 16100 } 16101 auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, 16102 int64_t &Offset) -> bool { 16103 // The memory operands must not be volatile/indexed/atomic. 16104 // TODO: May be able to relax for unordered atomics (see D66309) 16105 if (!Other->isSimple() || Other->isIndexed()) 16106 return false; 16107 // Don't mix temporal stores with non-temporal stores. 16108 if (St->isNonTemporal() != Other->isNonTemporal()) 16109 return false; 16110 SDValue OtherBC = peekThroughBitcasts(Other->getValue()); 16111 // Allow merging constants of different types as integers. 16112 bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) 16113 : Other->getMemoryVT() != MemVT; 16114 if (StoreSrc == StoreSource::Load) { 16115 if (NoTypeMatch) 16116 return false; 16117 // The Load's Base Ptr must also match 16118 if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) { 16119 BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG); 16120 if (LoadVT != OtherLd->getMemoryVT()) 16121 return false; 16122 // Loads must only have one use. 16123 if (!OtherLd->hasNUsesOfValue(1, 0)) 16124 return false; 16125 // The memory operands must not be volatile/indexed/atomic. 16126 // TODO: May be able to relax for unordered atomics (see D66309) 16127 if (!OtherLd->isSimple() || 16128 OtherLd->isIndexed()) 16129 return false; 16130 // Don't mix temporal loads with non-temporal loads. 
16131 if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal()) 16132 return false; 16133 if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) 16134 return false; 16135 } else 16136 return false; 16137 } 16138 if (StoreSrc == StoreSource::Constant) { 16139 if (NoTypeMatch) 16140 return false; 16141 if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC))) 16142 return false; 16143 } 16144 if (StoreSrc == StoreSource::Extract) { 16145 // Do not merge truncated stores here. 16146 if (Other->isTruncatingStore()) 16147 return false; 16148 if (!MemVT.bitsEq(OtherBC.getValueType())) 16149 return false; 16150 if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT && 16151 OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR) 16152 return false; 16153 } 16154 Ptr = BaseIndexOffset::match(Other, DAG); 16155 return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); 16156 }; 16157 16158 // Check if the pair of StoreNode and the RootNode already bail out many 16159 // times which is over the limit in dependence check. 16160 auto OverLimitInDependenceCheck = [&](SDNode *StoreNode, 16161 SDNode *RootNode) -> bool { 16162 auto RootCount = StoreRootCountMap.find(StoreNode); 16163 if (RootCount != StoreRootCountMap.end() && 16164 RootCount->second.first == RootNode && 16165 RootCount->second.second > StoreMergeDependenceLimit) 16166 return true; 16167 return false; 16168 }; 16169 16170 // We looking for a root node which is an ancestor to all mergable 16171 // stores. We search up through a load, to our root and then down 16172 // through all children. For instance we will find Store{1,2,3} if 16173 // St is Store1, Store2. or Store3 where the root is not a load 16174 // which always true for nonvolatile ops. TODO: Expand 16175 // the search to find all valid candidates through multiple layers of loads. 
16176 // 16177 // Root 16178 // |-------|-------| 16179 // Load Load Store3 16180 // | | 16181 // Store1 Store2 16182 // 16183 // FIXME: We should be able to climb and 16184 // descend TokenFactors to find candidates as well. 16185 16186 RootNode = St->getChain().getNode(); 16187 16188 unsigned NumNodesExplored = 0; 16189 if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) { 16190 RootNode = Ldn->getChain().getNode(); 16191 for (auto I = RootNode->use_begin(), E = RootNode->use_end(); 16192 I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored) 16193 if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain 16194 for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) 16195 if (I2.getOperandNo() == 0) 16196 if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) { 16197 BaseIndexOffset Ptr; 16198 int64_t PtrDiff; 16199 if (CandidateMatch(OtherST, Ptr, PtrDiff) && 16200 !OverLimitInDependenceCheck(OtherST, RootNode)) 16201 StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); 16202 } 16203 } else 16204 for (auto I = RootNode->use_begin(), E = RootNode->use_end(); 16205 I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored) 16206 if (I.getOperandNo() == 0) 16207 if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { 16208 BaseIndexOffset Ptr; 16209 int64_t PtrDiff; 16210 if (CandidateMatch(OtherST, Ptr, PtrDiff) && 16211 !OverLimitInDependenceCheck(OtherST, RootNode)) 16212 StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); 16213 } 16214 } 16215 16216 // We need to check that merging these stores does not cause a loop in 16217 // the DAG. Any store candidate may depend on another candidate 16218 // indirectly through its operand (we already consider dependencies 16219 // through the chain). Check in parallel by searching up from 16220 // non-chain operands of candidates. 
// Returns true if the first NumStores candidates in StoreNodes can be merged
// without creating a cycle in the DAG, i.e. no candidate store is reachable
// from another candidate's non-chain operands. Returns false (and records the
// bail-out in StoreRootCountMap when the search limit was hit) otherwise.
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
    SDNode *RootNode) {
  // FIXME: We should be able to truncate a full search of
  // predecessors by doing a BFS and keeping tabs on the originating
  // stores from which worklist nodes come from in a similar way to
  // TokenFactor simplification.

  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 8> Worklist;

  // RootNode is a predecessor to all candidates so we need not search
  // past it. Add RootNode (peeking through TokenFactors). Do not count
  // these towards size check.

  Worklist.push_back(RootNode);
  while (!Worklist.empty()) {
    auto N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // Already present in Visited.
    if (N->getOpcode() == ISD::TokenFactor) {
      for (SDValue Op : N->ops())
        Worklist.push_back(Op.getNode());
    }
  }

  // Don't count pruning nodes towards max.
  unsigned int Max = 1024 + Visited.size();
  // Search Ops of store candidates.
  for (unsigned i = 0; i < NumStores; ++i) {
    SDNode *N = StoreNodes[i].MemNode;
    // Of the 4 Store Operands:
    // * Chain (Op 0) -> We have already considered these
    //                   in candidate selection and can be
    //                   safely ignored
    // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
    // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
    //                     but aren't necessarily from the same base node, so
    //                     cycles possible (e.g. via indexed store).
    // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
    //             non-indexed stores). Not constant on all targets (e.g. ARM)
    //             and so can participate in a cycle.
    for (unsigned j = 1; j < N->getNumOperands(); ++j)
      Worklist.push_back(N->getOperand(j).getNode());
  }
  // Search through DAG. We can stop early if we find a store node.
  for (unsigned i = 0; i < NumStores; ++i)
    if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                     Max)) {
      // If the searching bail out, record the StoreNode and RootNode in the
      // StoreRootCountMap. If we have seen the pair many times over a limit,
      // we won't add the StoreNode into StoreNodes set again.
      if (Visited.size() >= Max) {
        auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
        if (RootCount.first == RootNode)
          RootCount.second++;
        else
          RootCount = {RootNode, 1};
      }
      return false;
    }
  return true;
}

// Trim StoreNodes (sorted by offset) so that it begins with a run of stores
// at consecutive ElementSizeBytes-spaced addresses, and return the length of
// that run (>= 2), or 0 if no such run exists. Overlapping or gapped leading
// entries are erased from StoreNodes in the process.
unsigned
DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
                                  int64_t ElementSizeBytes) const {
  while (true) {
    // Find a store past the width of the first store.
    size_t StartIdx = 0;
    while ((StartIdx + 1 < StoreNodes.size()) &&
           StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
               StoreNodes[StartIdx + 1].OffsetFromBase)
      ++StartIdx;

    // Bail if we don't have enough candidates to merge.
    if (StartIdx + 1 >= StoreNodes.size())
      return 0;

    // Trim stores that overlapped with the first store.
    if (StartIdx)
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);

    // Scan the memory operations on the chain and find the first
    // non-consecutive store memory address.
    unsigned NumConsecutiveStores = 1;
    int64_t StartAddress = StoreNodes[0].OffsetFromBase;
    // Check that the addresses are consecutive starting from the second
    // element in the list of stores.
    for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
      int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
        break;
      NumConsecutiveStores = i + 1;
    }
    if (NumConsecutiveStores > 1)
      return NumConsecutiveStores;

    // There are no consecutive stores at the start of the list.
    // Remove the first store and try again.
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
  }
}

// Try to merge runs of consecutive constant stores from StoreNodes into
// wider integer or vector stores (vectors only when AllowVectors). Consumes
// merged/unmergeable entries from the front of StoreNodes as it goes.
// Returns true if any merge was performed.
bool DAGCombiner::tryStoreMergeOfConstants(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
    EVT MemVT, SDNode *RootNode, bool AllowVectors) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  int64_t ElementSizeBytes = MemVT.getStoreSize();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  // Store the constants into memory as one consecutive store.
  while (NumConsecutiveStores >= 2) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    unsigned FirstStoreAlign = FirstInChain->getAlignment();
    unsigned LastLegalType = 1;
    unsigned LastLegalVectorType = 1;
    bool LastIntegerTrunc = false;
    bool NonZero = false;
    unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
      StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
      SDValue StoredVal = ST->getValue();
      bool IsElementZero = false;
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
        IsElementZero = C->isNullValue();
      else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
        IsElementZero = C->getConstantFPValue()->isNullValue();
      if (IsElementZero) {
        // Remember the first zero that follows a non-zero value; it bounds
        // how far we may skip forward when no merge is found below.
        if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
          FirstZeroAfterNonZero = i;
      }
      NonZero |= !IsElementZero;

      // Find a legal type for the constant store.
      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      bool IsFast = false;

      // Break early when size is too large to be legal.
      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      if (TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast) {
        LastIntegerTrunc = false;
        LastLegalType = i + 1;
        // Or check whether a truncstore is legal.
      } else if (TLI.getTypeAction(Context, StoreTy) ==
                 TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValTy =
            TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstInChain->getMemOperand(), &IsFast) &&
            IsFast) {
          LastIntegerTrunc = true;
          LastLegalType = i + 1;
        }
      }

      // We only use vectors if the constant is known to be zero or the
      // target allows it and the function is not marked with the
      // noimplicitfloat attribute.
      if ((!NonZero ||
           TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
          AllowVectors) {
        // Find a legal type for the vector store.
        unsigned Elts = (i + 1) * NumMemElts;
        EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
        if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
            TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
            TLI.allowsMemoryAccess(Context, DL, Ty,
                                   *FirstInChain->getMemOperand(), &IsFast) &&
            IsFast)
          LastLegalVectorType = i + 1;
      }
    }

    bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
    unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;

    // Check if we found a legal integer type that creates a meaningful
    // merge.
    if (NumElem < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have, is if the alignment has
      // improved or we've dropped a non-zero value. Drop as many
      // candidates as we can here.
      unsigned NumSkip = 1;
      while ((NumSkip < NumConsecutiveStores) &&
             (NumSkip < FirstZeroAfterNonZero) &&
             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
        NumSkip++;

      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
      NumConsecutiveStores -= NumElem;
      continue;
    }

    MadeChange |= mergeStoresOfConstantsOrVecElts(
        StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);

    // Remove merged stores for next iteration.
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
    NumConsecutiveStores -= NumElem;
  }
  return MadeChange;
}

// Try to merge runs of consecutive stores of extracted vector elements /
// subvectors from StoreNodes into wider vector stores. Consumes entries from
// the front of StoreNodes as it goes. Returns true if any merge was
// performed.
bool DAGCombiner::tryStoreMergeOfExtracts(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
    EVT MemVT, SDNode *RootNode) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  // Loop on Consecutive Stores on success.
  while (NumConsecutiveStores >= 2) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    unsigned FirstStoreAlign = FirstInChain->getAlignment();
    unsigned NumStoresToMerge = 1;
    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
      // Find a legal type for the vector store.
      unsigned Elts = (i + 1) * NumMemElts;
      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
      bool IsFast = false;

      // Break early when size is too large to be legal.
      if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
          TLI.allowsMemoryAccess(Context, DL, Ty,
                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast)
        NumStoresToMerge = i + 1;
    }

    // Check if we found a legal integer type creating a meaningful
    // merge.
    if (NumStoresToMerge < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have, is if the alignment has
      // improved. Drop as many candidates as we can here.
      unsigned NumSkip = 1;
      while ((NumSkip < NumConsecutiveStores) &&
             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
        NumSkip++;

      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(),
                       StoreNodes.begin() + NumStoresToMerge);
      NumConsecutiveStores -= NumStoresToMerge;
      continue;
    }

    MadeChange |= mergeStoresOfConstantsOrVecElts(
        StoreNodes, MemVT, NumStoresToMerge, false, true, false);

    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
    NumConsecutiveStores -= NumStoresToMerge;
  }
  return MadeChange;
}

// Try to merge runs of consecutive (load, store) pairs from StoreNodes into
// one wide load feeding one wide store. The loads must share a chain and be
// consecutive as well. Consumes entries from the fronts of StoreNodes and the
// locally collected LoadNodes as it goes. Returns true if any merge was
// performed.
bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
                                       unsigned NumConsecutiveStores, EVT MemVT,
                                       SDNode *RootNode, bool AllowVectors,
                                       bool IsNonTemporalStore,
                                       bool IsNonTemporalLoad) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  int64_t ElementSizeBytes = MemVT.getStoreSize();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  int64_t StartAddress = StoreNodes[0].OffsetFromBase;

  // Look for load nodes which are used by the stored values.
  SmallVector<MemOpLink, 8> LoadNodes;

  // Find acceptable loads. Loads need to have the same chain (token factor),
  // must not be zext, volatile, indexed, and they must be consecutive.
  BaseIndexOffset LdBasePtr;

  for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
    SDValue Val = peekThroughBitcasts(St->getValue());
    LoadSDNode *Ld = cast<LoadSDNode>(Val);

    BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
    // If this is not the first ptr that we check.
    int64_t LdOffset = 0;
    if (LdBasePtr.getBase().getNode()) {
      // The base ptr must be the same.
      if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
        break;
    } else {
      // Check that all other base pointers are the same as this one.
      LdBasePtr = LdPtr;
    }

    // We found a potential memory operand to merge.
    LoadNodes.push_back(MemOpLink(Ld, LdOffset));
  }

  while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
    Align RequiredAlignment;
    bool NeedRotate = false;
    if (LoadNodes.size() == 2) {
      // If we have load/store pair instructions and we only have two values,
      // don't bother merging.
      if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
          StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
        LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
        break;
      }
      // If the loads are reversed, see if we can rotate the halves into place.
      int64_t Offset0 = LoadNodes[0].OffsetFromBase;
      int64_t Offset1 = LoadNodes[1].OffsetFromBase;
      EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
      if (Offset0 - Offset1 == ElementSizeBytes &&
          (hasOperation(ISD::ROTL, PairVT) ||
           hasOperation(ISD::ROTR, PairVT))) {
        std::swap(LoadNodes[0], LoadNodes[1]);
        NeedRotate = true;
      }
    }
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    unsigned FirstStoreAlign = FirstInChain->getAlignment();
    LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);

    // Scan the memory operations on the chain and find the first
    // non-consecutive load memory address. These variables hold the index in
    // the store node array.

    unsigned LastConsecutiveLoad = 1;

    // This variable refers to the size and not index in the array.
    unsigned LastLegalVectorType = 1;
    unsigned LastLegalIntegerType = 1;
    bool isDereferenceable = true;
    bool DoIntegerTruncate = false;
    StartAddress = LoadNodes[0].OffsetFromBase;
    SDValue LoadChain = FirstLoad->getChain();
    for (unsigned i = 1; i < LoadNodes.size(); ++i) {
      // All loads must share the same chain.
      if (LoadNodes[i].MemNode->getChain() != LoadChain)
        break;

      int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
        break;
      LastConsecutiveLoad = i;

      // The merged load is only dereferenceable if every load it covers is.
      if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
        isDereferenceable = false;

      // Find a legal type for the vector store.
      unsigned Elts = (i + 1) * NumMemElts;
      EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);

      // Break early when size is too large to be legal.
      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      bool IsFastSt = false;
      bool IsFastLd = false;
      if (TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalVectorType = i + 1;
      }

      // Find a legal type for the integer store.
      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
      StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      if (TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalIntegerType = i + 1;
        DoIntegerTruncate = false;
        // Or check whether a truncstore and extload is legal.
      } else if (TLI.getTypeAction(Context, StoreTy) ==
                 TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstInChain->getMemOperand(), &IsFastSt) &&
            IsFastSt &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstLoad->getMemOperand(), &IsFastLd) &&
            IsFastLd) {
          LastLegalIntegerType = i + 1;
          DoIntegerTruncate = true;
        }
      }
    }

    // Only use vector types if the vector type is larger than the integer
    // type. If they are the same, use integers.
    bool UseVectorTy =
        LastLegalVectorType > LastLegalIntegerType && AllowVectors;
    unsigned LastLegalType =
        std::max(LastLegalVectorType, LastLegalIntegerType);

    // We add +1 here because the LastXXX variables refer to location while
    // the NumElem refers to array/index size.
    unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
    NumElem = std::min(LastLegalType, NumElem);
    unsigned FirstLoadAlign = FirstLoad->getAlignment();

    if (NumElem < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have is if the alignment of either
      // the load or store has improved. Drop as many candidates as we
      // can here.
      unsigned NumSkip = 1;
      while ((NumSkip < LoadNodes.size()) &&
             (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
        NumSkip++;
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
      NumConsecutiveStores -= NumElem;
      continue;
    }

    // Find if it is better to use vectors or integers to load and store
    // to memory.
    EVT JointMemOpVT;
    if (UseVectorTy) {
      // Find a legal type for the vector store.
      unsigned Elts = NumElem * NumMemElts;
      JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
    } else {
      unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
      JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
    }

    SDLoc LoadDL(LoadNodes[0].MemNode);
    SDLoc StoreDL(StoreNodes[0].MemNode);

    // The merged loads are required to have the same incoming chain, so
    // using the first's chain is acceptable.

    SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
    AddToWorklist(NewStoreChain.getNode());

    MachineMemOperand::Flags LdMMOFlags =
        isDereferenceable ? MachineMemOperand::MODereferenceable
                          : MachineMemOperand::MONone;
    if (IsNonTemporalLoad)
      LdMMOFlags |= MachineMemOperand::MONonTemporal;

    MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
                                              ? MachineMemOperand::MONonTemporal
                                              : MachineMemOperand::MONone;

    SDValue NewLoad, NewStore;
    if (UseVectorTy || !DoIntegerTruncate) {
      NewLoad = DAG.getLoad(
          JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
          FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
      SDValue StoreOp = NewLoad;
      if (NeedRotate) {
        unsigned LoadWidth = ElementSizeBytes * 8 * 2;
        assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
               "Unexpected type for rotate-able load pair");
        SDValue RotAmt =
            DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
        // Target can convert to the identical ROTR if it does not have ROTL.
        StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
      }
      NewStore = DAG.getStore(
          NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
          FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
    } else { // This must be the truncstore/extload case
      EVT ExtendedTy =
          TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
      NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
                               FirstLoad->getChain(), FirstLoad->getBasePtr(),
                               FirstLoad->getPointerInfo(), JointMemOpVT,
                               FirstLoadAlign, LdMMOFlags);
      NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
                                   FirstInChain->getBasePtr(),
                                   FirstInChain->getPointerInfo(), JointMemOpVT,
                                   FirstInChain->getAlignment(),
                                   FirstInChain->getMemOperand()->getFlags());
    }

    // Transfer chain users from old loads to the new load.
    for (unsigned i = 0; i < NumElem; ++i) {
      LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
                                    SDValue(NewLoad.getNode(), 1));
    }

    // Replace all stores with the new store. Recursively remove corresponding
    // values if they are no longer used.
    for (unsigned i = 0; i < NumElem; ++i) {
      SDValue Val = StoreNodes[i].MemNode->getOperand(1);
      CombineTo(StoreNodes[i].MemNode, NewStore);
      if (Val.getNode()->use_empty())
        recursivelyDeleteUnusedNodes(Val.getNode());
    }

    MadeChange = true;
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
    LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
    NumConsecutiveStores -= NumElem;
  }
  return MadeChange;
}

bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
  if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
    return false;

  // TODO: Extend this function to merge stores of scalable vectors.
16800 // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8> 16801 // store since we know <vscale x 16 x i8> is exactly twice as large as 16802 // <vscale x 8 x i8>). Until then, bail out for scalable vectors. 16803 EVT MemVT = St->getMemoryVT(); 16804 if (MemVT.isScalableVector()) 16805 return false; 16806 if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) 16807 return false; 16808 16809 // This function cannot currently deal with non-byte-sized memory sizes. 16810 int64_t ElementSizeBytes = MemVT.getStoreSize(); 16811 if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits()) 16812 return false; 16813 16814 // Do not bother looking at stored values that are not constants, loads, or 16815 // extracted vector elements. 16816 SDValue StoredVal = peekThroughBitcasts(St->getValue()); 16817 const StoreSource StoreSrc = getStoreSource(StoredVal); 16818 if (StoreSrc == StoreSource::Unknown) 16819 return false; 16820 16821 SmallVector<MemOpLink, 8> StoreNodes; 16822 SDNode *RootNode; 16823 // Find potential store merge candidates by searching through chain sub-DAG 16824 getStoreMergeCandidates(St, StoreNodes, RootNode); 16825 16826 // Check if there is anything to merge. 16827 if (StoreNodes.size() < 2) 16828 return false; 16829 16830 // Sort the memory operands according to their distance from the 16831 // base pointer. 16832 llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) { 16833 return LHS.OffsetFromBase < RHS.OffsetFromBase; 16834 }); 16835 16836 bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute( 16837 Attribute::NoImplicitFloat); 16838 bool IsNonTemporalStore = St->isNonTemporal(); 16839 bool IsNonTemporalLoad = StoreSrc == StoreSource::Load && 16840 cast<LoadSDNode>(StoredVal)->isNonTemporal(); 16841 16842 // Store Merge attempts to merge the lowest stores. 
This generally 16843 // works out as if successful, as the remaining stores are checked 16844 // after the first collection of stores is merged. However, in the 16845 // case that a non-mergeable store is found first, e.g., {p[-2], 16846 // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent 16847 // mergeable cases. To prevent this, we prune such stores from the 16848 // front of StoreNodes here. 16849 bool MadeChange = false; 16850 while (StoreNodes.size() > 1) { 16851 unsigned NumConsecutiveStores = 16852 getConsecutiveStores(StoreNodes, ElementSizeBytes); 16853 // There are no more stores in the list to examine. 16854 if (NumConsecutiveStores == 0) 16855 return MadeChange; 16856 16857 // We have at least 2 consecutive stores. Try to merge them. 16858 assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); 16859 switch (StoreSrc) { 16860 case StoreSource::Constant: 16861 MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, 16862 MemVT, RootNode, AllowVectors); 16863 break; 16864 16865 case StoreSource::Extract: 16866 MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores, 16867 MemVT, RootNode); 16868 break; 16869 16870 case StoreSource::Load: 16871 MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores, 16872 MemVT, RootNode, AllowVectors, 16873 IsNonTemporalStore, IsNonTemporalLoad); 16874 break; 16875 16876 default: 16877 llvm_unreachable("Unhandled store source type"); 16878 } 16879 } 16880 return MadeChange; 16881 } 16882 16883 SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { 16884 SDLoc SL(ST); 16885 SDValue ReplStore; 16886 16887 // Replace the chain to avoid dependency. 
16888 if (ST->isTruncatingStore()) { 16889 ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), 16890 ST->getBasePtr(), ST->getMemoryVT(), 16891 ST->getMemOperand()); 16892 } else { 16893 ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), 16894 ST->getMemOperand()); 16895 } 16896 16897 // Create token to keep both nodes around. 16898 SDValue Token = DAG.getNode(ISD::TokenFactor, SL, 16899 MVT::Other, ST->getChain(), ReplStore); 16900 16901 // Make sure the new and old chains are cleaned up. 16902 AddToWorklist(Token.getNode()); 16903 16904 // Don't add users to work list. 16905 return CombineTo(ST, Token, false); 16906 } 16907 16908 SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { 16909 SDValue Value = ST->getValue(); 16910 if (Value.getOpcode() == ISD::TargetConstantFP) 16911 return SDValue(); 16912 16913 if (!ISD::isNormalStore(ST)) 16914 return SDValue(); 16915 16916 SDLoc DL(ST); 16917 16918 SDValue Chain = ST->getChain(); 16919 SDValue Ptr = ST->getBasePtr(); 16920 16921 const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value); 16922 16923 // NOTE: If the original store is volatile, this transform must not increase 16924 // the number of stores. For example, on x86-32 an f64 can be stored in one 16925 // processor operation but an i64 (which is not legal) requires two. So the 16926 // transform should not be done in this case. 16927 16928 SDValue Tmp; 16929 switch (CFP->getSimpleValueType(0).SimpleTy) { 16930 default: 16931 llvm_unreachable("Unknown FP type"); 16932 case MVT::f16: // We don't do this for these yet. 16933 case MVT::f80: 16934 case MVT::f128: 16935 case MVT::ppcf128: 16936 return SDValue(); 16937 case MVT::f32: 16938 if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) || 16939 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 16940 ; 16941 Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). 
16942 bitcastToAPInt().getZExtValue(), SDLoc(CFP), 16943 MVT::i32); 16944 return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); 16945 } 16946 16947 return SDValue(); 16948 case MVT::f64: 16949 if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && 16950 ST->isSimple()) || 16951 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { 16952 ; 16953 Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). 16954 getZExtValue(), SDLoc(CFP), MVT::i64); 16955 return DAG.getStore(Chain, DL, Tmp, 16956 Ptr, ST->getMemOperand()); 16957 } 16958 16959 if (ST->isSimple() && 16960 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { 16961 // Many FP stores are not made apparent until after legalize, e.g. for 16962 // argument passing. Since this is so common, custom legalize the 16963 // 64-bit integer store into two 32-bit stores. 16964 uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); 16965 SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); 16966 SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); 16967 if (DAG.getDataLayout().isBigEndian()) 16968 std::swap(Lo, Hi); 16969 16970 unsigned Alignment = ST->getAlignment(); 16971 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); 16972 AAMDNodes AAInfo = ST->getAAInfo(); 16973 16974 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), 16975 ST->getAlignment(), MMOFlags, AAInfo); 16976 Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL); 16977 Alignment = MinAlign(Alignment, 4U); 16978 SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, 16979 ST->getPointerInfo().getWithOffset(4), 16980 Alignment, MMOFlags, AAInfo); 16981 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 16982 St0, St1); 16983 } 16984 16985 return SDValue(); 16986 } 16987 } 16988 16989 SDValue DAGCombiner::visitSTORE(SDNode *N) { 16990 StoreSDNode *ST = cast<StoreSDNode>(N); 16991 SDValue Chain = ST->getChain(); 16992 SDValue Value = ST->getValue(); 16993 SDValue Ptr = ST->getBasePtr(); 
  // (continuation of DAGCombiner::visitSTORE)

  // If this is a store of a bit convert, store the input value if the
  // resultant store does not need a higher alignment than the original.
  if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
      ST->isUnindexed()) {
    EVT SVT = Value.getOperand(0).getValueType();
    // If the store is volatile, we only want to change the store type if the
    // resulting store is legal. Otherwise we might increase the number of
    // memory accesses. We don't care if the original type was legal or not
    // as we assume software couldn't rely on the number of accesses of an
    // illegal type.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (((!LegalOperations && ST->isSimple()) ||
         TLI.isOperationLegal(ISD::STORE, SVT)) &&
        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
                                     DAG, *ST->getMemOperand())) {
      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                          ST->getMemOperand());
    }
  }

  // Turn 'store undef, Ptr' -> nothing.
  if (Value.isUndef() && ST->isUnindexed())
    return Chain;

  // Try to infer better alignment information than the store already has.
  if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
    if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
      if (*Alignment > ST->getAlign() &&
          isAligned(*Alignment, ST->getSrcValueOffset())) {
        SDValue NewStore =
            DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
                              ST->getMemoryVT(), *Alignment,
                              ST->getMemOperand()->getFlags(), ST->getAAInfo());
        // NewStore will always be N as we are only refining the alignment
        assert(NewStore.getNode() == N);
        (void)NewStore;
      }
    }
  }

  // Try transforming a pair floating point load / store ops to integer
  // load / store ops.
  if (SDValue NewST = TransformFPLoadStorePair(N))
    return NewST;

  // Try transforming several stores into STORE (BSWAP).
  if (SDValue Store = MatchStoreCombine(ST))
    return Store;

  if (ST->isUnindexed()) {
    // Walk up chain skipping non-aliasing memory nodes, on this store and any
    // adjacent stores.
    if (findBetterNeighborChains(ST)) {
      // replaceStoreChain uses CombineTo, which handled all of the worklist
      // manipulation. Return the original node to not do anything else.
      return SDValue(ST, 0);
    }
    // findBetterNeighborChains may have updated the chain in place.
    Chain = ST->getChain();
  }

  // FIXME: is there such a thing as a truncating indexed store?
  if (ST->isTruncatingStore() && ST->isUnindexed() &&
      Value.getValueType().isInteger() &&
      (!isa<ConstantSDNode>(Value) ||
       !cast<ConstantSDNode>(Value)->isOpaque())) {
    APInt TruncDemandedBits =
        APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                             ST->getMemoryVT().getScalarSizeInBits());

    // See if we can simplify the input to this truncstore with knowledge that
    // only the low bits are being used. For example:
    // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
    AddToWorklist(Value.getNode());
    if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
                               ST->getMemOperand());

    // Otherwise, see if we can simplify the operation with
    // SimplifyDemandedBits, which only works if the value has a single use.
    if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
      // Re-visit the store if anything changed and the store hasn't been merged
      // with another node (N is deleted) SimplifyDemandedBits will add Value's
      // node back to the worklist if necessary, but we also need to re-visit
      // the Store node itself.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // If this is a load followed by a store to the same location, then the store
  // is dead/noop.
  // TODO: Can relax for unordered atomics (see D66309)
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
    if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
        ST->isUnindexed() && ST->isSimple() &&
        // There can't be any side effects between the load and store, such as
        // a call or store.
        Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
      // The store is dead, remove it.
      return Chain;
    }
  }

  // TODO: Can relax for unordered atomics (see D66309)
  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
    if (ST->isUnindexed() && ST->isSimple() &&
        ST1->isUnindexed() && ST1->isSimple()) {
      if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
          ST->getMemoryVT() == ST1->getMemoryVT()) {
        // If this is a store followed by a store with the same value to the
        // same location, then the store is dead/noop.
        return Chain;
      }

      if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
          !ST1->getBasePtr().isUndef() &&
          // BaseIndexOffset and the code below requires knowing the size
          // of a vector, so bail out if MemoryVT is scalable.
          !ST1->getMemoryVT().isScalableVector()) {
        const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
        const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
        unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
        unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits();
        // If this is a store who's preceding store to a subset of the current
        // location and no one other node is chained to that store we can
        // effectively drop the store. Do not remove stores to undef as they may
        // be used as data sinks.
        if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
          CombineTo(ST1, ST1->getChain());
          return SDValue();
        }
      }
    }
  }

  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
  // truncating store. We can do this even if this is already a truncstore.
  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                            ST->getMemoryVT())) {
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
  }

  // Always perform this optimization before types are legal. If the target
  // prefers, also try this after legalization to catch stores that were created
  // by intrinsics or other nodes.
  if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
    while (true) {
      // There can be multiple store sequences on the same chain.
      // Keep trying to merge store sequences until we are unable to do so
      // or until we merge the last store on the chain.
      bool Changed = mergeConsecutiveStores(ST);
      if (!Changed) break;
      // Return N as merge only uses CombineTo and no worklist clean
      // up is necessary.
      if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
        return SDValue(N, 0);
    }
  }

  // Try transforming N to an indexed store.
  // (tail of DAGCombiner::visitSTORE)
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
  //
  // Make sure to do this only after attempting to merge stores in order to
  // avoid changing the types of some subset of stores due to visit order,
  // preventing their merging.
  if (isa<ConstantFPSDNode>(ST->getValue())) {
    if (SDValue NewSt = replaceStoreOfFPConstant(ST))
      return NewSt;
  }

  if (SDValue NewSt = splitMergedValStore(ST))
    return NewSt;

  return ReduceLoadOpStoreWidth(N);
}

/// Combine entry point for LIFETIME_END. Walks up the single-use chain from
/// the LIFETIME_END node and deletes any simple, unindexed store that lies
/// entirely within the object whose lifetime is ending — such a store can
/// never be observed.
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
  const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
  if (!LifetimeEnd->hasOffset())
    return SDValue();

  const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
                                        LifetimeEnd->getOffset(), false);

  // We walk up the chains to find stores.
  SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
  while (!Chains.empty()) {
    SDValue Chain = Chains.back();
    Chains.pop_back();
    // Only traverse chain values with a single user, so removing a store
    // cannot affect any other chain consumer.
    if (!Chain.hasOneUse())
      continue;
    switch (Chain.getOpcode()) {
    case ISD::TokenFactor:
      // Queue every incoming chain of the token factor.
      for (unsigned Nops = Chain.getNumOperands(); Nops;)
        Chains.push_back(Chain.getOperand(--Nops));
      break;
    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END:
      // We can forward past any lifetime start/end that can be proven not to
      // alias the node.
      if (!isAlias(Chain.getNode(), N))
        Chains.push_back(Chain.getOperand(0));
      break;
    case ISD::STORE: {
      // Opcode check above guarantees the cast succeeds.
      StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
      // TODO: Can relax for unordered atomics (see D66309)
      if (!ST->isSimple() || ST->isIndexed())
        continue;
      const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
      // If we store purely within object bounds just before its lifetime ends,
      // we can remove the store.
      if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
                                   ST->getMemoryVT().getStoreSizeInBits())) {
        LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
                   dbgs() << "\nwithin LIFETIME_END of : ";
                   LifetimeEndBase.dump(); dbgs() << "\n");
        CombineTo(ST, ST->getChain());
        return SDValue(N, 0);
      }
    }
    }
  }
  return SDValue();
}

/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficent to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
///            (shl (zext I to i64), 32)), addr)  -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting for other merged store can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8},  i32 store --> two i16 stores.
/// For pair of {i8, i8},   i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
17245 /// 17246 /// The store patterns are commonly seen from the simple code snippet below 17247 /// if only std::make_pair(...) is sroa transformed before inlined into hoo. 17248 /// void goo(const std::pair<int, float> &); 17249 /// hoo() { 17250 /// ... 17251 /// goo(std::make_pair(tmp, ftmp)); 17252 /// ... 17253 /// } 17254 /// 17255 SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { 17256 if (OptLevel == CodeGenOpt::None) 17257 return SDValue(); 17258 17259 // Can't change the number of memory accesses for a volatile store or break 17260 // atomicity for an atomic one. 17261 if (!ST->isSimple()) 17262 return SDValue(); 17263 17264 SDValue Val = ST->getValue(); 17265 SDLoc DL(ST); 17266 17267 // Match OR operand. 17268 if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR) 17269 return SDValue(); 17270 17271 // Match SHL operand and get Lower and Higher parts of Val. 17272 SDValue Op1 = Val.getOperand(0); 17273 SDValue Op2 = Val.getOperand(1); 17274 SDValue Lo, Hi; 17275 if (Op1.getOpcode() != ISD::SHL) { 17276 std::swap(Op1, Op2); 17277 if (Op1.getOpcode() != ISD::SHL) 17278 return SDValue(); 17279 } 17280 Lo = Op2; 17281 Hi = Op1.getOperand(0); 17282 if (!Op1.hasOneUse()) 17283 return SDValue(); 17284 17285 // Match shift amount to HalfValBitSize. 17286 unsigned HalfValBitSize = Val.getValueSizeInBits() / 2; 17287 ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1)); 17288 if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) 17289 return SDValue(); 17290 17291 // Lo and Hi are zero-extended from int with size less equal than 32 17292 // to i64. 
17293 if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || 17294 !Lo.getOperand(0).getValueType().isScalarInteger() || 17295 Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize || 17296 Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || 17297 !Hi.getOperand(0).getValueType().isScalarInteger() || 17298 Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize) 17299 return SDValue(); 17300 17301 // Use the EVT of low and high parts before bitcast as the input 17302 // of target query. 17303 EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST) 17304 ? Lo.getOperand(0).getValueType() 17305 : Lo.getValueType(); 17306 EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST) 17307 ? Hi.getOperand(0).getValueType() 17308 : Hi.getValueType(); 17309 if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy)) 17310 return SDValue(); 17311 17312 // Start to split store. 17313 unsigned Alignment = ST->getAlignment(); 17314 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); 17315 AAMDNodes AAInfo = ST->getAAInfo(); 17316 17317 // Change the sizes of Lo and Hi's value types to HalfValBitSize. 17318 EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); 17319 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); 17320 Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); 17321 17322 SDValue Chain = ST->getChain(); 17323 SDValue Ptr = ST->getBasePtr(); 17324 // Lower value store. 17325 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), 17326 ST->getAlignment(), MMOFlags, AAInfo); 17327 Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL); 17328 // Higher value store. 
17329 SDValue St1 = 17330 DAG.getStore(St0, DL, Hi, Ptr, 17331 ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), 17332 Alignment / 2, MMOFlags, AAInfo); 17333 return St1; 17334 } 17335 17336 /// Convert a disguised subvector insertion into a shuffle: 17337 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { 17338 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && 17339 "Expected extract_vector_elt"); 17340 SDValue InsertVal = N->getOperand(1); 17341 SDValue Vec = N->getOperand(0); 17342 17343 // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), 17344 // InsIndex) 17345 // --> (vector_shuffle X, Y) and variations where shuffle operands may be 17346 // CONCAT_VECTORS. 17347 if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() && 17348 InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 17349 isa<ConstantSDNode>(InsertVal.getOperand(1))) { 17350 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode()); 17351 ArrayRef<int> Mask = SVN->getMask(); 17352 17353 SDValue X = Vec.getOperand(0); 17354 SDValue Y = Vec.getOperand(1); 17355 17356 // Vec's operand 0 is using indices from 0 to N-1 and 17357 // operand 1 from N to 2N - 1, where N is the number of 17358 // elements in the vectors. 17359 SDValue InsertVal0 = InsertVal.getOperand(0); 17360 int ElementOffset = -1; 17361 17362 // We explore the inputs of the shuffle in order to see if we find the 17363 // source of the extract_vector_elt. If so, we can use it to modify the 17364 // shuffle rather than perform an insert_vector_elt. 
    // (continuation of DAGCombiner::combineInsertEltToShuffle)
    // Worklist of (element offset into the shuffle's combined index space,
    // candidate source vector). Y contributes indices starting at Mask.size().
    SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
    ArgWorkList.emplace_back(Mask.size(), Y);
    ArgWorkList.emplace_back(0, X);

    while (!ArgWorkList.empty()) {
      int ArgOffset;
      SDValue ArgVal;
      std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();

      if (ArgVal == InsertVal0) {
        ElementOffset = ArgOffset;
        break;
      }

      // Peek through concat_vector.
      if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
        int CurrentArgOffset =
            ArgOffset + ArgVal.getValueType().getVectorNumElements();
        int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
        // Push operands in reverse so they are visited in forward order.
        for (SDValue Op : reverse(ArgVal->ops())) {
          CurrentArgOffset -= Step;
          ArgWorkList.emplace_back(CurrentArgOffset, Op);
        }

        // Make sure we went through all the elements and did not screw up index
        // computation.
        assert(CurrentArgOffset == ArgOffset);
      }
    }

    if (ElementOffset != -1) {
      SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());

      auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
      NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
      assert(NewMask[InsIndex] <
                 (int)(2 * Vec.getValueType().getVectorNumElements()) &&
             NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");

      SDValue LegalShuffle =
          TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
                                      Y, NewMask, DAG);
      if (LegalShuffle)
        return LegalShuffle;
    }
  }

  // insert_vector_elt V, (bitcast X from vector type), IdxC -->
  // bitcast(shuffle (bitcast V), (extended X), Mask)
  // Note: We do not use an insert_subvector node because that requires a
  // legal subvector type.
  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
      !InsertVal.getOperand(0).getValueType().isVector())
    return SDValue();

  SDValue SubVec = InsertVal.getOperand(0);
  SDValue DestVec = N->getOperand(0);
  EVT SubVecVT = SubVec.getValueType();
  EVT VT = DestVec.getValueType();
  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
  // If the source only has a single vector element, the cost of creating adding
  // it to a vector is likely to exceed the cost of a insert_vector_elt.
  if (NumSrcElts == 1)
    return SDValue();
  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
  unsigned NumMaskVals = ExtendRatio * NumSrcElts;

  // Step 1: Create a shuffle mask that implements this insert operation. The
  // vector that we are inserting into will be operand 0 of the shuffle, so
  // those elements are just 'i'. The inserted subvector is in the first
  // positions of operand 1 of the shuffle. Example:
  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
  SmallVector<int, 16> Mask(NumMaskVals);
  for (unsigned i = 0; i != NumMaskVals; ++i) {
    if (i / NumSrcElts == InsIndex)
      Mask[i] = (i % NumSrcElts) + NumMaskVals;
    else
      Mask[i] = i;
  }

  // Bail out if the target can not handle the shuffle we want to create.
  EVT SubVecEltVT = SubVecVT.getVectorElementType();
  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
    return SDValue();

  // Step 2: Create a wide vector from the inserted source vector by appending
  // undefined elements. This is the same size as our destination vector.
  SDLoc DL(N);
  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
  ConcatOps[0] = SubVec;
  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);

  // Step 3: Shuffle in the padded subvector.
  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
  AddToWorklist(PaddedSubV.getNode());
  AddToWorklist(DestVecBC.getNode());
  AddToWorklist(Shuf.getNode());
  return DAG.getBitcast(VT, Shuf);
}

/// Combine entry point for INSERT_VECTOR_ELT: folds out-of-bounds and
/// redundant inserts, canonicalizes chains of inserts by index, and lowers
/// insert-into-BUILD_VECTOR/UNDEF to a new BUILD_VECTOR.
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
  SDValue InVec = N->getOperand(0);
  SDValue InVal = N->getOperand(1);
  SDValue EltNo = N->getOperand(2);
  SDLoc DL(N);

  EVT VT = InVec.getValueType();
  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);

  // Insert into out-of-bounds element is undefined.
  if (IndexC && VT.isFixedLengthVector() &&
      IndexC->getZExtValue() >= VT.getVectorNumElements())
    return DAG.getUNDEF(VT);

  // Remove redundant insertions:
  // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
    return InVec;

  if (!IndexC) {
    // If this is variable insert to undef vector, it might be better to splat:
    // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
    if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
      if (VT.isScalableVector())
        return DAG.getSplatVector(VT, DL, InVal);
      else {
        SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
        return DAG.getBuildVector(VT, DL, Ops);
      }
    }
    return SDValue();
  }

  // The folds below require a fixed-width vector and a known element count.
  if (VT.isScalableVector())
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();

  // We must know which element is being inserted for folds below here.
  unsigned Elt = IndexC->getZExtValue();
  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
    return Shuf;

  // Canonicalize insert_vector_elt dag nodes.
  // Example:
  // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
  // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
  //
  // Do this only if the child insert_vector node has one use; also
  // do this only if indices are both constants and Idx1 < Idx0.
  if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
      && isa<ConstantSDNode>(InVec.getOperand(2))) {
    unsigned OtherElt = InVec.getConstantOperandVal(2);
    if (Elt < OtherElt) {
      // Swap nodes.
      SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                                  InVec.getOperand(0), InVal, EltNo);
      AddToWorklist(NewOp.getNode());
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
                         VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
    }
  }

  // If we can't generate a legal BUILD_VECTOR, exit
  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
    return SDValue();

  // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
  // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
  // vector elements.
17538 SmallVector<SDValue, 8> Ops; 17539 // Do not combine these two vectors if the output vector will not replace 17540 // the input vector. 17541 if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { 17542 Ops.append(InVec.getNode()->op_begin(), 17543 InVec.getNode()->op_end()); 17544 } else if (InVec.isUndef()) { 17545 Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType())); 17546 } else { 17547 return SDValue(); 17548 } 17549 assert(Ops.size() == NumElts && "Unexpected vector size"); 17550 17551 // Insert the element 17552 if (Elt < Ops.size()) { 17553 // All the operands of BUILD_VECTOR must have the same type; 17554 // we enforce that here. 17555 EVT OpVT = Ops[0].getValueType(); 17556 Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; 17557 } 17558 17559 // Return the new vector 17560 return DAG.getBuildVector(VT, DL, Ops); 17561 } 17562 17563 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, 17564 SDValue EltNo, 17565 LoadSDNode *OriginalLoad) { 17566 assert(OriginalLoad->isSimple()); 17567 17568 EVT ResultVT = EVE->getValueType(0); 17569 EVT VecEltVT = InVecVT.getVectorElementType(); 17570 Align Alignment = OriginalLoad->getAlign(); 17571 Align NewAlign = DAG.getDataLayout().getABITypeAlign( 17572 VecEltVT.getTypeForEVT(*DAG.getContext())); 17573 17574 if (NewAlign > Alignment || 17575 !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) 17576 return SDValue(); 17577 17578 ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? 
17579 ISD::NON_EXTLOAD : ISD::EXTLOAD; 17580 if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) 17581 return SDValue(); 17582 17583 Alignment = NewAlign; 17584 17585 SDValue NewPtr = OriginalLoad->getBasePtr(); 17586 SDValue Offset; 17587 EVT PtrType = NewPtr.getValueType(); 17588 MachinePointerInfo MPI; 17589 SDLoc DL(EVE); 17590 if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) { 17591 int Elt = ConstEltNo->getZExtValue(); 17592 unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; 17593 Offset = DAG.getConstant(PtrOff, DL, PtrType); 17594 MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); 17595 } else { 17596 Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType); 17597 Offset = DAG.getNode( 17598 ISD::MUL, DL, PtrType, Offset, 17599 DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType)); 17600 // Discard the pointer info except the address space because the memory 17601 // operand can't represent this new access since the offset is variable. 17602 MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); 17603 } 17604 NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL); 17605 17606 // The replacement we need to do here is a little tricky: we need to 17607 // replace an extractelement of a load with a load. 17608 // Use ReplaceAllUsesOfValuesWith to do the replacement. 17609 // Note that this replacement assumes that the extractvalue is the only 17610 // use of the load; that's okay because we don't want to perform this 17611 // transformation in other cases anyway. 17612 SDValue Load; 17613 SDValue Chain; 17614 if (ResultVT.bitsGT(VecEltVT)) { 17615 // If the result type of vextract is wider than the load, then issue an 17616 // extending load instead. 17617 ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, 17618 VecEltVT) 17619 ? 
ISD::ZEXTLOAD 17620 : ISD::EXTLOAD; 17621 Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, 17622 OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, 17623 Alignment, OriginalLoad->getMemOperand()->getFlags(), 17624 OriginalLoad->getAAInfo()); 17625 Chain = Load.getValue(1); 17626 } else { 17627 Load = DAG.getLoad( 17628 VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment, 17629 OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); 17630 Chain = Load.getValue(1); 17631 if (ResultVT.bitsLT(VecEltVT)) 17632 Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); 17633 else 17634 Load = DAG.getBitcast(ResultVT, Load); 17635 } 17636 WorklistRemover DeadNodes(*this); 17637 SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; 17638 SDValue To[] = { Load, Chain }; 17639 DAG.ReplaceAllUsesOfValuesWith(From, To, 2); 17640 // Make sure to revisit this node to clean it up; it will usually be dead. 17641 AddToWorklist(EVE); 17642 // Since we're explicitly calling ReplaceAllUses, add the new node to the 17643 // worklist explicitly as well. 17644 AddToWorklistWithUsers(Load.getNode()); 17645 ++OpsNarrowed; 17646 return SDValue(EVE, 0); 17647 } 17648 17649 /// Transform a vector binary operation into a scalar binary operation by moving 17650 /// the math/logic after an extract element of a vector. 17651 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, 17652 bool LegalOperations) { 17653 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17654 SDValue Vec = ExtElt->getOperand(0); 17655 SDValue Index = ExtElt->getOperand(1); 17656 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 17657 if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || 17658 Vec.getNode()->getNumValues() != 1) 17659 return SDValue(); 17660 17661 // Targets may want to avoid this to prevent an expensive register transfer. 
17662 if (!TLI.shouldScalarizeBinop(Vec)) 17663 return SDValue(); 17664 17665 // Extracting an element of a vector constant is constant-folded, so this 17666 // transform is just replacing a vector op with a scalar op while moving the 17667 // extract. 17668 SDValue Op0 = Vec.getOperand(0); 17669 SDValue Op1 = Vec.getOperand(1); 17670 if (isAnyConstantBuildVector(Op0, true) || 17671 isAnyConstantBuildVector(Op1, true)) { 17672 // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C' 17673 // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC) 17674 SDLoc DL(ExtElt); 17675 EVT VT = ExtElt->getValueType(0); 17676 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index); 17677 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index); 17678 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1); 17679 } 17680 17681 return SDValue(); 17682 } 17683 17684 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { 17685 SDValue VecOp = N->getOperand(0); 17686 SDValue Index = N->getOperand(1); 17687 EVT ScalarVT = N->getValueType(0); 17688 EVT VecVT = VecOp.getValueType(); 17689 if (VecOp.isUndef()) 17690 return DAG.getUNDEF(ScalarVT); 17691 17692 // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val 17693 // 17694 // This only really matters if the index is non-constant since other combines 17695 // on the constant elements already work. 17696 SDLoc DL(N); 17697 if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT && 17698 Index == VecOp.getOperand(2)) { 17699 SDValue Elt = VecOp.getOperand(1); 17700 return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt; 17701 } 17702 17703 // (vextract (scalar_to_vector val, 0) -> val 17704 if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { 17705 // Only 0'th element of SCALAR_TO_VECTOR is defined. 
17706 if (DAG.isKnownNeverZero(Index)) 17707 return DAG.getUNDEF(ScalarVT); 17708 17709 // Check if the result type doesn't match the inserted element type. A 17710 // SCALAR_TO_VECTOR may truncate the inserted element and the 17711 // EXTRACT_VECTOR_ELT may widen the extracted vector. 17712 SDValue InOp = VecOp.getOperand(0); 17713 if (InOp.getValueType() != ScalarVT) { 17714 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); 17715 return DAG.getSExtOrTrunc(InOp, DL, ScalarVT); 17716 } 17717 return InOp; 17718 } 17719 17720 // extract_vector_elt of out-of-bounds element -> UNDEF 17721 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 17722 if (IndexC && VecVT.isFixedLengthVector() && 17723 IndexC->getAPIntValue().uge(VecVT.getVectorNumElements())) 17724 return DAG.getUNDEF(ScalarVT); 17725 17726 // extract_vector_elt (build_vector x, y), 1 -> y 17727 if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) || 17728 VecOp.getOpcode() == ISD::SPLAT_VECTOR) && 17729 TLI.isTypeLegal(VecVT) && 17730 (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) { 17731 assert((VecOp.getOpcode() != ISD::BUILD_VECTOR || 17732 VecVT.isFixedLengthVector()) && 17733 "BUILD_VECTOR used for scalable vectors"); 17734 unsigned IndexVal = 17735 VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0; 17736 SDValue Elt = VecOp.getOperand(IndexVal); 17737 EVT InEltVT = Elt.getValueType(); 17738 17739 // Sometimes build_vector's scalar input types do not match result type. 17740 if (ScalarVT == InEltVT) 17741 return Elt; 17742 17743 // TODO: It may be useful to truncate if free if the build_vector implicitly 17744 // converts. 17745 } 17746 17747 if (VecVT.isScalableVector()) 17748 return SDValue(); 17749 17750 // All the code from this point onwards assumes fixed width vectors, but it's 17751 // possible that some of the combinations could be made to work for scalable 17752 // vectors too. 
17753 unsigned NumElts = VecVT.getVectorNumElements(); 17754 unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); 17755 17756 // TODO: These transforms should not require the 'hasOneUse' restriction, but 17757 // there are regressions on multiple targets without it. We can end up with a 17758 // mess of scalar and vector code if we reduce only part of the DAG to scalar. 17759 if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() && 17760 VecOp.hasOneUse()) { 17761 // The vector index of the LSBs of the source depend on the endian-ness. 17762 bool IsLE = DAG.getDataLayout().isLittleEndian(); 17763 unsigned ExtractIndex = IndexC->getZExtValue(); 17764 // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x) 17765 unsigned BCTruncElt = IsLE ? 0 : NumElts - 1; 17766 SDValue BCSrc = VecOp.getOperand(0); 17767 if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger()) 17768 return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc); 17769 17770 if (LegalTypes && BCSrc.getValueType().isInteger() && 17771 BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) { 17772 // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt --> 17773 // trunc i64 X to i32 17774 SDValue X = BCSrc.getOperand(0); 17775 assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() && 17776 "Extract element and scalar to vector can't change element type " 17777 "from FP to integer."); 17778 unsigned XBitWidth = X.getValueSizeInBits(); 17779 BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1; 17780 17781 // An extract element return value type can be wider than its vector 17782 // operand element type. In that case, the high bits are undefined, so 17783 // it's possible that we may need to extend rather than truncate. 
17784 if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) { 17785 assert(XBitWidth % VecEltBitWidth == 0 && 17786 "Scalar bitwidth must be a multiple of vector element bitwidth"); 17787 return DAG.getAnyExtOrTrunc(X, DL, ScalarVT); 17788 } 17789 } 17790 } 17791 17792 if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations)) 17793 return BO; 17794 17795 // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. 17796 // We only perform this optimization before the op legalization phase because 17797 // we may introduce new vector instructions which are not backed by TD 17798 // patterns. For example on AVX, extracting elements from a wide vector 17799 // without using extract_subvector. However, if we can find an underlying 17800 // scalar value, then we can always use that. 17801 if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) { 17802 auto *Shuf = cast<ShuffleVectorSDNode>(VecOp); 17803 // Find the new index to extract from. 17804 int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue()); 17805 17806 // Extracting an undef index is undef. 17807 if (OrigElt == -1) 17808 return DAG.getUNDEF(ScalarVT); 17809 17810 // Select the right vector half to extract from. 17811 SDValue SVInVec; 17812 if (OrigElt < (int)NumElts) { 17813 SVInVec = VecOp.getOperand(0); 17814 } else { 17815 SVInVec = VecOp.getOperand(1); 17816 OrigElt -= NumElts; 17817 } 17818 17819 if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { 17820 SDValue InOp = SVInVec.getOperand(OrigElt); 17821 if (InOp.getValueType() != ScalarVT) { 17822 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger()); 17823 InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT); 17824 } 17825 17826 return InOp; 17827 } 17828 17829 // FIXME: We should handle recursing on other vector shuffles and 17830 // scalar_to_vector here as well. 17831 17832 if (!LegalOperations || 17833 // FIXME: Should really be just isOperationLegalOrCustom. 
17834 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) || 17835 TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) { 17836 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec, 17837 DAG.getVectorIdxConstant(OrigElt, DL)); 17838 } 17839 } 17840 17841 // If only EXTRACT_VECTOR_ELT nodes use the source vector we can 17842 // simplify it based on the (valid) extraction indices. 17843 if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) { 17844 return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 17845 Use->getOperand(0) == VecOp && 17846 isa<ConstantSDNode>(Use->getOperand(1)); 17847 })) { 17848 APInt DemandedElts = APInt::getNullValue(NumElts); 17849 for (SDNode *Use : VecOp->uses()) { 17850 auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1)); 17851 if (CstElt->getAPIntValue().ult(NumElts)) 17852 DemandedElts.setBit(CstElt->getZExtValue()); 17853 } 17854 if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) { 17855 // We simplified the vector operand of this extract element. If this 17856 // extract is not dead, visit it again so it is folded properly. 17857 if (N->getOpcode() != ISD::DELETED_NODE) 17858 AddToWorklist(N); 17859 return SDValue(N, 0); 17860 } 17861 APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth); 17862 if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) { 17863 // We simplified the vector operand of this extract element. If this 17864 // extract is not dead, visit it again so it is folded properly. 17865 if (N->getOpcode() != ISD::DELETED_NODE) 17866 AddToWorklist(N); 17867 return SDValue(N, 0); 17868 } 17869 } 17870 17871 // Everything under here is trying to match an extract of a loaded value. 17872 // If the result of load has to be truncated, then it's not necessarily 17873 // profitable. 
17874 bool BCNumEltsChanged = false; 17875 EVT ExtVT = VecVT.getVectorElementType(); 17876 EVT LVT = ExtVT; 17877 if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT)) 17878 return SDValue(); 17879 17880 if (VecOp.getOpcode() == ISD::BITCAST) { 17881 // Don't duplicate a load with other uses. 17882 if (!VecOp.hasOneUse()) 17883 return SDValue(); 17884 17885 EVT BCVT = VecOp.getOperand(0).getValueType(); 17886 if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) 17887 return SDValue(); 17888 if (NumElts != BCVT.getVectorNumElements()) 17889 BCNumEltsChanged = true; 17890 VecOp = VecOp.getOperand(0); 17891 ExtVT = BCVT.getVectorElementType(); 17892 } 17893 17894 // extract (vector load $addr), i --> load $addr + i * size 17895 if (!LegalOperations && !IndexC && VecOp.hasOneUse() && 17896 ISD::isNormalLoad(VecOp.getNode()) && 17897 !Index->hasPredecessor(VecOp.getNode())) { 17898 auto *VecLoad = dyn_cast<LoadSDNode>(VecOp); 17899 if (VecLoad && VecLoad->isSimple()) 17900 return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad); 17901 } 17902 17903 // Perform only after legalization to ensure build_vector / vector_shuffle 17904 // optimizations have already been done. 17905 if (!LegalOperations || !IndexC) 17906 return SDValue(); 17907 17908 // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) 17909 // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) 17910 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) 17911 int Elt = IndexC->getZExtValue(); 17912 LoadSDNode *LN0 = nullptr; 17913 if (ISD::isNormalLoad(VecOp.getNode())) { 17914 LN0 = cast<LoadSDNode>(VecOp); 17915 } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 17916 VecOp.getOperand(0).getValueType() == ExtVT && 17917 ISD::isNormalLoad(VecOp.getOperand(0).getNode())) { 17918 // Don't duplicate a load with other uses. 
17919 if (!VecOp.hasOneUse()) 17920 return SDValue(); 17921 17922 LN0 = cast<LoadSDNode>(VecOp.getOperand(0)); 17923 } 17924 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) { 17925 // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) 17926 // => 17927 // (load $addr+1*size) 17928 17929 // Don't duplicate a load with other uses. 17930 if (!VecOp.hasOneUse()) 17931 return SDValue(); 17932 17933 // If the bit convert changed the number of elements, it is unsafe 17934 // to examine the mask. 17935 if (BCNumEltsChanged) 17936 return SDValue(); 17937 17938 // Select the input vector, guarding against out of range extract vector. 17939 int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt); 17940 VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1); 17941 17942 if (VecOp.getOpcode() == ISD::BITCAST) { 17943 // Don't duplicate a load with other uses. 17944 if (!VecOp.hasOneUse()) 17945 return SDValue(); 17946 17947 VecOp = VecOp.getOperand(0); 17948 } 17949 if (ISD::isNormalLoad(VecOp.getNode())) { 17950 LN0 = cast<LoadSDNode>(VecOp); 17951 Elt = (Idx < (int)NumElts) ? 
Idx : Idx - (int)NumElts; 17952 Index = DAG.getConstant(Elt, DL, Index.getValueType()); 17953 } 17954 } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged && 17955 VecVT.getVectorElementType() == ScalarVT && 17956 (!LegalTypes || 17957 TLI.isTypeLegal( 17958 VecOp.getOperand(0).getValueType().getVectorElementType()))) { 17959 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0 17960 // -> extract_vector_elt a, 0 17961 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1 17962 // -> extract_vector_elt a, 1 17963 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2 17964 // -> extract_vector_elt b, 0 17965 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3 17966 // -> extract_vector_elt b, 1 17967 SDLoc SL(N); 17968 EVT ConcatVT = VecOp.getOperand(0).getValueType(); 17969 unsigned ConcatNumElts = ConcatVT.getVectorNumElements(); 17970 SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL, 17971 Index.getValueType()); 17972 17973 SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts); 17974 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, 17975 ConcatVT.getVectorElementType(), 17976 ConcatOp, NewIdx); 17977 return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt); 17978 } 17979 17980 // Make sure we found a non-volatile load and the extractelement is 17981 // the only use. 17982 if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple()) 17983 return SDValue(); 17984 17985 // If Idx was -1 above, Elt is going to be -1, so just return undef. 17986 if (Elt == -1) 17987 return DAG.getUNDEF(LVT); 17988 17989 return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0); 17990 } 17991 17992 // Simplify (build_vec (ext )) to (bitcast (build_vec )) 17993 SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { 17994 // We perform this optimization post type-legalization because 17995 // the type-legalizer often scalarizes integer-promoted vectors. 
17996 // Performing this optimization before may create bit-casts which 17997 // will be type-legalized to complex code sequences. 17998 // We perform this optimization only before the operation legalizer because we 17999 // may introduce illegal operations. 18000 if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) 18001 return SDValue(); 18002 18003 unsigned NumInScalars = N->getNumOperands(); 18004 SDLoc DL(N); 18005 EVT VT = N->getValueType(0); 18006 18007 // Check to see if this is a BUILD_VECTOR of a bunch of values 18008 // which come from any_extend or zero_extend nodes. If so, we can create 18009 // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR 18010 // optimizations. We do not handle sign-extend because we can't fill the sign 18011 // using shuffles. 18012 EVT SourceType = MVT::Other; 18013 bool AllAnyExt = true; 18014 18015 for (unsigned i = 0; i != NumInScalars; ++i) { 18016 SDValue In = N->getOperand(i); 18017 // Ignore undef inputs. 18018 if (In.isUndef()) continue; 18019 18020 bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; 18021 bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; 18022 18023 // Abort if the element is not an extension. 18024 if (!ZeroExt && !AnyExt) { 18025 SourceType = MVT::Other; 18026 break; 18027 } 18028 18029 // The input is a ZeroExt or AnyExt. Check the original type. 18030 EVT InTy = In.getOperand(0).getValueType(); 18031 18032 // Check that all of the widened source types are the same. 18033 if (SourceType == MVT::Other) 18034 // First time. 18035 SourceType = InTy; 18036 else if (InTy != SourceType) { 18037 // Multiple income types. Abort. 18038 SourceType = MVT::Other; 18039 break; 18040 } 18041 18042 // Check if all of the extends are ANY_EXTENDs. 18043 AllAnyExt &= AnyExt; 18044 } 18045 18046 // In order to have valid types, all of the inputs must be extended from the 18047 // same source type and all of the inputs must be any or zero extend. 
18048 // Scalar sizes must be a power of two. 18049 EVT OutScalarTy = VT.getScalarType(); 18050 bool ValidTypes = SourceType != MVT::Other && 18051 isPowerOf2_32(OutScalarTy.getSizeInBits()) && 18052 isPowerOf2_32(SourceType.getSizeInBits()); 18053 18054 // Create a new simpler BUILD_VECTOR sequence which other optimizations can 18055 // turn into a single shuffle instruction. 18056 if (!ValidTypes) 18057 return SDValue(); 18058 18059 // If we already have a splat buildvector, then don't fold it if it means 18060 // introducing zeros. 18061 if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true)) 18062 return SDValue(); 18063 18064 bool isLE = DAG.getDataLayout().isLittleEndian(); 18065 unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); 18066 assert(ElemRatio > 1 && "Invalid element size ratio"); 18067 SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType): 18068 DAG.getConstant(0, DL, SourceType); 18069 18070 unsigned NewBVElems = ElemRatio * VT.getVectorNumElements(); 18071 SmallVector<SDValue, 8> Ops(NewBVElems, Filler); 18072 18073 // Populate the new build_vector 18074 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 18075 SDValue Cast = N->getOperand(i); 18076 assert((Cast.getOpcode() == ISD::ANY_EXTEND || 18077 Cast.getOpcode() == ISD::ZERO_EXTEND || 18078 Cast.isUndef()) && "Invalid cast opcode"); 18079 SDValue In; 18080 if (Cast.isUndef()) 18081 In = DAG.getUNDEF(SourceType); 18082 else 18083 In = Cast->getOperand(0); 18084 unsigned Index = isLE ? (i * ElemRatio) : 18085 (i * ElemRatio + (ElemRatio - 1)); 18086 18087 assert(Index < Ops.size() && "Invalid index"); 18088 Ops[Index] = In; 18089 } 18090 18091 // The type of the new BUILD_VECTOR node. 18092 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); 18093 assert(VecVT.getSizeInBits() == VT.getSizeInBits() && 18094 "Invalid vector size"); 18095 // Check if the new vector type is legal. 
18096 if (!isTypeLegal(VecVT) || 18097 (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) && 18098 TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))) 18099 return SDValue(); 18100 18101 // Make the new BUILD_VECTOR. 18102 SDValue BV = DAG.getBuildVector(VecVT, DL, Ops); 18103 18104 // The new BUILD_VECTOR node has the potential to be further optimized. 18105 AddToWorklist(BV.getNode()); 18106 // Bitcast to the desired type. 18107 return DAG.getBitcast(VT, BV); 18108 } 18109 18110 // Simplify (build_vec (trunc $1) 18111 // (trunc (srl $1 half-width)) 18112 // (trunc (srl $1 (2 * half-width))) …) 18113 // to (bitcast $1) 18114 SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { 18115 assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); 18116 18117 // Only for little endian 18118 if (!DAG.getDataLayout().isLittleEndian()) 18119 return SDValue(); 18120 18121 SDLoc DL(N); 18122 EVT VT = N->getValueType(0); 18123 EVT OutScalarTy = VT.getScalarType(); 18124 uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits(); 18125 18126 // Only for power of two types to be sure that bitcast works well 18127 if (!isPowerOf2_64(ScalarTypeBitsize)) 18128 return SDValue(); 18129 18130 unsigned NumInScalars = N->getNumOperands(); 18131 18132 // Look through bitcasts 18133 auto PeekThroughBitcast = [](SDValue Op) { 18134 if (Op.getOpcode() == ISD::BITCAST) 18135 return Op.getOperand(0); 18136 return Op; 18137 }; 18138 18139 // The source value where all the parts are extracted. 18140 SDValue Src; 18141 for (unsigned i = 0; i != NumInScalars; ++i) { 18142 SDValue In = PeekThroughBitcast(N->getOperand(i)); 18143 // Ignore undef inputs. 18144 if (In.isUndef()) continue; 18145 18146 if (In.getOpcode() != ISD::TRUNCATE) 18147 return SDValue(); 18148 18149 In = PeekThroughBitcast(In.getOperand(0)); 18150 18151 if (In.getOpcode() != ISD::SRL) { 18152 // For now only build_vec without shuffling, handle shifts here in the 18153 // future. 
18154 if (i != 0) 18155 return SDValue(); 18156 18157 Src = In; 18158 } else { 18159 // In is SRL 18160 SDValue part = PeekThroughBitcast(In.getOperand(0)); 18161 18162 if (!Src) { 18163 Src = part; 18164 } else if (Src != part) { 18165 // Vector parts do not stem from the same variable 18166 return SDValue(); 18167 } 18168 18169 SDValue ShiftAmtVal = In.getOperand(1); 18170 if (!isa<ConstantSDNode>(ShiftAmtVal)) 18171 return SDValue(); 18172 18173 uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1); 18174 18175 // The extracted value is not extracted at the right position 18176 if (ShiftAmt != i * ScalarTypeBitsize) 18177 return SDValue(); 18178 } 18179 } 18180 18181 // Only cast if the size is the same 18182 if (Src.getValueType().getSizeInBits() != VT.getSizeInBits()) 18183 return SDValue(); 18184 18185 return DAG.getBitcast(VT, Src); 18186 } 18187 18188 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, 18189 ArrayRef<int> VectorMask, 18190 SDValue VecIn1, SDValue VecIn2, 18191 unsigned LeftIdx, bool DidSplitVec) { 18192 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); 18193 18194 EVT VT = N->getValueType(0); 18195 EVT InVT1 = VecIn1.getValueType(); 18196 EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1; 18197 18198 unsigned NumElems = VT.getVectorNumElements(); 18199 unsigned ShuffleNumElems = NumElems; 18200 18201 // If we artificially split a vector in two already, then the offsets in the 18202 // operands will all be based off of VecIn1, even those in VecIn2. 18203 unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements(); 18204 18205 // We can't generate a shuffle node with mismatched input and output types. 18206 // Try to make the types match the type of the output. 
18207 if (InVT1 != VT || InVT2 != VT) { 18208 if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) { 18209 // If the output vector length is a multiple of both input lengths, 18210 // we can concatenate them and pad the rest with undefs. 18211 unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits(); 18212 assert(NumConcats >= 2 && "Concat needs at least two inputs!"); 18213 SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); 18214 ConcatOps[0] = VecIn1; 18215 ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); 18216 VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 18217 VecIn2 = SDValue(); 18218 } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { 18219 if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) 18220 return SDValue(); 18221 18222 if (!VecIn2.getNode()) { 18223 // If we only have one input vector, and it's twice the size of the 18224 // output, split it in two. 18225 VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, 18226 DAG.getVectorIdxConstant(NumElems, DL)); 18227 VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx); 18228 // Since we now have shorter input vectors, adjust the offset of the 18229 // second vector's start. 18230 Vec2Offset = NumElems; 18231 } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) { 18232 // VecIn1 is wider than the output, and we have another, possibly 18233 // smaller input. Pad the smaller input with undefs, shuffle at the 18234 // input vector width, and extract the output. 18235 // The shuffle type is different than VT, so check legality again. 18236 if (LegalOperations && 18237 !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1)) 18238 return SDValue(); 18239 18240 // Legalizing INSERT_SUBVECTOR is tricky - you basically have to 18241 // lower it back into a BUILD_VECTOR. So if the inserted type is 18242 // illegal, don't even try. 
18243 if (InVT1 != InVT2) { 18244 if (!TLI.isTypeLegal(InVT2)) 18245 return SDValue(); 18246 VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1, 18247 DAG.getUNDEF(InVT1), VecIn2, ZeroIdx); 18248 } 18249 ShuffleNumElems = NumElems * 2; 18250 } else { 18251 // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider 18252 // than VecIn1. We can't handle this for now - this case will disappear 18253 // when we start sorting the vectors by type. 18254 return SDValue(); 18255 } 18256 } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && 18257 InVT1.getSizeInBits() == VT.getSizeInBits()) { 18258 SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); 18259 ConcatOps[0] = VecIn2; 18260 VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); 18261 } else { 18262 // TODO: Support cases where the length mismatch isn't exactly by a 18263 // factor of 2. 18264 // TODO: Move this check upwards, so that if we have bad type 18265 // mismatches, we don't create any DAG nodes. 18266 return SDValue(); 18267 } 18268 } 18269 18270 // Initialize mask to undef. 18271 SmallVector<int, 8> Mask(ShuffleNumElems, -1); 18272 18273 // Only need to run up to the number of elements actually used, not the 18274 // total number of elements in the shuffle - if we are shuffling a wider 18275 // vector, the high lanes should be set to undef. 18276 for (unsigned i = 0; i != NumElems; ++i) { 18277 if (VectorMask[i] <= 0) 18278 continue; 18279 18280 unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1); 18281 if (VectorMask[i] == (int)LeftIdx) { 18282 Mask[i] = ExtIndex; 18283 } else if (VectorMask[i] == (int)LeftIdx + 1) { 18284 Mask[i] = Vec2Offset + ExtIndex; 18285 } 18286 } 18287 18288 // The type the input vectors may have changed above. 18289 InVT1 = VecIn1.getValueType(); 18290 18291 // If we already have a VecIn2, it should have the same type as VecIn1. 18292 // If we don't, get an undef/zero vector of the appropriate type. 
18293 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1); 18294 assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type."); 18295 18296 SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask); 18297 if (ShuffleNumElems > NumElems) 18298 Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx); 18299 18300 return Shuffle; 18301 } 18302 18303 static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) { 18304 assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); 18305 18306 // First, determine where the build vector is not undef. 18307 // TODO: We could extend this to handle zero elements as well as undefs. 18308 int NumBVOps = BV->getNumOperands(); 18309 int ZextElt = -1; 18310 for (int i = 0; i != NumBVOps; ++i) { 18311 SDValue Op = BV->getOperand(i); 18312 if (Op.isUndef()) 18313 continue; 18314 if (ZextElt == -1) 18315 ZextElt = i; 18316 else 18317 return SDValue(); 18318 } 18319 // Bail out if there's no non-undef element. 18320 if (ZextElt == -1) 18321 return SDValue(); 18322 18323 // The build vector contains some number of undef elements and exactly 18324 // one other element. That other element must be a zero-extended scalar 18325 // extracted from a vector at a constant index to turn this into a shuffle. 18326 // Also, require that the build vector does not implicitly truncate/extend 18327 // its elements. 18328 // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND. 
18329 EVT VT = BV->getValueType(0); 18330 SDValue Zext = BV->getOperand(ZextElt); 18331 if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() || 18332 Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 18333 !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) || 18334 Zext.getValueSizeInBits() != VT.getScalarSizeInBits()) 18335 return SDValue(); 18336 18337 // The zero-extend must be a multiple of the source size, and we must be 18338 // building a vector of the same size as the source of the extract element. 18339 SDValue Extract = Zext.getOperand(0); 18340 unsigned DestSize = Zext.getValueSizeInBits(); 18341 unsigned SrcSize = Extract.getValueSizeInBits(); 18342 if (DestSize % SrcSize != 0 || 18343 Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits()) 18344 return SDValue(); 18345 18346 // Create a shuffle mask that will combine the extracted element with zeros 18347 // and undefs. 18348 int ZextRatio = DestSize / SrcSize; 18349 int NumMaskElts = NumBVOps * ZextRatio; 18350 SmallVector<int, 32> ShufMask(NumMaskElts, -1); 18351 for (int i = 0; i != NumMaskElts; ++i) { 18352 if (i / ZextRatio == ZextElt) { 18353 // The low bits of the (potentially translated) extracted element map to 18354 // the source vector. The high bits map to zero. We will use a zero vector 18355 // as the 2nd source operand of the shuffle, so use the 1st element of 18356 // that vector (mask value is number-of-elements) for the high bits. 18357 if (i % ZextRatio == 0) 18358 ShufMask[i] = Extract.getConstantOperandVal(1); 18359 else 18360 ShufMask[i] = NumMaskElts; 18361 } 18362 18363 // Undef elements of the build vector remain undef because we initialize 18364 // the shuffle mask with -1. 18365 } 18366 18367 // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... 
--> 18368 // bitcast (shuffle V, ZeroVec, VectorMask) 18369 SDLoc DL(BV); 18370 EVT VecVT = Extract.getOperand(0).getValueType(); 18371 SDValue ZeroVec = DAG.getConstant(0, DL, VecVT); 18372 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18373 SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0), 18374 ZeroVec, ShufMask, DAG); 18375 if (!Shuf) 18376 return SDValue(); 18377 return DAG.getBitcast(VT, Shuf); 18378 } 18379 18380 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT 18381 // operations. If the types of the vectors we're extracting from allow it, 18382 // turn this into a vector_shuffle node. 18383 SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { 18384 SDLoc DL(N); 18385 EVT VT = N->getValueType(0); 18386 18387 // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. 18388 if (!isTypeLegal(VT)) 18389 return SDValue(); 18390 18391 if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG)) 18392 return V; 18393 18394 // May only combine to shuffle after legalize if shuffle is legal. 18395 if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) 18396 return SDValue(); 18397 18398 bool UsesZeroVector = false; 18399 unsigned NumElems = N->getNumOperands(); 18400 18401 // Record, for each element of the newly built vector, which input vector 18402 // that element comes from. -1 stands for undef, 0 for the zero vector, 18403 // and positive values for the input vectors. 18404 // VectorMask maps each element to its vector number, and VecIn maps vector 18405 // numbers to their initial SDValues. 18406 18407 SmallVector<int, 8> VectorMask(NumElems, -1); 18408 SmallVector<SDValue, 8> VecIn; 18409 VecIn.push_back(SDValue()); 18410 18411 for (unsigned i = 0; i != NumElems; ++i) { 18412 SDValue Op = N->getOperand(i); 18413 18414 if (Op.isUndef()) 18415 continue; 18416 18417 // See if we can use a blend with a zero vector. 
18418 // TODO: Should we generalize this to a blend with an arbitrary constant 18419 // vector? 18420 if (isNullConstant(Op) || isNullFPConstant(Op)) { 18421 UsesZeroVector = true; 18422 VectorMask[i] = 0; 18423 continue; 18424 } 18425 18426 // Not an undef or zero. If the input is something other than an 18427 // EXTRACT_VECTOR_ELT with an in-range constant index, bail out. 18428 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 18429 !isa<ConstantSDNode>(Op.getOperand(1))) 18430 return SDValue(); 18431 SDValue ExtractedFromVec = Op.getOperand(0); 18432 18433 if (ExtractedFromVec.getValueType().isScalableVector()) 18434 return SDValue(); 18435 18436 const APInt &ExtractIdx = Op.getConstantOperandAPInt(1); 18437 if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements())) 18438 return SDValue(); 18439 18440 // All inputs must have the same element type as the output. 18441 if (VT.getVectorElementType() != 18442 ExtractedFromVec.getValueType().getVectorElementType()) 18443 return SDValue(); 18444 18445 // Have we seen this input vector before? 18446 // The vectors are expected to be tiny (usually 1 or 2 elements), so using 18447 // a map back from SDValues to numbers isn't worth it. 18448 unsigned Idx = std::distance( 18449 VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec)); 18450 if (Idx == VecIn.size()) 18451 VecIn.push_back(ExtractedFromVec); 18452 18453 VectorMask[i] = Idx; 18454 } 18455 18456 // If we didn't find at least one input vector, bail out. 18457 if (VecIn.size() < 2) 18458 return SDValue(); 18459 18460 // If all the Operands of BUILD_VECTOR extract from same 18461 // vector, then split the vector efficiently based on the maximum 18462 // vector access index and adjust the VectorMask and 18463 // VecIn accordingly. 
18464 bool DidSplitVec = false; 18465 if (VecIn.size() == 2) { 18466 unsigned MaxIndex = 0; 18467 unsigned NearestPow2 = 0; 18468 SDValue Vec = VecIn.back(); 18469 EVT InVT = Vec.getValueType(); 18470 SmallVector<unsigned, 8> IndexVec(NumElems, 0); 18471 18472 for (unsigned i = 0; i < NumElems; i++) { 18473 if (VectorMask[i] <= 0) 18474 continue; 18475 unsigned Index = N->getOperand(i).getConstantOperandVal(1); 18476 IndexVec[i] = Index; 18477 MaxIndex = std::max(MaxIndex, Index); 18478 } 18479 18480 NearestPow2 = PowerOf2Ceil(MaxIndex); 18481 if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 && 18482 NumElems * 2 < NearestPow2) { 18483 unsigned SplitSize = NearestPow2 / 2; 18484 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), 18485 InVT.getVectorElementType(), SplitSize); 18486 if (TLI.isTypeLegal(SplitVT)) { 18487 SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, 18488 DAG.getVectorIdxConstant(SplitSize, DL)); 18489 SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, 18490 DAG.getVectorIdxConstant(0, DL)); 18491 VecIn.pop_back(); 18492 VecIn.push_back(VecIn1); 18493 VecIn.push_back(VecIn2); 18494 DidSplitVec = true; 18495 18496 for (unsigned i = 0; i < NumElems; i++) { 18497 if (VectorMask[i] <= 0) 18498 continue; 18499 VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2; 18500 } 18501 } 18502 } 18503 } 18504 18505 // TODO: We want to sort the vectors by descending length, so that adjacent 18506 // pairs have similar length, and the longer vector is always first in the 18507 // pair. 18508 18509 // TODO: Should this fire if some of the input vectors has illegal type (like 18510 // it does now), or should we let legalization run its course first? 18511 18512 // Shuffle phase: 18513 // Take pairs of vectors, and shuffle them so that the result has elements 18514 // from these vectors in the correct places. 
18515 // For example, given: 18516 // t10: i32 = extract_vector_elt t1, Constant:i64<0> 18517 // t11: i32 = extract_vector_elt t2, Constant:i64<0> 18518 // t12: i32 = extract_vector_elt t3, Constant:i64<0> 18519 // t13: i32 = extract_vector_elt t1, Constant:i64<1> 18520 // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13 18521 // We will generate: 18522 // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2 18523 // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef 18524 SmallVector<SDValue, 4> Shuffles; 18525 for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) { 18526 unsigned LeftIdx = 2 * In + 1; 18527 SDValue VecLeft = VecIn[LeftIdx]; 18528 SDValue VecRight = 18529 (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue(); 18530 18531 if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft, 18532 VecRight, LeftIdx, DidSplitVec)) 18533 Shuffles.push_back(Shuffle); 18534 else 18535 return SDValue(); 18536 } 18537 18538 // If we need the zero vector as an "ingredient" in the blend tree, add it 18539 // to the list of shuffles. 18540 if (UsesZeroVector) 18541 Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT) 18542 : DAG.getConstantFP(0.0, DL, VT)); 18543 18544 // If we only have one shuffle, we're done. 18545 if (Shuffles.size() == 1) 18546 return Shuffles[0]; 18547 18548 // Update the vector mask to point to the post-shuffle vectors. 18549 for (int &Vec : VectorMask) 18550 if (Vec == 0) 18551 Vec = Shuffles.size() - 1; 18552 else 18553 Vec = (Vec - 1) / 2; 18554 18555 // More than one shuffle. Generate a binary tree of blends, e.g. 
if from 18556 // the previous step we got the set of shuffles t10, t11, t12, t13, we will 18557 // generate: 18558 // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2 18559 // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4 18560 // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6 18561 // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8 18562 // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11 18563 // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13 18564 // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21 18565 18566 // Make sure the initial size of the shuffle list is even. 18567 if (Shuffles.size() % 2) 18568 Shuffles.push_back(DAG.getUNDEF(VT)); 18569 18570 for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) { 18571 if (CurSize % 2) { 18572 Shuffles[CurSize] = DAG.getUNDEF(VT); 18573 CurSize++; 18574 } 18575 for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) { 18576 int Left = 2 * In; 18577 int Right = 2 * In + 1; 18578 SmallVector<int, 8> Mask(NumElems, -1); 18579 for (unsigned i = 0; i != NumElems; ++i) { 18580 if (VectorMask[i] == Left) { 18581 Mask[i] = i; 18582 VectorMask[i] = In; 18583 } else if (VectorMask[i] == Right) { 18584 Mask[i] = i + NumElems; 18585 VectorMask[i] = In; 18586 } 18587 } 18588 18589 Shuffles[In] = 18590 DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask); 18591 } 18592 } 18593 return Shuffles[0]; 18594 } 18595 18596 // Try to turn a build vector of zero extends of extract vector elts into a 18597 // a vector zero extend and possibly an extract subvector. 18598 // TODO: Support sign extend? 18599 // TODO: Allow undef elements? 
18600 SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) { 18601 if (LegalOperations) 18602 return SDValue(); 18603 18604 EVT VT = N->getValueType(0); 18605 18606 bool FoundZeroExtend = false; 18607 SDValue Op0 = N->getOperand(0); 18608 auto checkElem = [&](SDValue Op) -> int64_t { 18609 unsigned Opc = Op.getOpcode(); 18610 FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND); 18611 if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) && 18612 Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 18613 Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0)) 18614 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1))) 18615 return C->getZExtValue(); 18616 return -1; 18617 }; 18618 18619 // Make sure the first element matches 18620 // (zext (extract_vector_elt X, C)) 18621 int64_t Offset = checkElem(Op0); 18622 if (Offset < 0) 18623 return SDValue(); 18624 18625 unsigned NumElems = N->getNumOperands(); 18626 SDValue In = Op0.getOperand(0).getOperand(0); 18627 EVT InSVT = In.getValueType().getScalarType(); 18628 EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems); 18629 18630 // Don't create an illegal input type after type legalization. 18631 if (LegalTypes && !TLI.isTypeLegal(InVT)) 18632 return SDValue(); 18633 18634 // Ensure all the elements come from the same vector and are adjacent. 18635 for (unsigned i = 1; i != NumElems; ++i) { 18636 if ((Offset + i) != checkElem(N->getOperand(i))) 18637 return SDValue(); 18638 } 18639 18640 SDLoc DL(N); 18641 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In, 18642 Op0.getOperand(0).getOperand(1)); 18643 return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL, 18644 VT, In); 18645 } 18646 18647 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { 18648 EVT VT = N->getValueType(0); 18649 18650 // A vector built entirely of undefs is undef. 
18651 if (ISD::allOperandsUndef(N)) 18652 return DAG.getUNDEF(VT); 18653 18654 // If this is a splat of a bitcast from another vector, change to a 18655 // concat_vector. 18656 // For example: 18657 // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) -> 18658 // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X)))) 18659 // 18660 // If X is a build_vector itself, the concat can become a larger build_vector. 18661 // TODO: Maybe this is useful for non-splat too? 18662 if (!LegalOperations) { 18663 if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) { 18664 Splat = peekThroughBitcasts(Splat); 18665 EVT SrcVT = Splat.getValueType(); 18666 if (SrcVT.isVector()) { 18667 unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements(); 18668 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), 18669 SrcVT.getVectorElementType(), NumElts); 18670 if (!LegalTypes || TLI.isTypeLegal(NewVT)) { 18671 SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat); 18672 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), 18673 NewVT, Ops); 18674 return DAG.getBitcast(VT, Concat); 18675 } 18676 } 18677 } 18678 } 18679 18680 // A splat of a single element is a SPLAT_VECTOR if supported on the target. 18681 if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand) 18682 if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) { 18683 assert(!V.isUndef() && "Splat of undef should have been handled earlier"); 18684 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V); 18685 } 18686 18687 // Check if we can express BUILD VECTOR via subvector extract. 
18688 if (!LegalTypes && (N->getNumOperands() > 1)) { 18689 SDValue Op0 = N->getOperand(0); 18690 auto checkElem = [&](SDValue Op) -> uint64_t { 18691 if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && 18692 (Op0.getOperand(0) == Op.getOperand(0))) 18693 if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 18694 return CNode->getZExtValue(); 18695 return -1; 18696 }; 18697 18698 int Offset = checkElem(Op0); 18699 for (unsigned i = 0; i < N->getNumOperands(); ++i) { 18700 if (Offset + i != checkElem(N->getOperand(i))) { 18701 Offset = -1; 18702 break; 18703 } 18704 } 18705 18706 if ((Offset == 0) && 18707 (Op0.getOperand(0).getValueType() == N->getValueType(0))) 18708 return Op0.getOperand(0); 18709 if ((Offset != -1) && 18710 ((Offset % N->getValueType(0).getVectorNumElements()) == 18711 0)) // IDX must be multiple of output size. 18712 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), 18713 Op0.getOperand(0), Op0.getOperand(1)); 18714 } 18715 18716 if (SDValue V = convertBuildVecZextToZext(N)) 18717 return V; 18718 18719 if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) 18720 return V; 18721 18722 if (SDValue V = reduceBuildVecTruncToBitCast(N)) 18723 return V; 18724 18725 if (SDValue V = reduceBuildVecToShuffle(N)) 18726 return V; 18727 18728 return SDValue(); 18729 } 18730 18731 static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { 18732 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18733 EVT OpVT = N->getOperand(0).getValueType(); 18734 18735 // If the operands are legal vectors, leave them alone. 18736 if (TLI.isTypeLegal(OpVT)) 18737 return SDValue(); 18738 18739 SDLoc DL(N); 18740 EVT VT = N->getValueType(0); 18741 SmallVector<SDValue, 8> Ops; 18742 18743 EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); 18744 SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); 18745 18746 // Keep track of what we encounter. 
18747 bool AnyInteger = false; 18748 bool AnyFP = false; 18749 for (const SDValue &Op : N->ops()) { 18750 if (ISD::BITCAST == Op.getOpcode() && 18751 !Op.getOperand(0).getValueType().isVector()) 18752 Ops.push_back(Op.getOperand(0)); 18753 else if (ISD::UNDEF == Op.getOpcode()) 18754 Ops.push_back(ScalarUndef); 18755 else 18756 return SDValue(); 18757 18758 // Note whether we encounter an integer or floating point scalar. 18759 // If it's neither, bail out, it could be something weird like x86mmx. 18760 EVT LastOpVT = Ops.back().getValueType(); 18761 if (LastOpVT.isFloatingPoint()) 18762 AnyFP = true; 18763 else if (LastOpVT.isInteger()) 18764 AnyInteger = true; 18765 else 18766 return SDValue(); 18767 } 18768 18769 // If any of the operands is a floating point scalar bitcast to a vector, 18770 // use floating point types throughout, and bitcast everything. 18771 // Replace UNDEFs by another scalar UNDEF node, of the final desired type. 18772 if (AnyFP) { 18773 SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits()); 18774 ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); 18775 if (AnyInteger) { 18776 for (SDValue &Op : Ops) { 18777 if (Op.getValueType() == SVT) 18778 continue; 18779 if (Op.isUndef()) 18780 Op = ScalarUndef; 18781 else 18782 Op = DAG.getBitcast(SVT, Op); 18783 } 18784 } 18785 } 18786 18787 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, 18788 VT.getSizeInBits() / SVT.getSizeInBits()); 18789 return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); 18790 } 18791 18792 // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR 18793 // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at 18794 // most two distinct vectors the same size as the result, attempt to turn this 18795 // into a legal shuffle. 
18796 static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { 18797 EVT VT = N->getValueType(0); 18798 EVT OpVT = N->getOperand(0).getValueType(); 18799 18800 // We currently can't generate an appropriate shuffle for a scalable vector. 18801 if (VT.isScalableVector()) 18802 return SDValue(); 18803 18804 int NumElts = VT.getVectorNumElements(); 18805 int NumOpElts = OpVT.getVectorNumElements(); 18806 18807 SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); 18808 SmallVector<int, 8> Mask; 18809 18810 for (SDValue Op : N->ops()) { 18811 Op = peekThroughBitcasts(Op); 18812 18813 // UNDEF nodes convert to UNDEF shuffle mask values. 18814 if (Op.isUndef()) { 18815 Mask.append((unsigned)NumOpElts, -1); 18816 continue; 18817 } 18818 18819 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) 18820 return SDValue(); 18821 18822 // What vector are we extracting the subvector from and at what index? 18823 SDValue ExtVec = Op.getOperand(0); 18824 int ExtIdx = Op.getConstantOperandVal(1); 18825 18826 // We want the EVT of the original extraction to correctly scale the 18827 // extraction index. 18828 EVT ExtVT = ExtVec.getValueType(); 18829 ExtVec = peekThroughBitcasts(ExtVec); 18830 18831 // UNDEF nodes convert to UNDEF shuffle mask values. 18832 if (ExtVec.isUndef()) { 18833 Mask.append((unsigned)NumOpElts, -1); 18834 continue; 18835 } 18836 18837 // Ensure that we are extracting a subvector from a vector the same 18838 // size as the result. 18839 if (ExtVT.getSizeInBits() != VT.getSizeInBits()) 18840 return SDValue(); 18841 18842 // Scale the subvector index to account for any bitcast. 18843 int NumExtElts = ExtVT.getVectorNumElements(); 18844 if (0 == (NumExtElts % NumElts)) 18845 ExtIdx /= (NumExtElts / NumElts); 18846 else if (0 == (NumElts % NumExtElts)) 18847 ExtIdx *= (NumElts / NumExtElts); 18848 else 18849 return SDValue(); 18850 18851 // At most we can reference 2 inputs in the final shuffle. 
18852 if (SV0.isUndef() || SV0 == ExtVec) { 18853 SV0 = ExtVec; 18854 for (int i = 0; i != NumOpElts; ++i) 18855 Mask.push_back(i + ExtIdx); 18856 } else if (SV1.isUndef() || SV1 == ExtVec) { 18857 SV1 = ExtVec; 18858 for (int i = 0; i != NumOpElts; ++i) 18859 Mask.push_back(i + ExtIdx + NumElts); 18860 } else { 18861 return SDValue(); 18862 } 18863 } 18864 18865 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18866 return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), 18867 DAG.getBitcast(VT, SV1), Mask, DAG); 18868 } 18869 18870 static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { 18871 unsigned CastOpcode = N->getOperand(0).getOpcode(); 18872 switch (CastOpcode) { 18873 case ISD::SINT_TO_FP: 18874 case ISD::UINT_TO_FP: 18875 case ISD::FP_TO_SINT: 18876 case ISD::FP_TO_UINT: 18877 // TODO: Allow more opcodes? 18878 // case ISD::BITCAST: 18879 // case ISD::TRUNCATE: 18880 // case ISD::ZERO_EXTEND: 18881 // case ISD::SIGN_EXTEND: 18882 // case ISD::FP_EXTEND: 18883 break; 18884 default: 18885 return SDValue(); 18886 } 18887 18888 EVT SrcVT = N->getOperand(0).getOperand(0).getValueType(); 18889 if (!SrcVT.isVector()) 18890 return SDValue(); 18891 18892 // All operands of the concat must be the same kind of cast from the same 18893 // source type. 18894 SmallVector<SDValue, 4> SrcOps; 18895 for (SDValue Op : N->ops()) { 18896 if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() || 18897 Op.getOperand(0).getValueType() != SrcVT) 18898 return SDValue(); 18899 SrcOps.push_back(Op.getOperand(0)); 18900 } 18901 18902 // The wider cast must be supported by the target. This is unusual because 18903 // the operation support type parameter depends on the opcode. In addition, 18904 // check the other type in the cast to make sure this is really legal. 
18905 EVT VT = N->getValueType(0); 18906 EVT SrcEltVT = SrcVT.getVectorElementType(); 18907 unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands(); 18908 EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); 18909 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18910 switch (CastOpcode) { 18911 case ISD::SINT_TO_FP: 18912 case ISD::UINT_TO_FP: 18913 if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) || 18914 !TLI.isTypeLegal(VT)) 18915 return SDValue(); 18916 break; 18917 case ISD::FP_TO_SINT: 18918 case ISD::FP_TO_UINT: 18919 if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) || 18920 !TLI.isTypeLegal(ConcatSrcVT)) 18921 return SDValue(); 18922 break; 18923 default: 18924 llvm_unreachable("Unexpected cast opcode"); 18925 } 18926 18927 // concat (cast X), (cast Y)... -> cast (concat X, Y...) 18928 SDLoc DL(N); 18929 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps); 18930 return DAG.getNode(CastOpcode, DL, VT, NewConcat); 18931 } 18932 18933 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { 18934 // If we only have one input vector, we don't need to do any concatenation. 18935 if (N->getNumOperands() == 1) 18936 return N->getOperand(0); 18937 18938 // Check if all of the operands are undefs. 18939 EVT VT = N->getValueType(0); 18940 if (ISD::allOperandsUndef(N)) 18941 return DAG.getUNDEF(VT); 18942 18943 // Optimize concat_vectors where all but the first of the vectors are undef. 18944 if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) { 18945 return Op.isUndef(); 18946 })) { 18947 SDValue In = N->getOperand(0); 18948 assert(In.getValueType().isVector() && "Must concat vectors"); 18949 18950 // If the input is a concat_vectors, just make a larger concat by padding 18951 // with smaller undefs. 
18952 if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) { 18953 unsigned NumOps = N->getNumOperands() * In.getNumOperands(); 18954 SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end()); 18955 Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType())); 18956 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 18957 } 18958 18959 SDValue Scalar = peekThroughOneUseBitcasts(In); 18960 18961 // concat_vectors(scalar_to_vector(scalar), undef) -> 18962 // scalar_to_vector(scalar) 18963 if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR && 18964 Scalar.hasOneUse()) { 18965 EVT SVT = Scalar.getValueType().getVectorElementType(); 18966 if (SVT == Scalar.getOperand(0).getValueType()) 18967 Scalar = Scalar.getOperand(0); 18968 } 18969 18970 // concat_vectors(scalar, undef) -> scalar_to_vector(scalar) 18971 if (!Scalar.getValueType().isVector()) { 18972 // If the bitcast type isn't legal, it might be a trunc of a legal type; 18973 // look through the trunc so we can still do the transform: 18974 // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar) 18975 if (Scalar->getOpcode() == ISD::TRUNCATE && 18976 !TLI.isTypeLegal(Scalar.getValueType()) && 18977 TLI.isTypeLegal(Scalar->getOperand(0).getValueType())) 18978 Scalar = Scalar->getOperand(0); 18979 18980 EVT SclTy = Scalar.getValueType(); 18981 18982 if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) 18983 return SDValue(); 18984 18985 // Bail out if the vector size is not a multiple of the scalar size. 
18986 if (VT.getSizeInBits() % SclTy.getSizeInBits()) 18987 return SDValue(); 18988 18989 unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); 18990 if (VNTNumElms < 2) 18991 return SDValue(); 18992 18993 EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms); 18994 if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) 18995 return SDValue(); 18996 18997 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar); 18998 return DAG.getBitcast(VT, Res); 18999 } 19000 } 19001 19002 // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR. 19003 // We have already tested above for an UNDEF only concatenation. 19004 // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) 19005 // -> (BUILD_VECTOR A, B, ..., C, D, ...) 19006 auto IsBuildVectorOrUndef = [](const SDValue &Op) { 19007 return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); 19008 }; 19009 if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { 19010 SmallVector<SDValue, 8> Opnds; 19011 EVT SVT = VT.getScalarType(); 19012 19013 EVT MinVT = SVT; 19014 if (!SVT.isFloatingPoint()) { 19015 // If BUILD_VECTOR are from built from integer, they may have different 19016 // operand types. Get the smallest type and truncate all operands to it. 19017 bool FoundMinVT = false; 19018 for (const SDValue &Op : N->ops()) 19019 if (ISD::BUILD_VECTOR == Op.getOpcode()) { 19020 EVT OpSVT = Op.getOperand(0).getValueType(); 19021 MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? 
OpSVT : MinVT; 19022 FoundMinVT = true; 19023 } 19024 assert(FoundMinVT && "Concat vector type mismatch"); 19025 } 19026 19027 for (const SDValue &Op : N->ops()) { 19028 EVT OpVT = Op.getValueType(); 19029 unsigned NumElts = OpVT.getVectorNumElements(); 19030 19031 if (ISD::UNDEF == Op.getOpcode()) 19032 Opnds.append(NumElts, DAG.getUNDEF(MinVT)); 19033 19034 if (ISD::BUILD_VECTOR == Op.getOpcode()) { 19035 if (SVT.isFloatingPoint()) { 19036 assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch"); 19037 Opnds.append(Op->op_begin(), Op->op_begin() + NumElts); 19038 } else { 19039 for (unsigned i = 0; i != NumElts; ++i) 19040 Opnds.push_back( 19041 DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i))); 19042 } 19043 } 19044 } 19045 19046 assert(VT.getVectorNumElements() == Opnds.size() && 19047 "Concat vector type mismatch"); 19048 return DAG.getBuildVector(VT, SDLoc(N), Opnds); 19049 } 19050 19051 // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. 19052 if (SDValue V = combineConcatVectorOfScalars(N, DAG)) 19053 return V; 19054 19055 // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. 19056 if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) 19057 if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) 19058 return V; 19059 19060 if (SDValue V = combineConcatVectorOfCasts(N, DAG)) 19061 return V; 19062 19063 // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR 19064 // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR 19065 // operands and look for a CONCAT operations that place the incoming vectors 19066 // at the exact same location. 19067 // 19068 // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled. 
19069 SDValue SingleSource = SDValue(); 19070 unsigned PartNumElem = 19071 N->getOperand(0).getValueType().getVectorMinNumElements(); 19072 19073 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 19074 SDValue Op = N->getOperand(i); 19075 19076 if (Op.isUndef()) 19077 continue; 19078 19079 // Check if this is the identity extract: 19080 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) 19081 return SDValue(); 19082 19083 // Find the single incoming vector for the extract_subvector. 19084 if (SingleSource.getNode()) { 19085 if (Op.getOperand(0) != SingleSource) 19086 return SDValue(); 19087 } else { 19088 SingleSource = Op.getOperand(0); 19089 19090 // Check the source type is the same as the type of the result. 19091 // If not, this concat may extend the vector, so we can not 19092 // optimize it away. 19093 if (SingleSource.getValueType() != N->getValueType(0)) 19094 return SDValue(); 19095 } 19096 19097 // Check that we are reading from the identity index. 19098 unsigned IdentityIndex = i * PartNumElem; 19099 if (Op.getConstantOperandAPInt(1) != IdentityIndex) 19100 return SDValue(); 19101 } 19102 19103 if (SingleSource.getNode()) 19104 return SingleSource; 19105 19106 return SDValue(); 19107 } 19108 19109 // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find 19110 // if the subvector can be sourced for free. 
19111 static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) { 19112 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && 19113 V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) { 19114 return V.getOperand(1); 19115 } 19116 auto *IndexC = dyn_cast<ConstantSDNode>(Index); 19117 if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS && 19118 V.getOperand(0).getValueType() == SubVT && 19119 (IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) { 19120 uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements(); 19121 return V.getOperand(SubIdx); 19122 } 19123 return SDValue(); 19124 } 19125 19126 static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, 19127 SelectionDAG &DAG) { 19128 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19129 SDValue BinOp = Extract->getOperand(0); 19130 unsigned BinOpcode = BinOp.getOpcode(); 19131 if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1) 19132 return SDValue(); 19133 19134 EVT VecVT = BinOp.getValueType(); 19135 SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1); 19136 if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType()) 19137 return SDValue(); 19138 19139 SDValue Index = Extract->getOperand(1); 19140 EVT SubVT = Extract->getValueType(0); 19141 if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT)) 19142 return SDValue(); 19143 19144 SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT); 19145 SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT); 19146 19147 // TODO: We could handle the case where only 1 operand is being inserted by 19148 // creating an extract of the other operand, but that requires checking 19149 // number of uses and/or costs. 19150 if (!Sub0 || !Sub1) 19151 return SDValue(); 19152 19153 // We are inserting both operands of the wide binop only to extract back 19154 // to the narrow vector size. 
Eliminate all of the insert/extract: 19155 // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y 19156 return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1, 19157 BinOp->getFlags()); 19158 } 19159 19160 /// If we are extracting a subvector produced by a wide binary operator try 19161 /// to use a narrow binary operator and/or avoid concatenation and extraction. 19162 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { 19163 // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share 19164 // some of these bailouts with other transforms. 19165 19166 if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG)) 19167 return V; 19168 19169 // The extract index must be a constant, so we can map it to a concat operand. 19170 auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 19171 if (!ExtractIndexC) 19172 return SDValue(); 19173 19174 // We are looking for an optionally bitcasted wide vector binary operator 19175 // feeding an extract subvector. 19176 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19177 SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0)); 19178 unsigned BOpcode = BinOp.getOpcode(); 19179 if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1) 19180 return SDValue(); 19181 19182 // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be 19183 // reduced to the unary fneg when it is visited, and we probably want to deal 19184 // with fneg in a target-specific way. 19185 if (BOpcode == ISD::FSUB) { 19186 auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true); 19187 if (C && C->getValueAPF().isNegZero()) 19188 return SDValue(); 19189 } 19190 19191 // The binop must be a vector type, so we can extract some fraction of it. 
19192 EVT WideBVT = BinOp.getValueType(); 19193 if (!WideBVT.isVector()) 19194 return SDValue(); 19195 19196 EVT VT = Extract->getValueType(0); 19197 unsigned ExtractIndex = ExtractIndexC->getZExtValue(); 19198 assert(ExtractIndex % VT.getVectorNumElements() == 0 && 19199 "Extract index is not a multiple of the vector length."); 19200 19201 // Bail out if this is not a proper multiple width extraction. 19202 unsigned WideWidth = WideBVT.getSizeInBits(); 19203 unsigned NarrowWidth = VT.getSizeInBits(); 19204 if (WideWidth % NarrowWidth != 0) 19205 return SDValue(); 19206 19207 // Bail out if we are extracting a fraction of a single operation. This can 19208 // occur because we potentially looked through a bitcast of the binop. 19209 unsigned NarrowingRatio = WideWidth / NarrowWidth; 19210 unsigned WideNumElts = WideBVT.getVectorNumElements(); 19211 if (WideNumElts % NarrowingRatio != 0) 19212 return SDValue(); 19213 19214 // Bail out if the target does not support a narrower version of the binop. 19215 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), 19216 WideNumElts / NarrowingRatio); 19217 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) 19218 return SDValue(); 19219 19220 // If extraction is cheap, we don't need to look at the binop operands 19221 // for concat ops. The narrow binop alone makes this transform profitable. 19222 // We can't just reuse the original extract index operand because we may have 19223 // bitcasted. 
19224 unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements(); 19225 unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); 19226 if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) && 19227 BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) { 19228 // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N) 19229 SDLoc DL(Extract); 19230 SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL); 19231 SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 19232 BinOp.getOperand(0), NewExtIndex); 19233 SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 19234 BinOp.getOperand(1), NewExtIndex); 19235 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, 19236 BinOp.getNode()->getFlags()); 19237 return DAG.getBitcast(VT, NarrowBinOp); 19238 } 19239 19240 // Only handle the case where we are doubling and then halving. A larger ratio 19241 // may require more than two narrow binops to replace the wide binop. 19242 if (NarrowingRatio != 2) 19243 return SDValue(); 19244 19245 // TODO: The motivating case for this transform is an x86 AVX1 target. That 19246 // target has temptingly almost legal versions of bitwise logic ops in 256-bit 19247 // flavors, but no other 256-bit integer support. This could be extended to 19248 // handle any binop, but that may require fixing/adding other folds to avoid 19249 // codegen regressions. 19250 if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) 19251 return SDValue(); 19252 19253 // We need at least one concatenation operation of a binop operand to make 19254 // this transform worthwhile. The concat must double the input vector sizes. 
19255 auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue { 19256 if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2) 19257 return V.getOperand(ConcatOpNum); 19258 return SDValue(); 19259 }; 19260 SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0))); 19261 SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1))); 19262 19263 if (SubVecL || SubVecR) { 19264 // If a binop operand was not the result of a concat, we must extract a 19265 // half-sized operand for our new narrow binop: 19266 // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN 19267 // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC) 19268 // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN 19269 SDLoc DL(Extract); 19270 SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL); 19271 SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL) 19272 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 19273 BinOp.getOperand(0), IndexC); 19274 19275 SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR) 19276 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, 19277 BinOp.getOperand(1), IndexC); 19278 19279 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); 19280 return DAG.getBitcast(VT, NarrowBinOp); 19281 } 19282 19283 return SDValue(); 19284 } 19285 19286 /// If we are extracting a subvector from a wide vector load, convert to a 19287 /// narrow load to eliminate the extraction: 19288 /// (extract_subvector (load wide vector)) --> (load narrow vector) 19289 static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { 19290 // TODO: Add support for big-endian. The offset calculation must be adjusted. 
19291 if (DAG.getDataLayout().isBigEndian()) 19292 return SDValue(); 19293 19294 auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0)); 19295 auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1)); 19296 if (!Ld || Ld->getExtensionType() || !Ld->isSimple() || 19297 !ExtIdx) 19298 return SDValue(); 19299 19300 // Allow targets to opt-out. 19301 EVT VT = Extract->getValueType(0); 19302 19303 // We can only create byte sized loads. 19304 if (!VT.isByteSized()) 19305 return SDValue(); 19306 19307 unsigned Index = ExtIdx->getZExtValue(); 19308 unsigned NumElts = VT.getVectorNumElements(); 19309 19310 // If the index is a multiple of the extract element count, we can offset the 19311 // address by the store size multiplied by the subvector index. Otherwise if 19312 // the scalar type is byte sized, we can just use the index multiplied by 19313 // the element size in bytes as the offset. 19314 unsigned Offset; 19315 if (Index % NumElts == 0) 19316 Offset = (Index / NumElts) * VT.getStoreSize(); 19317 else if (VT.getScalarType().isByteSized()) 19318 Offset = Index * VT.getScalarType().getStoreSize(); 19319 else 19320 return SDValue(); 19321 19322 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19323 if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) 19324 return SDValue(); 19325 19326 // The narrow load will be offset from the base address of the old load if 19327 // we are extracting from something besides index 0 (little-endian). 19328 SDLoc DL(Extract); 19329 SDValue BaseAddr = Ld->getBasePtr(); 19330 19331 // TODO: Use "BaseIndexOffset" to make this more effective. 
19332 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); 19333 MachineFunction &MF = DAG.getMachineFunction(); 19334 MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, 19335 VT.getStoreSize()); 19336 SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); 19337 DAG.makeEquivalentMemoryOrdering(Ld, NewLd); 19338 return NewLd; 19339 } 19340 19341 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { 19342 EVT NVT = N->getValueType(0); 19343 SDValue V = N->getOperand(0); 19344 uint64_t ExtIdx = N->getConstantOperandVal(1); 19345 19346 // Extract from UNDEF is UNDEF. 19347 if (V.isUndef()) 19348 return DAG.getUNDEF(NVT); 19349 19350 if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) 19351 if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) 19352 return NarrowLoad; 19353 19354 // Combine an extract of an extract into a single extract_subvector. 19355 // ext (ext X, C), 0 --> ext X, C 19356 if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) { 19357 if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(), 19358 V.getConstantOperandVal(1)) && 19359 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { 19360 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0), 19361 V.getOperand(1)); 19362 } 19363 } 19364 19365 // Try to move vector bitcast after extract_subv by scaling extraction index: 19366 // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') 19367 if (V.getOpcode() == ISD::BITCAST && 19368 V.getOperand(0).getValueType().isVector()) { 19369 SDValue SrcOp = V.getOperand(0); 19370 EVT SrcVT = SrcOp.getValueType(); 19371 unsigned SrcNumElts = SrcVT.getVectorMinNumElements(); 19372 unsigned DestNumElts = V.getValueType().getVectorMinNumElements(); 19373 if ((SrcNumElts % DestNumElts) == 0) { 19374 unsigned SrcDestRatio = SrcNumElts / DestNumElts; 19375 ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio; 
19376 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), 19377 NewExtEC); 19378 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 19379 SDLoc DL(N); 19380 SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL); 19381 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 19382 V.getOperand(0), NewIndex); 19383 return DAG.getBitcast(NVT, NewExtract); 19384 } 19385 } 19386 if ((DestNumElts % SrcNumElts) == 0) { 19387 unsigned DestSrcRatio = DestNumElts / SrcNumElts; 19388 if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) { 19389 ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio; 19390 EVT ScalarVT = SrcVT.getScalarType(); 19391 if ((ExtIdx % DestSrcRatio) == 0) { 19392 SDLoc DL(N); 19393 unsigned IndexValScaled = ExtIdx / DestSrcRatio; 19394 EVT NewExtVT = 19395 EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC); 19396 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { 19397 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 19398 SDValue NewExtract = 19399 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, 19400 V.getOperand(0), NewIndex); 19401 return DAG.getBitcast(NVT, NewExtract); 19402 } 19403 if (NewExtEC == 1 && 19404 TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { 19405 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); 19406 SDValue NewExtract = 19407 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, 19408 V.getOperand(0), NewIndex); 19409 return DAG.getBitcast(NVT, NewExtract); 19410 } 19411 } 19412 } 19413 } 19414 } 19415 19416 if (V.getOpcode() == ISD::CONCAT_VECTORS) { 19417 unsigned ExtNumElts = NVT.getVectorMinNumElements(); 19418 EVT ConcatSrcVT = V.getOperand(0).getValueType(); 19419 assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() && 19420 "Concat and extract subvector do not change element type"); 19421 assert((ExtIdx % ExtNumElts) == 0 && 19422 "Extract 
index is not a multiple of the input vector length."); 19423 19424 unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements(); 19425 unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts; 19426 19427 // If the concatenated source types match this extract, it's a direct 19428 // simplification: 19429 // extract_subvec (concat V1, V2, ...), i --> Vi 19430 if (ConcatSrcNumElts == ExtNumElts) 19431 return V.getOperand(ConcatOpIdx); 19432 19433 // If the concatenated source vectors are a multiple length of this extract, 19434 // then extract a fraction of one of those source vectors directly from a 19435 // concat operand. Example: 19436 // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 --> 19437 // v2i8 extract_subvec v8i8 Y, 6 19438 if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) { 19439 SDLoc DL(N); 19440 unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts; 19441 assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts && 19442 "Trying to extract from >1 concat operand?"); 19443 assert(NewExtIdx % ExtNumElts == 0 && 19444 "Extract index is not a multiple of the input vector length."); 19445 SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL); 19446 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, 19447 V.getOperand(ConcatOpIdx), NewIndexC); 19448 } 19449 } 19450 19451 V = peekThroughBitcasts(V); 19452 19453 // If the input is a build vector. Try to make a smaller build vector. 19454 if (V.getOpcode() == ISD::BUILD_VECTOR) { 19455 EVT InVT = V.getValueType(); 19456 unsigned ExtractSize = NVT.getSizeInBits(); 19457 unsigned EltSize = InVT.getScalarSizeInBits(); 19458 // Only do this if we won't split any elements. 19459 if (ExtractSize % EltSize == 0) { 19460 unsigned NumElems = ExtractSize / EltSize; 19461 EVT EltVT = InVT.getVectorElementType(); 19462 EVT ExtractVT = 19463 NumElems == 1 ? 
EltVT 19464 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems); 19465 if ((Level < AfterLegalizeDAG || 19466 (NumElems == 1 || 19467 TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) && 19468 (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { 19469 unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize; 19470 19471 if (NumElems == 1) { 19472 SDValue Src = V->getOperand(IdxVal); 19473 if (EltVT != Src.getValueType()) 19474 Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); 19475 return DAG.getBitcast(NVT, Src); 19476 } 19477 19478 // Extract the pieces from the original build_vector. 19479 SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), 19480 V->ops().slice(IdxVal, NumElems)); 19481 return DAG.getBitcast(NVT, BuildVec); 19482 } 19483 } 19484 } 19485 19486 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { 19487 // Handle only simple case where vector being inserted and vector 19488 // being extracted are of same size. 19489 EVT SmallVT = V.getOperand(1).getValueType(); 19490 if (!NVT.bitsEq(SmallVT)) 19491 return SDValue(); 19492 19493 // Combine: 19494 // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) 19495 // Into: 19496 // indices are equal or bit offsets are equal => V1 19497 // otherwise => (extract_subvec V1, ExtIdx) 19498 uint64_t InsIdx = V.getConstantOperandVal(2); 19499 if (InsIdx * SmallVT.getScalarSizeInBits() == 19500 ExtIdx * NVT.getScalarSizeInBits()) 19501 return DAG.getBitcast(NVT, V.getOperand(1)); 19502 return DAG.getNode( 19503 ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, 19504 DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), 19505 N->getOperand(1)); 19506 } 19507 19508 if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) 19509 return NarrowBOp; 19510 19511 if (SimplifyDemandedVectorElts(SDValue(N, 0))) 19512 return SDValue(N, 0); 19513 19514 return SDValue(); 19515 } 19516 19517 /// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles 19518 /// followed by concatenation. 
Narrow vector ops may have better performance 19519 /// than wide ops, and this can unlock further narrowing of other vector ops. 19520 /// Targets can invert this transform later if it is not profitable. 19521 static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf, 19522 SelectionDAG &DAG) { 19523 SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1); 19524 if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || 19525 N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 || 19526 !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) 19527 return SDValue(); 19528 19529 // Split the wide shuffle mask into halves. Any mask element that is accessing 19530 // operand 1 is offset down to account for narrowing of the vectors. 19531 ArrayRef<int> Mask = Shuf->getMask(); 19532 EVT VT = Shuf->getValueType(0); 19533 unsigned NumElts = VT.getVectorNumElements(); 19534 unsigned HalfNumElts = NumElts / 2; 19535 SmallVector<int, 16> Mask0(HalfNumElts, -1); 19536 SmallVector<int, 16> Mask1(HalfNumElts, -1); 19537 for (unsigned i = 0; i != NumElts; ++i) { 19538 if (Mask[i] == -1) 19539 continue; 19540 int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts; 19541 if (i < HalfNumElts) 19542 Mask0[i] = M; 19543 else 19544 Mask1[i - HalfNumElts] = M; 19545 } 19546 19547 // Ask the target if this is a valid transform. 
19548 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 19549 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 19550 HalfNumElts); 19551 if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) || 19552 !TLI.isShuffleMaskLegal(Mask1, HalfVT)) 19553 return SDValue(); 19554 19555 // shuffle (concat X, undef), (concat Y, undef), Mask --> 19556 // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) 19557 SDValue X = N0.getOperand(0), Y = N1.getOperand(0); 19558 SDLoc DL(Shuf); 19559 SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0); 19560 SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1); 19561 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1); 19562 } 19563 19564 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, 19565 // or turn a shuffle of a single concat into simpler shuffle then concat. 19566 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { 19567 EVT VT = N->getValueType(0); 19568 unsigned NumElts = VT.getVectorNumElements(); 19569 19570 SDValue N0 = N->getOperand(0); 19571 SDValue N1 = N->getOperand(1); 19572 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 19573 ArrayRef<int> Mask = SVN->getMask(); 19574 19575 SmallVector<SDValue, 4> Ops; 19576 EVT ConcatVT = N0.getOperand(0).getValueType(); 19577 unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); 19578 unsigned NumConcats = NumElts / NumElemsPerConcat; 19579 19580 auto IsUndefMaskElt = [](int i) { return i == -1; }; 19581 19582 // Special case: shuffle(concat(A,B)) can be more efficiently represented 19583 // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high 19584 // half vector elements. 
19585 if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() && 19586 llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat), 19587 IsUndefMaskElt)) { 19588 N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), 19589 N0.getOperand(1), 19590 Mask.slice(0, NumElemsPerConcat)); 19591 N1 = DAG.getUNDEF(ConcatVT); 19592 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); 19593 } 19594 19595 // Look at every vector that's inserted. We're looking for exact 19596 // subvector-sized copies from a concatenated vector 19597 for (unsigned I = 0; I != NumConcats; ++I) { 19598 unsigned Begin = I * NumElemsPerConcat; 19599 ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat); 19600 19601 // Make sure we're dealing with a copy. 19602 if (llvm::all_of(SubMask, IsUndefMaskElt)) { 19603 Ops.push_back(DAG.getUNDEF(ConcatVT)); 19604 continue; 19605 } 19606 19607 int OpIdx = -1; 19608 for (int i = 0; i != (int)NumElemsPerConcat; ++i) { 19609 if (IsUndefMaskElt(SubMask[i])) 19610 continue; 19611 if ((SubMask[i] % (int)NumElemsPerConcat) != i) 19612 return SDValue(); 19613 int EltOpIdx = SubMask[i] / NumElemsPerConcat; 19614 if (0 <= OpIdx && EltOpIdx != OpIdx) 19615 return SDValue(); 19616 OpIdx = EltOpIdx; 19617 } 19618 assert(0 <= OpIdx && "Unknown concat_vectors op"); 19619 19620 if (OpIdx < (int)N0.getNumOperands()) 19621 Ops.push_back(N0.getOperand(OpIdx)); 19622 else 19623 Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands())); 19624 } 19625 19626 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); 19627 } 19628 19629 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - 19630 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. 19631 // 19632 // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always 19633 // a simplification in some sense, but it isn't appropriate in general: some 19634 // BUILD_VECTORs are substantially cheaper than others. 
The general case 19635 // of a BUILD_VECTOR requires inserting each element individually (or 19636 // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of 19637 // all constants is a single constant pool load. A BUILD_VECTOR where each 19638 // element is identical is a splat. A BUILD_VECTOR where most of the operands 19639 // are undef lowers to a small number of element insertions. 19640 // 19641 // To deal with this, we currently use a bunch of mostly arbitrary heuristics. 19642 // We don't fold shuffles where one side is a non-zero constant, and we don't 19643 // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate 19644 // non-constant operands. This seems to work out reasonably well in practice. 19645 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, 19646 SelectionDAG &DAG, 19647 const TargetLowering &TLI) { 19648 EVT VT = SVN->getValueType(0); 19649 unsigned NumElts = VT.getVectorNumElements(); 19650 SDValue N0 = SVN->getOperand(0); 19651 SDValue N1 = SVN->getOperand(1); 19652 19653 if (!N0->hasOneUse()) 19654 return SDValue(); 19655 19656 // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as 19657 // discussed above. 19658 if (!N1.isUndef()) { 19659 if (!N1->hasOneUse()) 19660 return SDValue(); 19661 19662 bool N0AnyConst = isAnyConstantBuildVector(N0); 19663 bool N1AnyConst = isAnyConstantBuildVector(N1); 19664 if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) 19665 return SDValue(); 19666 if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode())) 19667 return SDValue(); 19668 } 19669 19670 // If both inputs are splats of the same value then we can safely merge this 19671 // to a single BUILD_VECTOR with undef elements based on the shuffle mask. 
19672 bool IsSplat = false; 19673 auto *BV0 = dyn_cast<BuildVectorSDNode>(N0); 19674 auto *BV1 = dyn_cast<BuildVectorSDNode>(N1); 19675 if (BV0 && BV1) 19676 if (SDValue Splat0 = BV0->getSplatValue()) 19677 IsSplat = (Splat0 == BV1->getSplatValue()); 19678 19679 SmallVector<SDValue, 8> Ops; 19680 SmallSet<SDValue, 16> DuplicateOps; 19681 for (int M : SVN->getMask()) { 19682 SDValue Op = DAG.getUNDEF(VT.getScalarType()); 19683 if (M >= 0) { 19684 int Idx = M < (int)NumElts ? M : M - NumElts; 19685 SDValue &S = (M < (int)NumElts ? N0 : N1); 19686 if (S.getOpcode() == ISD::BUILD_VECTOR) { 19687 Op = S.getOperand(Idx); 19688 } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { 19689 SDValue Op0 = S.getOperand(0); 19690 Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType()); 19691 } else { 19692 // Operand can't be combined - bail out. 19693 return SDValue(); 19694 } 19695 } 19696 19697 // Don't duplicate a non-constant BUILD_VECTOR operand unless we're 19698 // generating a splat; semantically, this is fine, but it's likely to 19699 // generate low-quality code if the target can't reconstruct an appropriate 19700 // shuffle. 19701 if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op)) 19702 if (!IsSplat && !DuplicateOps.insert(Op).second) 19703 return SDValue(); 19704 19705 Ops.push_back(Op); 19706 } 19707 19708 // BUILD_VECTOR requires all inputs to be of the same type, find the 19709 // maximum type and extend them all. 19710 EVT SVT = VT.getScalarType(); 19711 if (SVT.isInteger()) 19712 for (SDValue &Op : Ops) 19713 SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); 19714 if (SVT != VT.getScalarType()) 19715 for (SDValue &Op : Ops) 19716 Op = TLI.isZExtFree(Op.getValueType(), SVT) 19717 ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) 19718 : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); 19719 return DAG.getBuildVector(VT, SDLoc(SVN), Ops); 19720 } 19721 19722 // Match shuffles that can be converted to any_vector_extend_in_reg. 
19723 // This is often generated during legalization. 19724 // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) 19725 // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. 19726 static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, 19727 SelectionDAG &DAG, 19728 const TargetLowering &TLI, 19729 bool LegalOperations) { 19730 EVT VT = SVN->getValueType(0); 19731 bool IsBigEndian = DAG.getDataLayout().isBigEndian(); 19732 19733 // TODO Add support for big-endian when we have a test case. 19734 if (!VT.isInteger() || IsBigEndian) 19735 return SDValue(); 19736 19737 unsigned NumElts = VT.getVectorNumElements(); 19738 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 19739 ArrayRef<int> Mask = SVN->getMask(); 19740 SDValue N0 = SVN->getOperand(0); 19741 19742 // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) 19743 auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { 19744 for (unsigned i = 0; i != NumElts; ++i) { 19745 if (Mask[i] < 0) 19746 continue; 19747 if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) 19748 continue; 19749 return false; 19750 } 19751 return true; 19752 }; 19753 19754 // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for 19755 // power-of-2 extensions as they are the most likely. 19756 for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { 19757 // Check for non power of 2 vector sizes 19758 if (NumElts % Scale != 0) 19759 continue; 19760 if (!isAnyExtend(Scale)) 19761 continue; 19762 19763 EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); 19764 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); 19765 // Never create an illegal type. Only create unsupported operations if we 19766 // are pre-legalization. 
19767 if (TLI.isTypeLegal(OutVT)) 19768 if (!LegalOperations || 19769 TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) 19770 return DAG.getBitcast(VT, 19771 DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, 19772 SDLoc(SVN), OutVT, N0)); 19773 } 19774 19775 return SDValue(); 19776 } 19777 19778 // Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of 19779 // each source element of a large type into the lowest elements of a smaller 19780 // destination type. This is often generated during legalization. 19781 // If the source node itself was a '*_extend_vector_inreg' node then we should 19782 // then be able to remove it. 19783 static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, 19784 SelectionDAG &DAG) { 19785 EVT VT = SVN->getValueType(0); 19786 bool IsBigEndian = DAG.getDataLayout().isBigEndian(); 19787 19788 // TODO Add support for big-endian when we have a test case. 19789 if (!VT.isInteger() || IsBigEndian) 19790 return SDValue(); 19791 19792 SDValue N0 = peekThroughBitcasts(SVN->getOperand(0)); 19793 19794 unsigned Opcode = N0.getOpcode(); 19795 if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && 19796 Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && 19797 Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) 19798 return SDValue(); 19799 19800 SDValue N00 = N0.getOperand(0); 19801 ArrayRef<int> Mask = SVN->getMask(); 19802 unsigned NumElts = VT.getVectorNumElements(); 19803 unsigned EltSizeInBits = VT.getScalarSizeInBits(); 19804 unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); 19805 unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); 19806 19807 if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) 19808 return SDValue(); 19809 unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; 19810 19811 // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> 19812 // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> 19813 // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> 19814 auto 
isTruncate = [&Mask, &NumElts](unsigned Scale) {
    // A "truncate" mask keeps only elements i*Scale of the source, in order:
    // every defined mask element i must read lane i*Scale (and stay in range).
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
        continue;
      return false;
    }
    return true;
  };

  // At the moment we just handle the case where we've truncated back to the
  // same size as before the extension.
  // TODO: handle more extension/truncation cases as cases arise.
  if (EltSizeInBits != ExtSrcSizeInBits)
    return SDValue();

  // We can remove *extend_vector_inreg only if the truncation happens at
  // the same scale as the extension.
  if (isTruncate(ExtScale))
    return DAG.getBitcast(VT, N00);

  return SDValue();
}

// Combine shuffles of splat-shuffles of the form:
// shuffle (shuffle V, undef, splat-mask), undef, M
// If splat-mask contains undef elements, we need to be careful about
// introducing undef's in the folded mask which are not the result of composing
// the masks of the shuffles.
static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
                                        SelectionDAG &DAG) {
  // Only handle a unary outer shuffle whose single input is a splat-shuffle.
  if (!Shuf->getOperand(1).isUndef())
    return SDValue();
  auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
  if (!Splat || !Splat->isSplat())
    return SDValue();

  ArrayRef<int> ShufMask = Shuf->getMask();
  ArrayRef<int> SplatMask = Splat->getMask();
  assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");

  // Prefer simplifying to the splat-shuffle, if possible. This is legal if
  // every undef mask element in the splat-shuffle has a corresponding undef
  // element in the user-shuffle's mask or if the composition of mask elements
  // would result in undef.
  // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
  // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
  //   In this case it is not legal to simplify to the splat-shuffle because we
  //   may be exposing the users of the shuffle an undef element at index 1
  //   which was not there before the combine.
  // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
  //   In this case the composition of masks yields SplatMask, so it's ok to
  //   simplify to the splat-shuffle.
  // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
  //   In this case the composed mask includes all undef elements of SplatMask
  //   and in addition sets element zero to undef. It is safe to simplify to
  //   the splat-shuffle.
  auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
                                       ArrayRef<int> SplatMask) {
    // Reject only lanes where the user reads a defined splat lane while the
    // same position in the splat-shuffle is undef (a new undef would leak).
    for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
      if (UserMask[i] != -1 && SplatMask[i] == -1 &&
          SplatMask[UserMask[i]] != -1)
        return false;
    return true;
  };
  if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
    return Shuf->getOperand(0);

  // Create a new shuffle with a mask that is composed of the two shuffles'
  // masks.
  SmallVector<int, 32> NewMask;
  for (int Idx : ShufMask)
    NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);

  return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
                              Splat->getOperand(0), Splat->getOperand(1),
                              NewMask);
}

/// Combine shuffle of shuffle of the form:
/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
                                     SelectionDAG &DAG) {
  // Both shuffles must be unary (second operand undef).
  if (!OuterShuf->getOperand(1).isUndef())
    return SDValue();
  auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
  if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
    return SDValue();

  ArrayRef<int> OuterMask = OuterShuf->getMask();
  ArrayRef<int> InnerMask = InnerShuf->getMask();
  unsigned NumElts = OuterMask.size();
  assert(NumElts == InnerMask.size() && "Mask length mismatch");
  SmallVector<int, 32> CombinedMask(NumElts, -1);
  int SplatIndex = -1;
  for (unsigned i = 0; i != NumElts; ++i) {
    // Undef lanes remain undef.
    int OuterMaskElt = OuterMask[i];
    if (OuterMaskElt == -1)
      continue;

    // Peek through the shuffle masks to get the underlying source element.
    int InnerMaskElt = InnerMask[OuterMaskElt];
    if (InnerMaskElt == -1)
      continue;

    // Initialize the splatted element.
    if (SplatIndex == -1)
      SplatIndex = InnerMaskElt;

    // Non-matching index - this is not a splat.
    if (SplatIndex != InnerMaskElt)
      return SDValue();

    CombinedMask[i] = InnerMaskElt;
  }
  assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
          getSplatIndex(CombinedMask) != -1) &&
         "Expected a splat mask");

  // TODO: The transform may be a win even if the mask is not legal.
EVT VT = OuterShuf->getValueType(0);
  assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
    return SDValue();

  return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
                              InnerShuf->getOperand(1), CombinedMask);
}

/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
/// from the first operand. Otherwise, return -1.
static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
  int MaskSize = Mask.size();
  int EltFromOp0 = -1;
  // TODO: This does not match if there are undef elements in the shuffle mask.
  // Should we ignore undefs in the shuffle mask instead? The trade-off is
  // removing an instruction (a shuffle), but losing the knowledge that some
  // vector lanes are not needed.
  for (int i = 0; i != MaskSize; ++i) {
    if (Mask[i] >= 0 && Mask[i] < MaskSize) {
      // We're looking for a shuffle of exactly one element from operand 0.
      if (EltFromOp0 != -1)
        return -1;
      EltFromOp0 = i;
    } else if (Mask[i] != i + MaskSize) {
      // Nothing from operand 1 can change lanes.
      return -1;
    }
  }
  return EltFromOp0;
}

/// If a shuffle inserts exactly one element from a source vector operand into
/// another vector operand and we can access the specified element as a scalar,
/// then we can eliminate the shuffle.
static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
                                      SelectionDAG &DAG) {
  // First, check if we are taking one element of a vector and shuffling that
  // element into another vector.
  ArrayRef<int> Mask = Shuf->getMask();
  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
  SDValue Op0 = Shuf->getOperand(0);
  SDValue Op1 = Shuf->getOperand(1);
  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
  if (ShufOp0Index == -1) {
    // Commute mask and check again.
    ShuffleVectorSDNode::commuteMask(CommutedMask);
    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
    if (ShufOp0Index == -1)
      return SDValue();
    // Commute operands to match the commuted shuffle mask.
    std::swap(Op0, Op1);
    Mask = CommutedMask;
  }

  // The shuffle inserts exactly one element from operand 0 into operand 1.
  // Now see if we can access that element as a scalar via a real insert element
  // instruction.
  // TODO: We can try harder to locate the element as a scalar. Examples: it
  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
         "Shuffle mask value must be from operand 0");
  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
    return SDValue();

  // The shuffle must choose exactly the lane the insert wrote; otherwise the
  // inserted scalar is not the element being moved.
  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
    return SDValue();

  // There's an existing insertelement with constant insertion index, so we
  // don't need to check the legality/profitability of a replacement operation
  // that differs at most in the constant value. The target should be able to
  // lower any of those in a similar way. If not, legalization will expand this
  // to a scalar-to-vector plus shuffle.
  //
  // Note that the shuffle may move the scalar from the position that the insert
  // element used. Therefore, our new insert element occurs at the shuffle's
  // mask index value, not the insert's index value.
  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
  SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
                     Op1, Op0.getOperand(1), NewInsIndex);
}

/// If we have a unary shuffle of a shuffle, see if it can be folded away
/// completely. This has the potential to lose undef knowledge because the first
/// shuffle may not have an undef mask element where the second one does. So
/// only call this after doing simplifications based on demanded elements.
static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
  // shuf (shuf0 X, Y, Mask0), undef, Mask
  auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
  if (!Shuf0 || !Shuf->getOperand(1).isUndef())
    return SDValue();

  ArrayRef<int> Mask = Shuf->getMask();
  ArrayRef<int> Mask0 = Shuf0->getMask();
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    // Ignore undef elements.
    if (Mask[i] == -1)
      continue;
    assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");

    // Is the element of the shuffle operand chosen by this shuffle the same as
    // the element chosen by the shuffle operand itself?
    if (Mask0[Mask[i]] != Mask0[i])
      return SDValue();
  }
  // Every element of this shuffle is identical to the result of the previous
  // shuffle, so we can replace this value.
return Shuf->getOperand(0);
}

/// Main combine entry point for VECTOR_SHUFFLE: canonicalizes operands/mask
/// and tries a sequence of independent shuffle folds, returning the first that
/// fires (or an empty SDValue if none apply).
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");

  // Canonicalize shuffle undef, undef -> undef
  if (N0.isUndef() && N1.isUndef())
    return DAG.getUNDEF(VT);

  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

  // Canonicalize shuffle v, v -> v, undef
  if (N0 == N1) {
    // Remap all second-operand indices into the first operand.
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx >= (int)NumElts) Idx -= NumElts;
      NewMask.push_back(Idx);
    }
    return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
  }

  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
  if (N0.isUndef())
    return DAG.getCommutedVectorShuffle(*SVN);

  // Remove references to rhs if it is undef
  if (N1.isUndef()) {
    bool Changed = false;
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx >= (int)NumElts) {
        // Reading from an undef operand yields undef.
        Idx = -1;
        Changed = true;
      }
      NewMask.push_back(Idx);
    }
    if (Changed)
      return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
  }

  if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
    return InsElt;

  // A shuffle of a single vector that is a splatted value can always be folded.
  if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
    return V;

  if (SDValue V = formSplatFromShuffles(SVN, DAG))
    return V;

  // If it is a splat, check if the argument vector is another splat or a
  // build_vector.
  if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
    int SplatIndex = SVN->getSplatIndex();
    if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
        TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) {
      // splat (vector_bo L, R), Index -->
      // splat (scalar_bo (extelt L, Index), (extelt R, Index))
      SDValue L = N0.getOperand(0), R = N0.getOperand(1);
      SDLoc DL(N);
      EVT EltVT = VT.getScalarType();
      SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
      SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
      SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
      SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
                                  N0.getNode()->getFlags());
      SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
      SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
      return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
    }

    // If this is a bit convert that changes the element type of the vector but
    // not the number of vector elements, look through it. Be careful not to
    // look through conversions that change things like v4f32 to v2f64.
    SDNode *V = N0.getNode();
    if (V->getOpcode() == ISD::BITCAST) {
      SDValue ConvInput = V->getOperand(0);
      if (ConvInput.getValueType().isVector() &&
          ConvInput.getValueType().getVectorNumElements() == NumElts)
        V = ConvInput.getNode();
    }

    if (V->getOpcode() == ISD::BUILD_VECTOR) {
      assert(V->getNumOperands() == NumElts &&
             "BUILD_VECTOR has wrong number of operands");
      SDValue Base;
      bool AllSame = true;
      // Find the first non-undef operand to compare all others against.
      for (unsigned i = 0; i != NumElts; ++i) {
        if (!V->getOperand(i).isUndef()) {
          Base = V->getOperand(i);
          break;
        }
      }
      // Splat of <u, u, u, u>, return <u, u, u, u>
      if (!Base.getNode())
        return N0;
      for (unsigned i = 0; i != NumElts; ++i) {
        if (V->getOperand(i) != Base) {
          AllSame = false;
          break;
        }
      }
      // Splat of <x, x, x, x>, return <x, x, x, x>
      if (AllSame)
        return N0;

      // Canonicalize any other splat as a build_vector.
      SDValue Splatted = V->getOperand(SplatIndex);
      SmallVector<SDValue, 8> Ops(NumElts, Splatted);
      SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);

      // We may have jumped through bitcasts, so the type of the
      // BUILD_VECTOR may not match the type of the shuffle.
      if (V->getValueType(0) != VT)
        NewBV = DAG.getBitcast(VT, NewBV);
      return NewBV;
    }
  }

  // Simplify source operands based on shuffle mask.
  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  // This is intentionally placed after demanded elements simplification because
  // it could eliminate knowledge of undef elements created by this shuffle.
  if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
    return ShufOp;

  // Match shuffles that can be converted to any_vector_extend_in_reg.
  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
    return V;

  // Combine "truncate_vector_in_reg" style shuffles.
  if (SDValue V = combineTruncationShuffle(SVN, DAG))
    return V;

  if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
      Level < AfterLegalizeVectorOps &&
      (N1.isUndef() ||
      (N1.getOpcode() == ISD::CONCAT_VECTORS &&
       N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
    if (SDValue V = partitionShuffleOfConcats(N, DAG))
      return V;
  }

  // A shuffle of a concat of the same narrow vector can be reduced to use
  // only low-half elements of a concat with undef:
  // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
      N0.getNumOperands() == 2 &&
      N0.getOperand(0) == N0.getOperand(1)) {
    int HalfNumElts = (int)NumElts / 2;
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx >= HalfNumElts) {
        // Both halves are X, so high-half indices can be folded to low-half.
        assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
        Idx -= HalfNumElts;
      }
      NewMask.push_back(Idx);
    }
    if (TLI.isShuffleMaskLegal(NewMask, VT)) {
      SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
      SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                   N0.getOperand(0), UndefVec);
      return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
    }
  }

  // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
  // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
    if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
      return Res;

  // If this shuffle only has a single input that is a bitcasted shuffle,
  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
  // back to their original types.
  if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
      N1.isUndef() && Level < AfterLegalizeVectorOps &&
      TLI.isTypeLegal(VT)) {

    SDValue BC0 = peekThroughOneUseBitcasts(N0);
    if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
      EVT SVT = VT.getScalarType();
      EVT InnerVT = BC0->getValueType(0);
      EVT InnerSVT = InnerVT.getScalarType();

      // Determine which shuffle works with the smaller scalar type.
      EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
      EVT ScaleSVT = ScaleVT.getScalarType();

      // Both scalar sizes must be whole multiples of the common scalar size
      // so the masks can be rescaled losslessly.
      if (TLI.isTypeLegal(ScaleVT) &&
          0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
          0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
        int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
        int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();

        // Scale the shuffle masks to the smaller scalar type.
        ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
        SmallVector<int, 8> InnerMask;
        SmallVector<int, 8> OuterMask;
        narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
        narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);

        // Merge the shuffle masks.
        SmallVector<int, 8> NewMask;
        for (int M : OuterMask)
          NewMask.push_back(M < 0 ? -1 : InnerMask[M]);

        // Test for shuffle mask legality over both commutations.
        SDValue SV0 = BC0->getOperand(0);
        SDValue SV1 = BC0->getOperand(1);
        bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
        if (!LegalMask) {
          std::swap(SV0, SV1);
          ShuffleVectorSDNode::commuteMask(NewMask);
          LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
        }

        if (LegalMask) {
          SV0 = DAG.getBitcast(ScaleVT, SV0);
          SV1 = DAG.getBitcast(ScaleVT, SV1);
          return DAG.getBitcast(
              VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
        }
      }
    }
  }

  // Canonicalize shuffles according to rules:
  //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
  //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
  //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
      N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
      TLI.isTypeLegal(VT)) {
    // The incoming shuffle must be of the same type as the result of the
    // current shuffle.
    assert(N1->getOperand(0).getValueType() == VT &&
           "Shuffle types don't match");

    SDValue SV0 = N1->getOperand(0);
    SDValue SV1 = N1->getOperand(1);
    bool HasSameOp0 = N0 == SV0;
    bool IsSV1Undef = SV1.isUndef();
    if (HasSameOp0 || IsSV1Undef || N0 == SV1)
      // Commute the operands of this shuffle so that next rule
      // will trigger.
      return DAG.getCommutedVectorShuffle(*SVN);
  }

  // Try to fold according to rules:
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
  // Don't try to fold shuffles with illegal type.
  // Only fold if this shuffle is the only user of the other shuffle.
  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
      Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);

    // Don't try to fold splats; they're likely to simplify somehow, or they
    // might be free.
    if (OtherSV->isSplat())
      return SDValue();

    // The incoming shuffle must be of the same type as the result of the
    // current shuffle.
    assert(OtherSV->getOperand(0).getValueType() == VT &&
           "Shuffle types don't match");

    SDValue SV0, SV1;
    SmallVector<int, 4> Mask;
    // Compute the combined shuffle mask for a shuffle with SV0 as the first
    // operand, and SV1 as the second operand.
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx < 0) {
        // Propagate Undef.
        Mask.push_back(Idx);
        continue;
      }

      SDValue CurrentVec;
      if (Idx < (int)NumElts) {
        // This shuffle index refers to the inner shuffle N0. Lookup the inner
        // shuffle mask to identify which vector is actually referenced.
        Idx = OtherSV->getMaskElt(Idx);
        if (Idx < 0) {
          // Propagate Undef.
          Mask.push_back(Idx);
          continue;
        }

        CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
                                           : OtherSV->getOperand(1);
      } else {
        // This shuffle index references an element within N1.
        CurrentVec = N1;
      }

      // Simple case where 'CurrentVec' is UNDEF.
      if (CurrentVec.isUndef()) {
        Mask.push_back(-1);
        continue;
      }

      // Canonicalize the shuffle index. We don't know yet if CurrentVec
      // will be the first or second operand of the combined shuffle.
      Idx = Idx % NumElts;
      if (!SV0.getNode() || SV0 == CurrentVec) {
        // Ok. CurrentVec is the left hand side.
        // Update the mask accordingly.
        SV0 = CurrentVec;
        Mask.push_back(Idx);
        continue;
      }

      // Bail out if we cannot convert the shuffle pair into a single shuffle.
      if (SV1.getNode() && SV1 != CurrentVec)
        return SDValue();

      // Ok. CurrentVec is the right hand side.
      // Update the mask accordingly.
      SV1 = CurrentVec;
      Mask.push_back(Idx + NumElts);
    }

    // Check if all indices in Mask are Undef. If so, propagate Undef.
    bool isUndefMask = true;
    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
      isUndefMask &= Mask[i] < 0;

    if (isUndefMask)
      return DAG.getUNDEF(VT);

    if (!SV0.getNode())
      SV0 = DAG.getUNDEF(VT);
    if (!SV1.getNode())
      SV1 = DAG.getUNDEF(VT);

    // Avoid introducing shuffles with illegal mask.
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
    return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG);
  }

  if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
    return V;

  return SDValue();
}

SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
  SDValue InVal = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
  // with a VECTOR_SHUFFLE and possible truncate.
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      VT.isFixedLengthVector() &&
      InVal->getOperand(0).getValueType().isFixedLengthVector()) {
    SDValue InVec = InVal->getOperand(0);
    SDValue EltNo = InVal->getOperand(1);
    auto InVecT = InVec.getValueType();
    if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
      // Build a shuffle that moves the extracted element into lane 0 and
      // leaves all other lanes undef.
      SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
      int Elt = C0->getZExtValue();
      NewMask[0] = Elt;
      // If we have an implicit truncate, do the truncate here as long as it's
      // legal.
      // NOTE(review): the original comment trailed off ("if it's not legal,
      // this should"); presumably the intent is to fall through to the
      // shuffle-based path below when the truncate is not legal - confirm.
      if (VT.getScalarType() != InVal.getValueType() &&
          InVal.getValueType().isScalarInteger() &&
          isTypeLegal(VT.getScalarType())) {
        SDValue Val =
            DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
      }
      if (VT.getScalarType() == InVecT.getScalarType() &&
          VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
        SDValue LegalShuffle =
            TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
                                        DAG.getUNDEF(InVecT), NewMask, DAG);
        if (LegalShuffle) {
          // If the initial vector is the correct size this shuffle is a
          // valid result.
          if (VT == InVecT)
            return LegalShuffle;
          // If not we must truncate the vector.
          if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
            SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
            EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
                                         InVecT.getVectorElementType(),
                                         VT.getVectorNumElements());
            return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
                               LegalShuffle, ZeroIdx);
          }
        }
      }
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  uint64_t InsIdx = N->getConstantOperandVal(2);

  // If inserting an UNDEF, just return the original vector.
  if (N1.isUndef())
    return N0;

  // If this is an insert of an extracted vector into an undef vector, we can
  // just use the input to the extract.
  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
    return N1.getOperand(0);

  // If we are inserting a bitcast value into an undef, with the same
  // number of elements, just use the bitcast input of the extract.
  // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
  //        BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(0).getOperand(1) == N2 &&
      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
          VT.getVectorNumElements() &&
      N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
          VT.getSizeInBits()) {
    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
  }

  // If both N1 and N2 are bitcast values on which insert_subvector
  // would make sense, pull the bitcast through.
  // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
  //        BITCAST (INSERT_SUBVECTOR N0 N1 N2)
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
    SDValue CN0 = N0.getOperand(0);
    SDValue CN1 = N1.getOperand(0);
    EVT CN0VT = CN0.getValueType();
    EVT CN1VT = CN1.getValueType();
    if (CN0VT.isVector() && CN1VT.isVector() &&
        CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
        CN0VT.getVectorNumElements() == VT.getVectorNumElements()) {
      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
                                      CN0.getValueType(), CN0, CN1, N2);
      return DAG.getBitcast(VT, NewINSERT);
    }
  }

  // Combine INSERT_SUBVECTORs where we are inserting to the same index.
  // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
  // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
      N0.getOperand(1).getValueType() == N1.getValueType() &&
      N0.getOperand(2) == N2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
                       N1, N2);

  // Eliminate an intermediate insert into an undef vector:
  // insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
  // insert_subvector undef, X, N2
  if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
      N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
    return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
                       N1.getOperand(1), N2);

  // Push subvector bitcasts to the output, adjusting the index as we go.
  // insert_subvector(bitcast(v), bitcast(s), c1)
  // -> bitcast(insert_subvector(v, s, c2))
  if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
      N1.getOpcode() == ISD::BITCAST) {
    SDValue N0Src = peekThroughBitcasts(N0);
    SDValue N1Src = peekThroughBitcasts(N1);
    EVT N0SrcSVT = N0Src.getValueType().getScalarType();
    EVT N1SrcSVT = N1Src.getValueType().getScalarType();
    if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
        N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
      EVT NewVT;
      SDLoc DL(N);
      SDValue NewIdx;
      LLVMContext &Ctx = *DAG.getContext();
      unsigned NumElts = VT.getVectorNumElements();
      unsigned EltSizeInBits = VT.getScalarSizeInBits();
      // Rescale the element count and insertion index to the source scalar
      // size; only fold when both divide evenly.
      if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
        unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
        NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
        NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
      } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
        unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
        if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
          NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
          NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
        }
      }
      if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
        SDValue Res = DAG.getBitcast(NewVT, N0Src);
        Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
        return DAG.getBitcast(VT, Res);
      }
    }
  }

  // Canonicalize insert_subvector dag nodes.
  // Example:
  // (insert_subvector (insert_subvector A, Idx0), Idx1)
  // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
      N1.getValueType() == N0.getOperand(1).getValueType()) {
    unsigned OtherIdx = N0.getConstantOperandVal(2);
    if (InsIdx < OtherIdx) {
      // Swap nodes.
      SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
                                  N0.getOperand(0), N1, N2);
      AddToWorklist(NewOp.getNode());
      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
                         VT, NewOp, N0.getOperand(1), N0.getOperand(2));
    }
  }

  // If the input vector is a concatenation, and the insert replaces
  // one of the pieces, we can optimize into a single concat_vectors.
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
      N0.getOperand(0).getValueType() == N1.getValueType()) {
    unsigned Factor = N1.getValueType().getVectorNumElements();
    SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
    Ops[InsIdx / Factor] = N1;
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
  }

  // Simplify source operands based on insertion.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold (fp_to_fp16 (fp16_to_fp op)) -> op
  if (N0->getOpcode() == ISD::FP16_TO_FP)
    return N0->getOperand(0);

  return SDValue();
}

SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
  // The mask is redundant: FP16_TO_FP only reads the low 16 bits.
  if (N0->getOpcode() == ISD::AND) {
    ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
    if (AndConst && AndConst->getAPIntValue() == 0xffff) {
      return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
                         N0.getOperand(0));
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N0.getValueType();
  unsigned Opcode = N->getOpcode();

  // VECREDUCE over 1-element vector is just an extract.
  if (VT.getVectorNumElements() == 1) {
    SDLoc dl(N);
    SDValue Res =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
                    DAG.getVectorIdxConstant(0, dl));
    // The reduction may produce a wider result than the element type.
    if (Res.getValueType() != N->getValueType(0))
      Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
    return Res;
  }

  // On a boolean vector an and/or reduction is the same as a umin/umax
  // reduction. Convert them if the latter is legal while the former isn't.
  if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
    unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
        ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
    // All elements must be known all-zeros or all-ones (sign bits fill the
    // element) for the min/max form to be equivalent.
    if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
        TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
        DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
      return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
  }

  return SDValue();
}

/// Returns a vector_shuffle if it is able to transform an AND to a
/// vector_shuffle with the destination vector and a zero vector.
/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
///      vector_shuffle V, Zero, <0, 4, 2, 4>
SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = peekThroughBitcasts(N->getOperand(1));
  SDLoc DL(N);

  // Make sure we're not running after operation legalization where it
  // may have custom lowered the vector shuffles.
  if (LegalOperations)
    return SDValue();

  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  EVT RVT = RHS.getValueType();
  unsigned NumElts = RHS.getNumOperands();

  // Attempt to create a valid clear mask, splitting the mask into
  // sub elements and checking to see if each is
  // all zeros or all ones - suitable for shuffle masking.
  auto BuildClearMask = [&](int Split) {
    int NumSubElts = NumElts * Split;
    int NumSubBits = RVT.getScalarSizeInBits() / Split;

    SmallVector<int, 8> Indices;
    for (int i = 0; i != NumSubElts; ++i) {
      int EltIdx = i / Split;
      int SubIdx = i % Split;
      SDValue Elt = RHS.getOperand(EltIdx);
      // X & undef --> 0 (not undef). So this lane must be converted to choose
      // from the zero constant vector (same as if the element had all 0-bits).
      if (Elt.isUndef()) {
        Indices.push_back(i + NumSubElts);
        continue;
      }

      APInt Bits;
      if (isa<ConstantSDNode>(Elt))
        Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
      else if (isa<ConstantFPSDNode>(Elt))
        Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
      else
        return SDValue();

      // Extract the sub element from the constant bit mask.
      // Big-endian targets number sub-elements from the high end.
      if (DAG.getDataLayout().isBigEndian())
        Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
      else
        Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);

      // All-ones keeps the LHS lane; all-zeros selects from the zero vector;
      // anything else cannot be expressed as a shuffle.
      if (Bits.isAllOnesValue())
        Indices.push_back(i);
      else if (Bits == 0)
        Indices.push_back(i + NumSubElts);
      else
        return SDValue();
    }

    // Let's see if the target supports this vector_shuffle.
    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
    if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
      return SDValue();

    SDValue Zero = DAG.getConstant(0, DL, ClearVT);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
                                                   DAG.getBitcast(ClearVT, LHS),
                                                   Zero, Indices));
  };

  // Determine maximum split level (byte level masking).
  int MaxSplit = 1;
  if (RVT.getScalarSizeInBits() % 8 == 0)
    MaxSplit = RVT.getScalarSizeInBits() / 8;

  for (int Split = 1; Split <= MaxSplit; ++Split)
    if (RVT.getScalarSizeInBits() % Split == 0)
      if (SDValue S = BuildClearMask(Split))
        return S;

  return SDValue();
}

/// If a vector binop is performed on splat values, it may be profitable to
/// extract, scalarize, and insert/splat.
static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // TODO: Remove/replace the extract cost check? If the elements are available
  //       as scalars, then there may be no extract cost. Should we ask if
  //       inserting a scalar back into a vector is cheap instead?
  int Index0, Index1;
  SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
  SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
  // Both operands must splat the same lane of same-element-typed sources,
  // extraction must be cheap, and the scalar opcode must be supported.
  if (!Src0 || !Src1 || Index0 != Index1 ||
      Src0.getValueType().getVectorElementType() != EltVT ||
      Src1.getValueType().getVectorElementType() != EltVT ||
      !TLI.isExtractVecEltCheap(VT, Index0) ||
      !TLI.isOperationLegalOrCustom(Opcode, EltVT))
    return SDValue();

  SDLoc DL(N);
  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());

  // If all lanes but 1 are undefined, no need to splat the scalar result.
  // TODO: Keep track of undefs and use that info in the general case.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
      count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
      count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
    // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
    // build_vec ..undef, (bo X, Y), undef...
    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
    Ops[Index0] = ScalarBO;
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
  SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
  return DAG.getBuildVector(VT, DL, Ops);
}

/// Visit a binary vector operation, like ADD.
SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
  assert(N->getValueType(0).isVector() &&
         "SimplifyVBinOp only works on vectors!");

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Ops[] = {LHS, RHS};
  EVT VT = N->getValueType(0);
  unsigned Opcode = N->getOpcode();
  SDNodeFlags Flags = N->getFlags();

  // See if we can constant fold the vector operation.
  if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
          Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
    return Fold;

  // Move unary shuffles with identical masks after a vector binop:
  // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
  //   --> shuffle (VBinOp A, B), Undef, Mask
  // This does not require type legality checks because we are creating the
  // same types of operations that are in the original sequence. We do have to
  // restrict ops like integer div that have immediate UB (eg, div-by-zero)
  // though. This code is adapted from the identical transform in instcombine.
  if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
      Opcode != ISD::UREM && Opcode != ISD::SREM &&
      Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
    auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
    auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
    if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
        LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
        (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
      SDLoc DL(N);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
                                     RHS.getOperand(0), Flags);
      SDValue UndefV = LHS.getOperand(1);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
    }

    // Try to sink a splat shuffle after a binop with a uniform constant.
    // This is limited to cases where neither the shuffle nor the constant have
    // undefined elements because that could be poison-unsafe or inhibit
    // demanded elements analysis. It is further limited to not change a splat
    // of an inserted scalar because that may be optimized better by
    // load-folding or other target-specific behaviors.
    if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
        Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
        Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
      // binop (splat X), (splat C) --> splat (binop X, C)
      SDLoc DL(N);
      SDValue X = Shuf0->getOperand(0);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
                                  Shuf0->getMask());
    }
    if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
        Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
        Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
      // binop (splat C), (splat X) --> splat (binop C, X)
      SDLoc DL(N);
      SDValue X = Shuf1->getOperand(0);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
                                  Shuf1->getMask());
    }
  }

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of insertion may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
      LHS.getOperand(2) == RHS.getOperand(2) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    SDValue X = LHS.getOperand(1);
    SDValue Y = RHS.getOperand(1);
    SDValue Z = LHS.getOperand(2);
    EVT NarrowVT = X.getValueType();
    if (NarrowVT == Y.getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      // (binop undef, undef) may not return undef, so compute that result.
      SDLoc DL(N);
      SDValue VecC =
          DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
      SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
    }
  }

  // Make sure all but the first op are undef or constant.
  auto ConcatWithConstantOrUndef = [](SDValue Concat) {
    return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
           std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
                       [](const SDValue &Op) {
                         return Op.isUndef() ||
                                ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
                       });
  };

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of the concat may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
  //   concat (VBinOp X, Y), VecC
  if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    EVT NarrowVT = LHS.getOperand(0).getValueType();
    if (NarrowVT == RHS.getOperand(0).getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      SDLoc DL(N);
      unsigned NumOperands = LHS.getNumOperands();
      SmallVector<SDValue, 4> ConcatOps;
      for (unsigned i = 0; i != NumOperands; ++i) {
        // This constant-folds for operands 1 and up, since they are known to
        // be undef or constant.
        ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
                                        RHS.getOperand(i)));
      }

      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
    }
  }

  if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
    return V;

  return SDValue();
}

SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2) {
  assert(N0.getOpcode() == ISD::SETCC &&
         "First argument must be a SetCC node!");

  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());

  // If we got a simplified select_cc node back from SimplifySelectCC, then
  // break it down into a new SETCC node, and a new SELECT node, and then return
  // the SELECT node, since we were called with a SELECT node.
  if (SCC.getNode()) {
    // Check to see if we got a select_cc back (to turn into setcc/select).
    // Otherwise, just return whatever node we got back, like fabs.
    if (SCC.getOpcode() == ISD::SELECT_CC) {
      const SDNodeFlags Flags = N0.getNode()->getFlags();
      SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
                                  N0.getValueType(),
                                  SCC.getOperand(0), SCC.getOperand(1),
                                  SCC.getOperand(4), Flags);
      AddToWorklist(SETCC.getNode());
      SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
                                         SCC.getOperand(2), SCC.getOperand(3));
      SelectNode->setFlags(Flags);
      return SelectNode;
    }

    return SCC;
  }
  return SDValue();
}

/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
/// being selected between, see if we can simplify the select. Callers of this
/// should assume that TheSelect is deleted if this returns true. As such, they
/// should return the appropriate thing (e.g.
/// the node) back to the top-level of
/// the DAG combiner loop to avoid it being looked at.
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                                    SDValue RHS) {
  // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
  // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
  if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
    if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
      // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
      SDValue Sqrt = RHS;
      ISD::CondCode CC;
      SDValue CmpLHS;
      const ConstantFPSDNode *Zero = nullptr;

      if (TheSelect->getOpcode() == ISD::SELECT_CC) {
        CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
        CmpLHS = TheSelect->getOperand(0);
        Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
      } else {
        // SELECT or VSELECT
        SDValue Cmp = TheSelect->getOperand(0);
        if (Cmp.getOpcode() == ISD::SETCC) {
          CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
          CmpLHS = Cmp.getOperand(0);
          Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
        }
      }
      // NOTE: CC is only read when Zero is non-null, and every path that sets
      // Zero also sets CC, so CC is initialized on all paths where it is used.
      if (Zero && Zero->isZero() &&
          Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
          CC == ISD::SETULT || CC == ISD::SETLT)) {
        // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
        CombineTo(TheSelect, Sqrt);
        return true;
      }
    }
  }
  // Cannot simplify select with vector condition
  if (TheSelect->getOperand(0).getValueType().isVector()) return false;

  // If this is a select from two identical things, try to pull the operation
  // through the select.
  if (LHS.getOpcode() != RHS.getOpcode() ||
      !LHS.hasOneUse() || !RHS.hasOneUse())
    return false;

  // If this is a load and the token chain is identical, replace the select
  // of two loads with a load through a select of the address to load from.
  // This triggers in things like "select bool X, 10.0, 123.0" after the FP
  // constants have been dropped into the constant pool.
  if (LHS.getOpcode() == ISD::LOAD) {
    LoadSDNode *LLD = cast<LoadSDNode>(LHS);
    LoadSDNode *RLD = cast<LoadSDNode>(RHS);

    // Token chains must be identical.
    if (LHS.getOperand(0) != RHS.getOperand(0) ||
        // Do not let this transformation reduce the number of volatile loads.
        // Be conservative for atomics for the moment
        // TODO: This does appear to be legal for unordered atomics (see D66309)
        !LLD->isSimple() || !RLD->isSimple() ||
        // FIXME: If either is a pre/post inc/dec load,
        // we'd need to split out the address adjustment.
        LLD->isIndexed() || RLD->isIndexed() ||
        // If this is an EXTLOAD, the VT's must match.
        LLD->getMemoryVT() != RLD->getMemoryVT() ||
        // If this is an EXTLOAD, the kind of extension must match.
        (LLD->getExtensionType() != RLD->getExtensionType() &&
         // The only exception is if one of the extensions is anyext.
         LLD->getExtensionType() != ISD::EXTLOAD &&
         RLD->getExtensionType() != ISD::EXTLOAD) ||
        // FIXME: this discards src value information. This is
        // over-conservative. It would be beneficial to be able to remember
        // both potential memory locations. Since we are discarding
        // src value info, don't do the transformation if the memory
        // locations are not in the default address space.
        LLD->getPointerInfo().getAddrSpace() != 0 ||
        RLD->getPointerInfo().getAddrSpace() != 0 ||
        // We can't produce a CMOV of a TargetFrameIndex since we won't
        // generate the address generation required.
        LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
        RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
        !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
                                      LLD->getBasePtr().getValueType()))
      return false;

    // The loads must not depend on one another.
    if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
      return false;

    // Check that the select condition doesn't reach either load. If so,
    // folding this will induce a cycle into the DAG. If not, this is safe to
    // xform, so create a select of the addresses.

    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;

    // Always fail if LLD and RLD are not independent. TheSelect is a
    // predecessor to all Nodes in question so we need not search past it.

    Visited.insert(TheSelect);
    Worklist.push_back(LLD);
    Worklist.push_back(RLD);

    if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
      return false;

    SDValue Addr;
    if (TheSelect->getOpcode() == ISD::SELECT) {
      // We cannot do this optimization if any pair of {RLD, LLD} is a
      // predecessor to {RLD, LLD, CondNode}. As we've already compared the
      // Loads, we only need to check if CondNode is a successor to one of the
      // loads. We can further avoid this if there's no use of their chain
      // value.
      SDNode *CondNode = TheSelect->getOperand(0).getNode();
      Worklist.push_back(CondNode);

      if ((LLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
          (RLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
        return false;

      Addr = DAG.getSelect(SDLoc(TheSelect),
                           LLD->getBasePtr().getValueType(),
                           TheSelect->getOperand(0), LLD->getBasePtr(),
                           RLD->getBasePtr());
    } else {  // Otherwise SELECT_CC
      // We cannot do this optimization if any pair of {RLD, LLD} is a
      // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
      // the Loads, we only need to check if CondLHS/CondRHS is a successor to
      // one of the loads. We can further avoid this if there's no use of their
      // chain value.

      SDNode *CondLHS = TheSelect->getOperand(0).getNode();
      SDNode *CondRHS = TheSelect->getOperand(1).getNode();
      Worklist.push_back(CondLHS);
      Worklist.push_back(CondRHS);

      if ((LLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
          (RLD->hasAnyUseOfValue(1) &&
           SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
        return false;

      Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
                         LLD->getBasePtr().getValueType(),
                         TheSelect->getOperand(0),
                         TheSelect->getOperand(1),
                         LLD->getBasePtr(), RLD->getBasePtr(),
                         TheSelect->getOperand(4));
    }

    SDValue Load;
    // It is safe to replace the two loads if they have different alignments,
    // but the new load must be the minimum (most restrictive) alignment of the
    // inputs.
    unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
    MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
    // The merged load is only invariant/dereferenceable if both inputs are.
    if (!RLD->isInvariant())
      MMOFlags &= ~MachineMemOperand::MOInvariant;
    if (!RLD->isDereferenceable())
      MMOFlags &= ~MachineMemOperand::MODereferenceable;
    if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
      // FIXME: Discards pointer and AA info.
      Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
                         MMOFlags);
    } else {
      // FIXME: Discards pointer and AA info.
      // If one side is anyext, the other side's (matching-VT) extension kind
      // wins.
      Load = DAG.getExtLoad(
          LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                  : LLD->getExtensionType(),
          SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
    }

    // Users of the select now use the result of the load.
    CombineTo(TheSelect, Load);

    // Users of the old loads now use the new load's chain. We know the
    // old-load value is dead now.
    CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
    CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
    return true;
  }

  return false;
}

/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
/// bitwise 'and'.
21146 SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, 21147 SDValue N1, SDValue N2, SDValue N3, 21148 ISD::CondCode CC) { 21149 // If this is a select where the false operand is zero and the compare is a 21150 // check of the sign bit, see if we can perform the "gzip trick": 21151 // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A 21152 // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A 21153 EVT XType = N0.getValueType(); 21154 EVT AType = N2.getValueType(); 21155 if (!isNullConstant(N3) || !XType.bitsGE(AType)) 21156 return SDValue(); 21157 21158 // If the comparison is testing for a positive value, we have to invert 21159 // the sign bit mask, so only do that transform if the target has a bitwise 21160 // 'and not' instruction (the invert is free). 21161 if (CC == ISD::SETGT && TLI.hasAndNot(N2)) { 21162 // (X > -1) ? A : 0 21163 // (X > 0) ? X : 0 <-- This is canonical signed max. 21164 if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2))) 21165 return SDValue(); 21166 } else if (CC == ISD::SETLT) { 21167 // (X < 0) ? A : 0 21168 // (X < 1) ? X : 0 <-- This is un-canonicalized signed min. 21169 if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2))) 21170 return SDValue(); 21171 } else { 21172 return SDValue(); 21173 } 21174 21175 // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit 21176 // constant. 
21177 EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); 21178 auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); 21179 if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) { 21180 unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1; 21181 if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) { 21182 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); 21183 SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); 21184 AddToWorklist(Shift.getNode()); 21185 21186 if (XType.bitsGT(AType)) { 21187 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); 21188 AddToWorklist(Shift.getNode()); 21189 } 21190 21191 if (CC == ISD::SETGT) 21192 Shift = DAG.getNOT(DL, Shift, AType); 21193 21194 return DAG.getNode(ISD::AND, DL, AType, Shift, N2); 21195 } 21196 } 21197 21198 unsigned ShCt = XType.getSizeInBits() - 1; 21199 if (TLI.shouldAvoidTransformToShift(XType, ShCt)) 21200 return SDValue(); 21201 21202 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); 21203 SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt); 21204 AddToWorklist(Shift.getNode()); 21205 21206 if (XType.bitsGT(AType)) { 21207 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); 21208 AddToWorklist(Shift.getNode()); 21209 } 21210 21211 if (CC == ISD::SETGT) 21212 Shift = DAG.getNOT(DL, Shift, AType); 21213 21214 return DAG.getNode(ISD::AND, DL, AType, Shift, N2); 21215 } 21216 21217 /// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" 21218 /// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 21219 /// in it. This may be a win when the constant is not otherwise available 21220 /// because it replaces two constant pool loads with one. 
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
    const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
    ISD::CondCode CC) {
  if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
    return SDValue();

  // If we are before legalize types, we want the other legalization to happen
  // first (for example, to avoid messing with soft float).
  auto *TV = dyn_cast<ConstantFPSDNode>(N2);
  auto *FV = dyn_cast<ConstantFPSDNode>(N3);
  EVT VT = N2.getValueType();
  if (!TV || !FV || !TLI.isTypeLegal(VT))
    return SDValue();

  // If a constant can be materialized without loads, this does not make sense.
  if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
      TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
      TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
    return SDValue();

  // If both constants have multiple uses, then we won't need to do an extra
  // load. The values are likely around in registers for other users.
  if (!TV->hasOneUse() && !FV->hasOneUse())
    return SDValue();

  // The false value goes in element 0 and the true value in element 1, so a
  // "true" condition must select the offset of element 1 below.
  Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
                       const_cast<ConstantFP*>(TV->getConstantFPValue()) };
  Type *FPTy = Elts[0]->getType();
  const DataLayout &TD = DAG.getDataLayout();

  // Create a ConstantArray of the two constants.
  Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
  SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
                                      TD.getPrefTypeAlign(FPTy));
  Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();

  // Get offsets to the 0 and 1 elements of the array, so we can select between
  // them.
  SDValue Zero = DAG.getIntPtrConstant(0, DL);
  unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
  SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
  SDValue Cond =
      DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
  AddToWorklist(Cond.getNode());
  SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
  AddToWorklist(CstOffset.getNode());
  CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
  AddToWorklist(CPIdx.getNode());
  return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
                     MachinePointerInfo::getConstantPool(
                         DAG.getMachineFunction()), Alignment);
}

/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                                      SDValue N2, SDValue N3, ISD::CondCode CC,
                                      bool NotExtCompare) {
  // (x ? y : y) -> y.
  if (N2 == N3) return N2;

  EVT CmpOpVT = N0.getValueType();
  EVT CmpResVT = getSetCCResultType(CmpOpVT);
  EVT VT = N2.getValueType();
  auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
  auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());

  // Determine if the condition we're dealing with is constant.
  if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
    AddToWorklist(SCC.getNode());
    if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
      // fold select_cc true, x, y -> x
      // fold select_cc false, x, y -> y
      return !(SCCC->isNullValue()) ? N2 : N3;
    }
  }

  if (SDValue V =
          convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
    return V;

  if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
    return V;

  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
  // where y has a single bit set.
  // A plaintext description would be, we can turn the SELECT_CC into an AND
  // when the condition can be materialized as an all-ones register. Any
  // single bit-test can be materialized as an all-ones register with
  // shift-left and shift-right-arith.
  if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
      N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
    SDValue AndLHS = N0->getOperand(0);
    auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
      // Shift the tested bit over the sign bit.
      const APInt &AndMask = ConstAndRHS->getAPIntValue();
      unsigned ShCt = AndMask.getBitWidth() - 1;
      if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
        SDValue ShlAmt =
            DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
                            getShiftAmountTy(AndLHS.getValueType()));
        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

        // Now arithmetic right shift it all the way over, so the result is
        // either all-ones, or zero.
        SDValue ShrAmt =
            DAG.getConstant(ShCt, SDLoc(Shl),
                            getShiftAmountTy(Shl.getValueType()));
        SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

        return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
      }
    }
  }

  // fold select C, 16, 0 -> shl C, 4
  bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
  bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();

  if ((Fold || Swap) &&
      TLI.getBooleanContents(CmpOpVT) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {

    // Canonicalize to the "true operand is the power of 2" form by inverting
    // the condition.
    if (Swap) {
      CC = ISD::getSetCCInverse(CC, CmpOpVT);
      std::swap(N2C, N3C);
    }

    // If the caller doesn't want us to simplify this into a zext of a compare,
    // don't do it.
    if (NotExtCompare && N2C->isOne())
      return SDValue();

    SDValue Temp, SCC;
    // zext (setcc n0, n1)
    if (LegalTypes) {
      SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
      if (VT.bitsLT(SCC.getValueType()))
        Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
      else
        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    } else {
      SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
      Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    }

    AddToWorklist(SCC.getNode());
    AddToWorklist(Temp.getNode());

    if (N2C->isOne())
      return Temp;

    unsigned ShCt = N2C->getAPIntValue().logBase2();
    if (TLI.shouldAvoidTransformToShift(VT, ShCt))
      return SDValue();

    // shl setcc result by log2 n2c
    return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
                       DAG.getConstant(ShCt, SDLoc(Temp),
                                       getShiftAmountTy(Temp.getValueType())));
  }

  // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
  // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
  // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
  // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
  if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue ValueOnZero = N2;
    SDValue Count = N3;
    // If the condition is NE instead of E, swap the operands.
    if (CC == ISD::SETNE)
      std::swap(ValueOnZero, Count);
    // Check if the value on zero is a constant equal to the bits in the type.
    if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
      if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
        // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
        // legal, combine to just cttz.
        if ((Count.getOpcode() == ISD::CTTZ ||
             Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
          return DAG.getNode(ISD::CTTZ, DL, VT, N0);
        // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
        // legal, combine to just ctlz.
        if ((Count.getOpcode() == ISD::CTLZ ||
             Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
          return DAG.getNode(ISD::CTLZ, DL, VT, N0);
      }
    }
  }

  return SDValue();
}

/// This is a stub for TargetLowering::SimplifySetCC.
21425 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, 21426 ISD::CondCode Cond, const SDLoc &DL, 21427 bool foldBooleans) { 21428 TargetLowering::DAGCombinerInfo 21429 DagCombineInfo(DAG, Level, false, this); 21430 return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); 21431 } 21432 21433 /// Given an ISD::SDIV node expressing a divide by constant, return 21434 /// a DAG expression to select that will generate the same value by multiplying 21435 /// by a magic number. 21436 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 21437 SDValue DAGCombiner::BuildSDIV(SDNode *N) { 21438 // when optimising for minimum size, we don't want to expand a div to a mul 21439 // and a shift. 21440 if (DAG.getMachineFunction().getFunction().hasMinSize()) 21441 return SDValue(); 21442 21443 SmallVector<SDNode *, 8> Built; 21444 if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) { 21445 for (SDNode *N : Built) 21446 AddToWorklist(N); 21447 return S; 21448 } 21449 21450 return SDValue(); 21451 } 21452 21453 /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a 21454 /// DAG expression that will generate the same value by right shifting. 21455 SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { 21456 ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); 21457 if (!C) 21458 return SDValue(); 21459 21460 // Avoid division by zero. 21461 if (C->isNullValue()) 21462 return SDValue(); 21463 21464 SmallVector<SDNode *, 8> Built; 21465 if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) { 21466 for (SDNode *N : Built) 21467 AddToWorklist(N); 21468 return S; 21469 } 21470 21471 return SDValue(); 21472 } 21473 21474 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG 21475 /// expression that will generate the same value by multiplying by a magic 21476 /// number. 21477 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". 
21478 SDValue DAGCombiner::BuildUDIV(SDNode *N) { 21479 // when optimising for minimum size, we don't want to expand a div to a mul 21480 // and a shift. 21481 if (DAG.getMachineFunction().getFunction().hasMinSize()) 21482 return SDValue(); 21483 21484 SmallVector<SDNode *, 8> Built; 21485 if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) { 21486 for (SDNode *N : Built) 21487 AddToWorklist(N); 21488 return S; 21489 } 21490 21491 return SDValue(); 21492 } 21493 21494 /// Determines the LogBase2 value for a non-null input value using the 21495 /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). 21496 SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { 21497 EVT VT = V.getValueType(); 21498 unsigned EltBits = VT.getScalarSizeInBits(); 21499 SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); 21500 SDValue Base = DAG.getConstant(EltBits - 1, DL, VT); 21501 SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); 21502 return LogBase2; 21503 } 21504 21505 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 21506 /// For the reciprocal, we need to find the zero of the function: 21507 /// F(X) = A X - 1 [which has a zero at X = 1/A] 21508 /// => 21509 /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form 21510 /// does not require additional intermediate precision] 21511 /// For the last iteration, put numerator N into it to gain more precision: 21512 /// Result = N X_i + X_i (N - N A X_i) 21513 SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op, 21514 SDNodeFlags Flags) { 21515 if (LegalDAG) 21516 return SDValue(); 21517 21518 // TODO: Handle half and/or extended types? 21519 EVT VT = Op.getValueType(); 21520 if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) 21521 return SDValue(); 21522 21523 // If estimates are explicitly disabled for this function, we're done. 
21524 MachineFunction &MF = DAG.getMachineFunction(); 21525 int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF); 21526 if (Enabled == TLI.ReciprocalEstimate::Disabled) 21527 return SDValue(); 21528 21529 // Estimates may be explicitly enabled for this type with a custom number of 21530 // refinement steps. 21531 int Iterations = TLI.getDivRefinementSteps(VT, MF); 21532 if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { 21533 AddToWorklist(Est.getNode()); 21534 21535 SDLoc DL(Op); 21536 if (Iterations) { 21537 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); 21538 21539 // Newton iterations: Est = Est + Est (N - Arg * Est) 21540 // If this is the last iteration, also multiply by the numerator. 21541 for (int i = 0; i < Iterations; ++i) { 21542 SDValue MulEst = Est; 21543 21544 if (i == Iterations - 1) { 21545 MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags); 21546 AddToWorklist(MulEst.getNode()); 21547 } 21548 21549 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags); 21550 AddToWorklist(NewEst.getNode()); 21551 21552 NewEst = DAG.getNode(ISD::FSUB, DL, VT, 21553 (i == Iterations - 1 ? N : FPOne), NewEst, Flags); 21554 AddToWorklist(NewEst.getNode()); 21555 21556 NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); 21557 AddToWorklist(NewEst.getNode()); 21558 21559 Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags); 21560 AddToWorklist(Est.getNode()); 21561 } 21562 } else { 21563 // If no iterations are available, multiply with N. 
21564 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags); 21565 AddToWorklist(Est.getNode()); 21566 } 21567 21568 return Est; 21569 } 21570 21571 return SDValue(); 21572 } 21573 21574 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 21575 /// For the reciprocal sqrt, we need to find the zero of the function: 21576 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] 21577 /// => 21578 /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) 21579 /// As a result, we precompute A/2 prior to the iteration loop. 21580 SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, 21581 unsigned Iterations, 21582 SDNodeFlags Flags, bool Reciprocal) { 21583 EVT VT = Arg.getValueType(); 21584 SDLoc DL(Arg); 21585 SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); 21586 21587 // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that 21588 // this entire sequence requires only one FP constant. 21589 SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); 21590 HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); 21591 21592 // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) 21593 for (unsigned i = 0; i < Iterations; ++i) { 21594 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); 21595 NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); 21596 NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); 21597 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); 21598 } 21599 21600 // If non-reciprocal square root is requested, multiply the result by Arg. 
21601 if (!Reciprocal) 21602 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); 21603 21604 return Est; 21605 } 21606 21607 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) 21608 /// For the reciprocal sqrt, we need to find the zero of the function: 21609 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] 21610 /// => 21611 /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) 21612 SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, 21613 unsigned Iterations, 21614 SDNodeFlags Flags, bool Reciprocal) { 21615 EVT VT = Arg.getValueType(); 21616 SDLoc DL(Arg); 21617 SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); 21618 SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT); 21619 21620 // This routine must enter the loop below to work correctly 21621 // when (Reciprocal == false). 21622 assert(Iterations > 0); 21623 21624 // Newton iterations for reciprocal square root: 21625 // E = (E * -0.5) * ((A * E) * E + -3.0) 21626 for (unsigned i = 0; i < Iterations; ++i) { 21627 SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags); 21628 SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags); 21629 SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags); 21630 21631 // When calculating a square root at the last iteration build: 21632 // S = ((A * E) * -0.5) * ((A * E) * E + -3.0) 21633 // (notice a common subexpression) 21634 SDValue LHS; 21635 if (Reciprocal || (i + 1) < Iterations) { 21636 // RSQRT: LHS = (E * -0.5) 21637 LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); 21638 } else { 21639 // SQRT: LHS = (A * E) * -0.5 21640 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags); 21641 } 21642 21643 Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags); 21644 } 21645 21646 return Est; 21647 } 21648 21649 /// Build code to calculate either rsqrt(Op) or sqrt(Op). 
In the latter case 21650 /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if 21651 /// Op can be zero. 21652 SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, 21653 bool Reciprocal) { 21654 if (LegalDAG) 21655 return SDValue(); 21656 21657 // TODO: Handle half and/or extended types? 21658 EVT VT = Op.getValueType(); 21659 if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) 21660 return SDValue(); 21661 21662 // If estimates are explicitly disabled for this function, we're done. 21663 MachineFunction &MF = DAG.getMachineFunction(); 21664 int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF); 21665 if (Enabled == TLI.ReciprocalEstimate::Disabled) 21666 return SDValue(); 21667 21668 // Estimates may be explicitly enabled for this type with a custom number of 21669 // refinement steps. 21670 int Iterations = TLI.getSqrtRefinementSteps(VT, MF); 21671 21672 bool UseOneConstNR = false; 21673 if (SDValue Est = 21674 TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR, 21675 Reciprocal)) { 21676 AddToWorklist(Est.getNode()); 21677 21678 if (Iterations) { 21679 Est = UseOneConstNR 21680 ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) 21681 : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); 21682 21683 if (!Reciprocal) { 21684 // The estimate is now completely wrong if the input was exactly 0.0 or 21685 // possibly a denormal. Force the answer to 0.0 for those cases. 21686 SDLoc DL(Op); 21687 EVT CCVT = getSetCCResultType(VT); 21688 ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; 21689 DenormalMode DenormMode = DAG.getDenormalMode(VT); 21690 if (DenormMode.Input == DenormalMode::IEEE) { 21691 // This is specifically a check for the handling of denormal inputs, 21692 // not the result. 21693 21694 // fabs(X) < SmallestNormal ? 
0.0 : Est 21695 const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); 21696 APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); 21697 SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); 21698 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 21699 SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); 21700 SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); 21701 Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); 21702 } else { 21703 // X == 0.0 ? 0.0 : Est 21704 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 21705 SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); 21706 Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); 21707 } 21708 } 21709 } 21710 return Est; 21711 } 21712 21713 return SDValue(); 21714 } 21715 21716 SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { 21717 return buildSqrtEstimateImpl(Op, Flags, true); 21718 } 21719 21720 SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { 21721 return buildSqrtEstimateImpl(Op, Flags, false); 21722 } 21723 21724 /// Return true if there is any possibility that the two addresses overlap. 21725 bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { 21726 21727 struct MemUseCharacteristics { 21728 bool IsVolatile; 21729 bool IsAtomic; 21730 SDValue BasePtr; 21731 int64_t Offset; 21732 Optional<int64_t> NumBytes; 21733 MachineMemOperand *MMO; 21734 }; 21735 21736 auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { 21737 if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { 21738 int64_t Offset = 0; 21739 if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) 21740 Offset = (LSN->getAddressingMode() == ISD::PRE_INC) 21741 ? C->getSExtValue() 21742 : (LSN->getAddressingMode() == ISD::PRE_DEC) 21743 ? 
-1 * C->getSExtValue() 21744 : 0; 21745 uint64_t Size = 21746 MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize()); 21747 return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(), 21748 Offset /*base offset*/, 21749 Optional<int64_t>(Size), 21750 LSN->getMemOperand()}; 21751 } 21752 if (const auto *LN = cast<LifetimeSDNode>(N)) 21753 return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), 21754 (LN->hasOffset()) ? LN->getOffset() : 0, 21755 (LN->hasOffset()) ? Optional<int64_t>(LN->getSize()) 21756 : Optional<int64_t>(), 21757 (MachineMemOperand *)nullptr}; 21758 // Default. 21759 return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(), 21760 (int64_t)0 /*offset*/, 21761 Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr}; 21762 }; 21763 21764 MemUseCharacteristics MUC0 = getCharacteristics(Op0), 21765 MUC1 = getCharacteristics(Op1); 21766 21767 // If they are to the same address, then they must be aliases. 21768 if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr && 21769 MUC0.Offset == MUC1.Offset) 21770 return true; 21771 21772 // If they are both volatile then they cannot be reordered. 21773 if (MUC0.IsVolatile && MUC1.IsVolatile) 21774 return true; 21775 21776 // Be conservative about atomics for the moment 21777 // TODO: This is way overconservative for unordered atomics (see D66309) 21778 if (MUC0.IsAtomic && MUC1.IsAtomic) 21779 return true; 21780 21781 if (MUC0.MMO && MUC1.MMO) { 21782 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) || 21783 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore())) 21784 return false; 21785 } 21786 21787 // Try to prove that there is aliasing, or that there is no aliasing. Either 21788 // way, we can return now. If nothing can be proved, proceed with more tests. 21789 bool IsAlias; 21790 if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes, 21791 DAG, IsAlias)) 21792 return IsAlias; 21793 21794 // The following all rely on MMO0 and MMO1 being valid. 
Fail conservatively if 21795 // either are not known. 21796 if (!MUC0.MMO || !MUC1.MMO) 21797 return true; 21798 21799 // If one operation reads from invariant memory, and the other may store, they 21800 // cannot alias. These should really be checking the equivalent of mayWrite, 21801 // but it only matters for memory nodes other than load /store. 21802 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) || 21803 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore())) 21804 return false; 21805 21806 // If we know required SrcValue1 and SrcValue2 have relatively large 21807 // alignment compared to the size and offset of the access, we may be able 21808 // to prove they do not alias. This check is conservative for now to catch 21809 // cases created by splitting vector types, it only works when the offsets are 21810 // multiples of the size of the data. 21811 int64_t SrcValOffset0 = MUC0.MMO->getOffset(); 21812 int64_t SrcValOffset1 = MUC1.MMO->getOffset(); 21813 Align OrigAlignment0 = MUC0.MMO->getBaseAlign(); 21814 Align OrigAlignment1 = MUC1.MMO->getBaseAlign(); 21815 auto &Size0 = MUC0.NumBytes; 21816 auto &Size1 = MUC1.NumBytes; 21817 if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && 21818 Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 && 21819 OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 && 21820 SrcValOffset1 % *Size1 == 0) { 21821 int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value(); 21822 int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value(); 21823 21824 // There is no overlap between these relatively aligned accesses of 21825 // similar size. Return no alias. 21826 if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0) 21827 return false; 21828 } 21829 21830 bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 21831 ? 
CombinerGlobalAA 21832 : DAG.getSubtarget().useAA(); 21833 #ifndef NDEBUG 21834 if (CombinerAAOnlyFunc.getNumOccurrences() && 21835 CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) 21836 UseAA = false; 21837 #endif 21838 21839 if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && 21840 Size0.hasValue() && Size1.hasValue()) { 21841 // Use alias analysis information. 21842 int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); 21843 int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset; 21844 int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset; 21845 AliasResult AAResult = AA->alias( 21846 MemoryLocation(MUC0.MMO->getValue(), Overlap0, 21847 UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()), 21848 MemoryLocation(MUC1.MMO->getValue(), Overlap1, 21849 UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes())); 21850 if (AAResult == NoAlias) 21851 return false; 21852 } 21853 21854 // Otherwise we have to assume they alias. 21855 return true; 21856 } 21857 21858 /// Walk up chain skipping non-aliasing memory nodes, 21859 /// looking for aliasing nodes and adding them to the Aliases vector. 21860 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, 21861 SmallVectorImpl<SDValue> &Aliases) { 21862 SmallVector<SDValue, 8> Chains; // List of chains to visit. 21863 SmallPtrSet<SDNode *, 16> Visited; // Visited node set. 21864 21865 // Get alias information for node. 21866 // TODO: relax aliasing for unordered atomics (see D66309) 21867 const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple(); 21868 21869 // Starting off. 21870 Chains.push_back(OriginalChain); 21871 unsigned Depth = 0; 21872 21873 // Attempt to improve chain by a single step 21874 std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool { 21875 switch (C.getOpcode()) { 21876 case ISD::EntryToken: 21877 // No need to mark EntryToken. 
21878 C = SDValue(); 21879 return true; 21880 case ISD::LOAD: 21881 case ISD::STORE: { 21882 // Get alias information for C. 21883 // TODO: Relax aliasing for unordered atomics (see D66309) 21884 bool IsOpLoad = isa<LoadSDNode>(C.getNode()) && 21885 cast<LSBaseSDNode>(C.getNode())->isSimple(); 21886 if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) { 21887 // Look further up the chain. 21888 C = C.getOperand(0); 21889 return true; 21890 } 21891 // Alias, so stop here. 21892 return false; 21893 } 21894 21895 case ISD::CopyFromReg: 21896 // Always forward past past CopyFromReg. 21897 C = C.getOperand(0); 21898 return true; 21899 21900 case ISD::LIFETIME_START: 21901 case ISD::LIFETIME_END: { 21902 // We can forward past any lifetime start/end that can be proven not to 21903 // alias the memory access. 21904 if (!isAlias(N, C.getNode())) { 21905 // Look further up the chain. 21906 C = C.getOperand(0); 21907 return true; 21908 } 21909 return false; 21910 } 21911 default: 21912 return false; 21913 } 21914 }; 21915 21916 // Look at each chain and determine if it is an alias. If so, add it to the 21917 // aliases list. If not, then continue up the chain looking for the next 21918 // candidate. 21919 while (!Chains.empty()) { 21920 SDValue Chain = Chains.pop_back_val(); 21921 21922 // Don't bother if we've seen Chain before. 21923 if (!Visited.insert(Chain.getNode()).second) 21924 continue; 21925 21926 // For TokenFactor nodes, look at each operand and only continue up the 21927 // chain until we reach the depth limit. 21928 // 21929 // FIXME: The depth check could be made to return the last non-aliasing 21930 // chain we found before we hit a tokenfactor rather than the original 21931 // chain. 
21932 if (Depth > TLI.getGatherAllAliasesMaxDepth()) { 21933 Aliases.clear(); 21934 Aliases.push_back(OriginalChain); 21935 return; 21936 } 21937 21938 if (Chain.getOpcode() == ISD::TokenFactor) { 21939 // We have to check each of the operands of the token factor for "small" 21940 // token factors, so we queue them up. Adding the operands to the queue 21941 // (stack) in reverse order maintains the original order and increases the 21942 // likelihood that getNode will find a matching token factor (CSE.) 21943 if (Chain.getNumOperands() > 16) { 21944 Aliases.push_back(Chain); 21945 continue; 21946 } 21947 for (unsigned n = Chain.getNumOperands(); n;) 21948 Chains.push_back(Chain.getOperand(--n)); 21949 ++Depth; 21950 continue; 21951 } 21952 // Everything else 21953 if (ImproveChain(Chain)) { 21954 // Updated Chain Found, Consider new chain if one exists. 21955 if (Chain.getNode()) 21956 Chains.push_back(Chain); 21957 ++Depth; 21958 continue; 21959 } 21960 // No Improved Chain Possible, treat as Alias. 21961 Aliases.push_back(Chain); 21962 } 21963 } 21964 21965 /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain 21966 /// (aliasing node.) 21967 SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { 21968 if (OptLevel == CodeGenOpt::None) 21969 return OldChain; 21970 21971 // Ops for replacing token factor. 21972 SmallVector<SDValue, 8> Aliases; 21973 21974 // Accumulate all the aliases to this node. 21975 GatherAllAliases(N, OldChain, Aliases); 21976 21977 // If no operands then chain to entry token. 21978 if (Aliases.size() == 0) 21979 return DAG.getEntryNode(); 21980 21981 // If a single operand then chain to it. We don't need to revisit it. 21982 if (Aliases.size() == 1) 21983 return Aliases[0]; 21984 21985 // Construct a custom tailored token factor. 21986 return DAG.getTokenFactor(SDLoc(N), Aliases); 21987 } 21988 21989 namespace { 21990 // TODO: Replace with with std::monostate when we move to C++17. 
// Unit-like empty payload for IntervalMap: we only care about interval
// coverage, not any mapped value.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }
} // namespace

// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.

bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
  SmallVector<StoreSDNode *, 8> ChainedStores;
  StoreSDNode *STChain = St;
  // Intervals records which offsets from BaseIndex have been covered. In
  // the common case, every store writes to the immediately previous address
  // space and thus merged with the previous interval at insertion time.

  using IMap =
      llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
  IMap::Allocator A;
  IMap Intervals(A);

  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // BaseIndexOffset assumes that offsets are fixed-size, which
  // is not valid for scalable vectors where the offsets are
  // scaled by `vscale`, so bail out early.
  if (St->getMemoryVT().isScalableVector())
    return false;

  // Add ST's interval. Width is rounded up to whole bytes.
  Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);

  // Walk up the chain collecting further stores to the same base that do not
  // overlap anything already collected.
  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
    // If the chain has more than one use, then we can't reorder the mem ops.
    if (!SDValue(Chain, 0)->hasOneUse())
      break;
    // TODO: Relax for unordered atomics (see D66309)
    if (!Chain->isSimple() || Chain->isIndexed())
      break;

    // Find the base pointer and offset for this memory node.
    const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
    // Check that the base pointer is the same as the original one.
    int64_t Offset;
    if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
      break;
    int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
    // Make sure we don't overlap with other intervals by checking the ones to
    // the left or right before inserting.
    auto I = Intervals.find(Offset);
    // If there's a next interval, we should end before it.
    if (I != Intervals.end() && I.start() < (Offset + Length))
      break;
    // If there's a previous interval, we should start after it.
    if (I != Intervals.begin() && (--I).stop() <= Offset)
      break;
    Intervals.insert(Offset, Offset + Length, Unit);

    ChainedStores.push_back(Chain);
    STChain = Chain;
  }

  // If we didn't find a chained store, exit.
  if (ChainedStores.size() == 0)
    return false;

  // Improve all chained stores (St and ChainedStores members) starting from
  // where the store chain ended and return single TokenFactor.
  SDValue NewChain = STChain->getChain();
  SmallVector<SDValue, 8> TFOps;
  // Rewire each collected store (oldest first) to the best chain we can find.
  for (unsigned I = ChainedStores.size(); I;) {
    StoreSDNode *S = ChainedStores[--I];
    SDValue BetterChain = FindBetterChain(S, NewChain);
    S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
        S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
    TFOps.push_back(SDValue(S, 0));
    ChainedStores[I] = S;
  }

  // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
  SDValue BetterChain = FindBetterChain(St, NewChain);
  SDValue NewST;
  if (St->isTruncatingStore())
    NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
                              St->getBasePtr(), St->getMemoryVT(),
                              St->getMemOperand());
  else
    NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
                         St->getBasePtr(), St->getMemOperand());

  TFOps.push_back(NewST);

  // If we improved every element of TFOps, then we've lost the dependence on
  // NewChain to successors of St and we need to add it back to TFOps. Do so at
  // the beginning to keep relative order consistent with FindBetterChains.
  auto hasImprovedChain = [&](SDValue ST) -> bool {
    return ST->getOperand(0) != NewChain;
  };
  bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
  if (AddNewChain)
    TFOps.insert(TFOps.begin(), NewChain);

  SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
  CombineTo(St, TF);

  // Add TF and its operands to the worklist.
  AddToWorklist(TF.getNode());
  for (const SDValue &Op : TF->ops())
    AddToWorklist(Op.getNode());
  AddToWorklist(STChain);
  return true;
}

/// Try to improve the chain feeding store \p St (and, via
/// parallelizeChainedStores, a whole run of preceding stores) so independent
/// memory operations are not needlessly serialized. Returns true if the DAG
/// was changed.
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
  // Chain rewriting is a pure optimization; skip it entirely at -O0.
  if (OptLevel == CodeGenOpt::None)
    return false;

  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // Directly improve a chain of disjoint stores starting at St.
  if (parallelizeChainedStores(St))
    return true;

  // Improve St's Chain..
  SDValue BetterChain = FindBetterChain(St, St->getChain());
  if (St->getChain() != BetterChain) {
    replaceStoreChain(St, BetterChain);
    return true;
  }
  return false;
}

/// This is the entry point for the file.
void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
                           CodeGenOpt::Level OptLevel) {
  /// This is the main entry point to this class.
  DAGCombiner(*this, AA, OptLevel).Run(Level);
}