1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that ARM uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "ARMISelLowering.h" 15 #include "ARMBaseInstrInfo.h" 16 #include "ARMBaseRegisterInfo.h" 17 #include "ARMCallingConv.h" 18 #include "ARMConstantPoolValue.h" 19 #include "ARMMachineFunctionInfo.h" 20 #include "ARMPerfectShuffle.h" 21 #include "ARMRegisterInfo.h" 22 #include "ARMSelectionDAGInfo.h" 23 #include "ARMSubtarget.h" 24 #include "MCTargetDesc/ARMAddressingModes.h" 25 #include "MCTargetDesc/ARMBaseInfo.h" 26 #include "Utils/ARMBaseInfo.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/APInt.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/BitVector.h" 31 #include "llvm/ADT/DenseMap.h" 32 #include "llvm/ADT/STLExtras.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringExtras.h" 37 #include "llvm/ADT/StringRef.h" 38 #include "llvm/ADT/StringSwitch.h" 39 #include "llvm/ADT/Triple.h" 40 #include "llvm/ADT/Twine.h" 41 #include "llvm/Analysis/VectorUtils.h" 42 #include "llvm/CodeGen/CallingConvLower.h" 43 #include "llvm/CodeGen/ISDOpcodes.h" 44 #include "llvm/CodeGen/IntrinsicLowering.h" 45 #include "llvm/CodeGen/MachineBasicBlock.h" 46 #include "llvm/CodeGen/MachineConstantPool.h" 47 #include "llvm/CodeGen/MachineFrameInfo.h" 48 #include "llvm/CodeGen/MachineFunction.h" 49 #include "llvm/CodeGen/MachineInstr.h" 50 #include "llvm/CodeGen/MachineInstrBuilder.h" 51 #include 
"llvm/CodeGen/MachineJumpTableInfo.h" 52 #include "llvm/CodeGen/MachineMemOperand.h" 53 #include "llvm/CodeGen/MachineOperand.h" 54 #include "llvm/CodeGen/MachineRegisterInfo.h" 55 #include "llvm/CodeGen/RuntimeLibcalls.h" 56 #include "llvm/CodeGen/SelectionDAG.h" 57 #include "llvm/CodeGen/SelectionDAGNodes.h" 58 #include "llvm/CodeGen/TargetInstrInfo.h" 59 #include "llvm/CodeGen/TargetLowering.h" 60 #include "llvm/CodeGen/TargetOpcodes.h" 61 #include "llvm/CodeGen/TargetRegisterInfo.h" 62 #include "llvm/CodeGen/TargetSubtargetInfo.h" 63 #include "llvm/CodeGen/ValueTypes.h" 64 #include "llvm/IR/Attributes.h" 65 #include "llvm/IR/CallingConv.h" 66 #include "llvm/IR/Constant.h" 67 #include "llvm/IR/Constants.h" 68 #include "llvm/IR/DataLayout.h" 69 #include "llvm/IR/DebugLoc.h" 70 #include "llvm/IR/DerivedTypes.h" 71 #include "llvm/IR/Function.h" 72 #include "llvm/IR/GlobalAlias.h" 73 #include "llvm/IR/GlobalValue.h" 74 #include "llvm/IR/GlobalVariable.h" 75 #include "llvm/IR/IRBuilder.h" 76 #include "llvm/IR/InlineAsm.h" 77 #include "llvm/IR/Instruction.h" 78 #include "llvm/IR/Instructions.h" 79 #include "llvm/IR/IntrinsicInst.h" 80 #include "llvm/IR/Intrinsics.h" 81 #include "llvm/IR/IntrinsicsARM.h" 82 #include "llvm/IR/Module.h" 83 #include "llvm/IR/PatternMatch.h" 84 #include "llvm/IR/Type.h" 85 #include "llvm/IR/User.h" 86 #include "llvm/IR/Value.h" 87 #include "llvm/MC/MCInstrDesc.h" 88 #include "llvm/MC/MCInstrItineraries.h" 89 #include "llvm/MC/MCRegisterInfo.h" 90 #include "llvm/MC/MCSchedule.h" 91 #include "llvm/Support/AtomicOrdering.h" 92 #include "llvm/Support/BranchProbability.h" 93 #include "llvm/Support/Casting.h" 94 #include "llvm/Support/CodeGen.h" 95 #include "llvm/Support/CommandLine.h" 96 #include "llvm/Support/Compiler.h" 97 #include "llvm/Support/Debug.h" 98 #include "llvm/Support/ErrorHandling.h" 99 #include "llvm/Support/KnownBits.h" 100 #include "llvm/Support/MachineValueType.h" 101 #include "llvm/Support/MathExtras.h" 102 #include 
"llvm/Support/raw_ostream.h" 103 #include "llvm/Target/TargetMachine.h" 104 #include "llvm/Target/TargetOptions.h" 105 #include <algorithm> 106 #include <cassert> 107 #include <cstdint> 108 #include <cstdlib> 109 #include <iterator> 110 #include <limits> 111 #include <string> 112 #include <tuple> 113 #include <utility> 114 #include <vector> 115 116 using namespace llvm; 117 using namespace llvm::PatternMatch; 118 119 #define DEBUG_TYPE "arm-isel" 120 121 STATISTIC(NumTailCalls, "Number of tail calls"); 122 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 123 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 124 STATISTIC(NumConstpoolPromoted, 125 "Number of constants with their storage promoted into constant pools"); 126 127 static cl::opt<bool> 128 ARMInterworking("arm-interworking", cl::Hidden, 129 cl::desc("Enable / disable ARM interworking (for debugging only)"), 130 cl::init(true)); 131 132 static cl::opt<bool> EnableConstpoolPromotion( 133 "arm-promote-constant", cl::Hidden, 134 cl::desc("Enable / disable promotion of unnamed_addr constants into " 135 "constant pools"), 136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 137 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 138 "arm-promote-constant-max-size", cl::Hidden, 139 cl::desc("Maximum size of constant to promote into a constant pool"), 140 cl::init(64)); 141 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 142 "arm-promote-constant-max-total", cl::Hidden, 143 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 144 cl::init(128)); 145 146 static cl::opt<unsigned> 147 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 148 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 149 cl::init(2)); 150 151 // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

/// Configure the SelectionDAG legalization actions for the NEON vector type
/// VT. Loads/stores of VT are promoted to PromotedLdStVT and bitwise ops to
/// PromotedBitwiseVT whenever those types differ from VT, so that only one
/// set of patterns per register width is needed.
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  // Route loads/stores through the canonical load/store type of this width.
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  // SETCC is custom-lowered for every element type except f64.
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  // int <-> fp conversions: custom for i32 elements, expanded otherwise.
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  // Integer vector shifts need custom lowering.
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  // abs/min/max are legal on integer vectors except the 64-bit-element ones;
  // saturating add/sub are legal on all integer vectors.
  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

/// Register a vector type in the NEON DPR (D-register) class and set up its
/// operation actions.
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

/// Register a vector type in the NEON DPair (Q-register) class and set up
/// its operation actions.
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

/// Mark every target-independent operation on VT as Expand, then re-mark the
/// few trivial operations that remain supported as Legal.
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

/// Apply Action to all three extending-load flavors (any-, zero- and
/// sign-extend) from type From to type To.
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

/// Register and configure the MVE vector types. HasMVEFP indicates whether
/// the floating-point vector operations (MVE.fp) are available in addition
/// to the integer subset.
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);

    // Without MVE.fp, int <-> fp vector conversions must be expanded.
    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    // Without MVE.fp, default everything on the FP vector types to Expand;
    // the actions set below then restore what is still supported.
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32, and from v8i8 to
  // v8i16.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types: registered in the VCCR class, with mostly custom
  // lowering since no target-independent node matches them directly.
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
  }
}

/// Construct the ARM target lowering for the given subtarget: registers the
/// legal value types, configures legalization actions, and sets up the
/// runtime library calls.
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
423 const ARMSubtarget &STI) 424 : TargetLowering(TM), Subtarget(&STI) { 425 RegInfo = Subtarget->getRegisterInfo(); 426 Itins = Subtarget->getInstrItineraryData(); 427 428 setBooleanContents(ZeroOrOneBooleanContent); 429 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 430 431 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 432 !Subtarget->isTargetWatchOS()) { 433 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 434 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 435 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 436 IsHFTarget ? CallingConv::ARM_AAPCS_VFP 437 : CallingConv::ARM_AAPCS); 438 } 439 440 if (Subtarget->isTargetMachO()) { 441 // Uses VFP for Thumb libfuncs if available. 442 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 443 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 444 static const struct { 445 const RTLIB::Libcall Op; 446 const char * const Name; 447 const ISD::CondCode Cond; 448 } LibraryCalls[] = { 449 // Single-precision floating-point arithmetic. 450 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 451 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 452 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 453 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 454 455 // Double-precision floating-point arithmetic. 456 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 457 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 458 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 459 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 460 461 // Single-precision comparisons. 
462 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 463 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 464 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 465 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 466 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 467 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 468 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 469 470 // Double-precision comparisons. 471 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 472 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 473 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 474 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 475 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 476 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 477 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 478 479 // Floating-point to integer conversions. 480 // i64 conversions are done via library routines even when generating VFP 481 // instructions, so use the same ones. 482 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 483 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 484 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 485 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 486 487 // Conversions between floating types. 488 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 489 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 490 491 // Integer to floating-point conversions. 492 // i64 conversions are done via library routines even when generating VFP 493 // instructions, so use the same ones. 494 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 495 // e.g., __floatunsidf vs. __floatunssidfvfp. 
496 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 497 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 498 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 499 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 500 }; 501 502 for (const auto &LC : LibraryCalls) { 503 setLibcallName(LC.Op, LC.Name); 504 if (LC.Cond != ISD::SETCC_INVALID) 505 setCmpLibcallCC(LC.Op, LC.Cond); 506 } 507 } 508 } 509 510 // These libcalls are not available in 32-bit. 511 setLibcallName(RTLIB::SHL_I128, nullptr); 512 setLibcallName(RTLIB::SRL_I128, nullptr); 513 setLibcallName(RTLIB::SRA_I128, nullptr); 514 515 // RTLIB 516 if (Subtarget->isAAPCS_ABI() && 517 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 518 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 519 static const struct { 520 const RTLIB::Libcall Op; 521 const char * const Name; 522 const CallingConv::ID CC; 523 const ISD::CondCode Cond; 524 } LibraryCalls[] = { 525 // Double-precision floating-point arithmetic helper functions 526 // RTABI chapter 4.1.2, Table 2 527 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 528 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 529 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 530 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 531 532 // Double-precision floating-point comparison helper functions 533 // RTABI chapter 4.1.2, Table 3 534 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 535 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 536 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 537 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 538 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 539 { RTLIB::OGT_F64, 
"__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 540 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 541 542 // Single-precision floating-point arithmetic helper functions 543 // RTABI chapter 4.1.2, Table 4 544 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 545 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 546 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 547 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 548 549 // Single-precision floating-point comparison helper functions 550 // RTABI chapter 4.1.2, Table 5 551 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 552 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 553 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 554 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 555 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 556 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 557 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 558 559 // Floating-point to integer conversions. 
560 // RTABI chapter 4.1.2, Table 6 561 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 562 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 563 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 564 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 565 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 566 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 567 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 568 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 569 570 // Conversions between floating types. 571 // RTABI chapter 4.1.2, Table 7 572 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 573 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 574 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 575 576 // Integer to floating-point conversions. 
577 // RTABI chapter 4.1.2, Table 8 578 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 579 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 580 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 581 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 582 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 583 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 584 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 585 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 586 587 // Long long helper functions 588 // RTABI chapter 4.2, Table 9 589 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 590 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 591 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 592 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 593 594 // Integer division functions 595 // RTABI chapter 4.3.1 596 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 597 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 598 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 599 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 600 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 601 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 602 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 603 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 604 }; 605 606 for (const auto &LC : LibraryCalls) { 607 
setLibcallName(LC.Op, LC.Name); 608 setLibcallCallingConv(LC.Op, LC.CC); 609 if (LC.Cond != ISD::SETCC_INVALID) 610 setCmpLibcallCC(LC.Op, LC.Cond); 611 } 612 613 // EABI dependent RTLIB 614 if (TM.Options.EABIVersion == EABI::EABI4 || 615 TM.Options.EABIVersion == EABI::EABI5) { 616 static const struct { 617 const RTLIB::Libcall Op; 618 const char *const Name; 619 const CallingConv::ID CC; 620 const ISD::CondCode Cond; 621 } MemOpsLibraryCalls[] = { 622 // Memory operations 623 // RTABI chapter 4.3.4 624 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 627 }; 628 629 for (const auto &LC : MemOpsLibraryCalls) { 630 setLibcallName(LC.Op, LC.Name); 631 setLibcallCallingConv(LC.Op, LC.CC); 632 if (LC.Cond != ISD::SETCC_INVALID) 633 setCmpLibcallCC(LC.Op, LC.Cond); 634 } 635 } 636 } 637 638 if (Subtarget->isTargetWindows()) { 639 static const struct { 640 const RTLIB::Libcall Op; 641 const char * const Name; 642 const CallingConv::ID CC; 643 } LibraryCalls[] = { 644 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 645 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 646 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 647 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 648 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 649 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 650 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 651 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 652 }; 653 654 for (const auto &LC : LibraryCalls) { 655 setLibcallName(LC.Op, LC.Name); 656 setLibcallCallingConv(LC.Op, LC.CC); 657 } 658 } 659 660 // Use divmod compiler-rt calls for iOS 5.0 and later. 
661 if (Subtarget->isTargetMachO() && 662 !(Subtarget->isTargetIOS() && 663 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 664 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 665 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 666 } 667 668 // The half <-> float conversion functions are always soft-float on 669 // non-watchos platforms, but are needed for some targets which use a 670 // hard-float calling convention by default. 671 if (!Subtarget->isTargetWatchABI()) { 672 if (Subtarget->isAAPCS_ABI()) { 673 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 674 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 675 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 676 } else { 677 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 678 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 679 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 680 } 681 } 682 683 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 684 // a __gnu_ prefix (which is the default). 
685 if (Subtarget->isTargetAEABI()) { 686 static const struct { 687 const RTLIB::Libcall Op; 688 const char * const Name; 689 const CallingConv::ID CC; 690 } LibraryCalls[] = { 691 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 692 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 693 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 694 }; 695 696 for (const auto &LC : LibraryCalls) { 697 setLibcallName(LC.Op, LC.Name); 698 setLibcallCallingConv(LC.Op, LC.CC); 699 } 700 } 701 702 if (Subtarget->isThumb1Only()) 703 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 704 else 705 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 706 707 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 708 Subtarget->hasFPRegs()) { 709 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 710 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 711 if (!Subtarget->hasVFP2Base()) 712 setAllExpand(MVT::f32); 713 if (!Subtarget->hasFP64()) 714 setAllExpand(MVT::f64); 715 } 716 717 if (Subtarget->hasFullFP16()) { 718 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 719 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 720 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 721 722 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 723 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 724 } 725 726 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 727 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 728 setTruncStoreAction(VT, InnerVT, Expand); 729 addAllExtLoads(VT, InnerVT, Expand); 730 } 731 732 setOperationAction(ISD::MULHS, VT, Expand); 733 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 734 setOperationAction(ISD::MULHU, VT, Expand); 735 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 736 737 setOperationAction(ISD::BSWAP, VT, Expand); 738 } 739 740 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 741 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 742 743 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 
744 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 745 746 if (Subtarget->hasMVEIntegerOps()) 747 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 748 749 // Combine low-overhead loop intrinsics so that we can lower i1 types. 750 if (Subtarget->hasLOB()) { 751 setTargetDAGCombine(ISD::BRCOND); 752 setTargetDAGCombine(ISD::BR_CC); 753 } 754 755 if (Subtarget->hasNEON()) { 756 addDRTypeForNEON(MVT::v2f32); 757 addDRTypeForNEON(MVT::v8i8); 758 addDRTypeForNEON(MVT::v4i16); 759 addDRTypeForNEON(MVT::v2i32); 760 addDRTypeForNEON(MVT::v1i64); 761 762 addQRTypeForNEON(MVT::v4f32); 763 addQRTypeForNEON(MVT::v2f64); 764 addQRTypeForNEON(MVT::v16i8); 765 addQRTypeForNEON(MVT::v8i16); 766 addQRTypeForNEON(MVT::v4i32); 767 addQRTypeForNEON(MVT::v2i64); 768 769 if (Subtarget->hasFullFP16()) { 770 addQRTypeForNEON(MVT::v8f16); 771 addDRTypeForNEON(MVT::v4f16); 772 } 773 } 774 775 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 776 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 777 // none of Neon, MVE or VFP supports any arithmetic operations on it. 778 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 779 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 780 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 781 // FIXME: Code duplication: FDIV and FREM are expanded always, see 782 // ARMTargetLowering::addTypeForNEON method for details. 783 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 784 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 785 // FIXME: Create unittest. 786 // In another words, find a way when "copysign" appears in DAG with vector 787 // operands. 788 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 789 // FIXME: Code duplication: SETCC has custom operation action, see 790 // ARMTargetLowering::addTypeForNEON method for details. 791 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 792 // FIXME: Create unittest for FNEG and for FABS. 
793 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 794 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 795 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 796 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 797 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 798 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 799 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 800 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 801 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 802 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 803 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 804 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 805 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 806 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 807 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 808 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 809 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 810 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 811 } 812 813 if (Subtarget->hasNEON()) { 814 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 815 // supported for v4f32. 816 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 817 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 818 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 819 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 820 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 821 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 822 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 823 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 824 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 825 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 826 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 827 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 828 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 829 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 830 831 // Mark v2f32 intrinsics. 
832 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 833 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 834 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 835 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 836 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 837 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 838 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 839 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 840 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 841 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 842 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 843 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 844 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 845 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 846 847 // Neon does not support some operations on v1i64 and v2i64 types. 848 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 849 // Custom handling for some quad-vector types to detect VMULL. 850 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 851 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 852 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 853 // Custom handling for some vector types to avoid expensive expansions 854 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 855 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 856 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 857 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 858 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 859 // a destination type that is wider than the source, and nor does 860 // it have a FP_TO_[SU]INT instruction with a narrower destination than 861 // source. 
862 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 863 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 864 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 865 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 866 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 867 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 868 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 869 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 870 871 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 872 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 873 874 // NEON does not have single instruction CTPOP for vectors with element 875 // types wider than 8-bits. However, custom lowering can leverage the 876 // v8i8/v16i8 vcnt instruction. 877 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 878 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 879 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 880 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 881 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 882 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 883 884 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 885 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 886 887 // NEON does not have single instruction CTTZ for vectors. 
888 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 889 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 890 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 891 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 892 893 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 894 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 895 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 896 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 897 898 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 899 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 900 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 901 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 902 903 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 904 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 905 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 906 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 907 908 // NEON only has FMA instructions as of VFP4. 909 if (!Subtarget->hasVFP4Base()) { 910 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 911 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 912 } 913 914 setTargetDAGCombine(ISD::SHL); 915 setTargetDAGCombine(ISD::SRL); 916 setTargetDAGCombine(ISD::SRA); 917 setTargetDAGCombine(ISD::FP_TO_SINT); 918 setTargetDAGCombine(ISD::FP_TO_UINT); 919 setTargetDAGCombine(ISD::FDIV); 920 setTargetDAGCombine(ISD::LOAD); 921 922 // It is legal to extload from v4i8 to v4i16 or v4i32. 
923 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 924 MVT::v2i32}) { 925 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 926 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 927 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 928 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 929 } 930 } 931 } 932 933 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 934 setTargetDAGCombine(ISD::BUILD_VECTOR); 935 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 936 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 937 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 938 setTargetDAGCombine(ISD::STORE); 939 setTargetDAGCombine(ISD::SIGN_EXTEND); 940 setTargetDAGCombine(ISD::ZERO_EXTEND); 941 setTargetDAGCombine(ISD::ANY_EXTEND); 942 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 943 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 944 setTargetDAGCombine(ISD::INTRINSIC_VOID); 945 setTargetDAGCombine(ISD::VECREDUCE_ADD); 946 setTargetDAGCombine(ISD::ADD); 947 setTargetDAGCombine(ISD::BITCAST); 948 } 949 if (Subtarget->hasMVEIntegerOps()) { 950 setTargetDAGCombine(ISD::SMIN); 951 setTargetDAGCombine(ISD::UMIN); 952 setTargetDAGCombine(ISD::SMAX); 953 setTargetDAGCombine(ISD::UMAX); 954 } 955 956 if (!Subtarget->hasFP64()) { 957 // When targeting a floating-point unit with only single-precision 958 // operations, f64 is legal for the few double-precision instructions which 959 // are present However, no double-precision operations other than moves, 960 // loads and stores are provided by the hardware. 
961 setOperationAction(ISD::FADD, MVT::f64, Expand); 962 setOperationAction(ISD::FSUB, MVT::f64, Expand); 963 setOperationAction(ISD::FMUL, MVT::f64, Expand); 964 setOperationAction(ISD::FMA, MVT::f64, Expand); 965 setOperationAction(ISD::FDIV, MVT::f64, Expand); 966 setOperationAction(ISD::FREM, MVT::f64, Expand); 967 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 968 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 969 setOperationAction(ISD::FNEG, MVT::f64, Expand); 970 setOperationAction(ISD::FABS, MVT::f64, Expand); 971 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 972 setOperationAction(ISD::FSIN, MVT::f64, Expand); 973 setOperationAction(ISD::FCOS, MVT::f64, Expand); 974 setOperationAction(ISD::FPOW, MVT::f64, Expand); 975 setOperationAction(ISD::FLOG, MVT::f64, Expand); 976 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 977 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 978 setOperationAction(ISD::FEXP, MVT::f64, Expand); 979 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 980 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 981 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 982 setOperationAction(ISD::FRINT, MVT::f64, Expand); 983 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 984 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 985 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 986 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 987 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 988 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 989 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 990 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 991 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 992 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 993 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 994 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); 995 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 996 
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 997 } 998 999 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 1000 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 1001 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 1002 if (Subtarget->hasFullFP16()) { 1003 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 1004 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 1005 } 1006 } 1007 1008 if (!Subtarget->hasFP16()) { 1009 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1010 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1011 } 1012 1013 computeRegisterProperties(Subtarget->getRegisterInfo()); 1014 1015 // ARM does not have floating-point extending loads. 1016 for (MVT VT : MVT::fp_valuetypes()) { 1017 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1018 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1019 } 1020 1021 // ... or truncating stores 1022 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1023 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1024 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1025 1026 // ARM does not have i1 sign extending load. 1027 for (MVT VT : MVT::integer_valuetypes()) 1028 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1029 1030 // ARM supports all 4 flavors of integer indexed load / store. 1031 if (!Subtarget->isThumb1Only()) { 1032 for (unsigned im = (unsigned)ISD::PRE_INC; 1033 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1034 setIndexedLoadAction(im, MVT::i1, Legal); 1035 setIndexedLoadAction(im, MVT::i8, Legal); 1036 setIndexedLoadAction(im, MVT::i16, Legal); 1037 setIndexedLoadAction(im, MVT::i32, Legal); 1038 setIndexedStoreAction(im, MVT::i1, Legal); 1039 setIndexedStoreAction(im, MVT::i8, Legal); 1040 setIndexedStoreAction(im, MVT::i16, Legal); 1041 setIndexedStoreAction(im, MVT::i32, Legal); 1042 } 1043 } else { 1044 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
1045 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1046 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1047 } 1048 1049 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1050 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1051 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1052 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1053 1054 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1055 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1056 if (Subtarget->hasDSP()) { 1057 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1058 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1059 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1060 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1061 } 1062 if (Subtarget->hasBaseDSP()) { 1063 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1064 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1065 } 1066 1067 // i64 operation support. 1068 setOperationAction(ISD::MUL, MVT::i64, Expand); 1069 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1070 if (Subtarget->isThumb1Only()) { 1071 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1072 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1073 } 1074 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1075 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1076 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1077 1078 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1079 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1080 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1081 setOperationAction(ISD::SRL, MVT::i64, Custom); 1082 setOperationAction(ISD::SRA, MVT::i64, Custom); 1083 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1084 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1085 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1086 setOperationAction(ISD::STORE, MVT::i64, Custom); 1087 1088 // MVE lowers 64 bit shifts to lsll and lsrl 1089 // assuming that ISD::SRL and SRA of 
i64 are already marked custom 1090 if (Subtarget->hasMVEIntegerOps()) 1091 setOperationAction(ISD::SHL, MVT::i64, Custom); 1092 1093 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1094 if (Subtarget->isThumb1Only()) { 1095 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1096 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1097 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1098 } 1099 1100 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1101 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1102 1103 // ARM does not have ROTL. 1104 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1105 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1106 setOperationAction(ISD::ROTL, VT, Expand); 1107 setOperationAction(ISD::ROTR, VT, Expand); 1108 } 1109 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1110 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1111 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1112 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1113 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1114 } 1115 1116 // @llvm.readcyclecounter requires the Performance Monitors extension. 1117 // Default to the 0 expansion on unsupported platforms. 1118 // FIXME: Technically there are older ARM CPUs that have 1119 // implementation-specific ways of obtaining this information. 1120 if (Subtarget->hasPerfMon()) 1121 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1122 1123 // Only ARMv6 has BSWAP. 1124 if (!Subtarget->hasV6Ops()) 1125 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1126 1127 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1128 : Subtarget->hasDivideInARMMode(); 1129 if (!hasDivide) { 1130 // These are expanded into libcalls if the cpu doesn't have HW divider. 
1131 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1132 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1133 } 1134 1135 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1136 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1137 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1138 1139 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1140 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1141 } 1142 1143 setOperationAction(ISD::SREM, MVT::i32, Expand); 1144 setOperationAction(ISD::UREM, MVT::i32, Expand); 1145 1146 // Register based DivRem for AEABI (RTABI 4.2) 1147 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1148 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1149 Subtarget->isTargetWindows()) { 1150 setOperationAction(ISD::SREM, MVT::i64, Custom); 1151 setOperationAction(ISD::UREM, MVT::i64, Custom); 1152 HasStandaloneRem = false; 1153 1154 if (Subtarget->isTargetWindows()) { 1155 const struct { 1156 const RTLIB::Libcall Op; 1157 const char * const Name; 1158 const CallingConv::ID CC; 1159 } LibraryCalls[] = { 1160 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1161 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1162 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1163 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1164 1165 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1166 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1167 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1168 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1169 }; 1170 1171 for (const auto &LC : LibraryCalls) { 1172 setLibcallName(LC.Op, LC.Name); 1173 setLibcallCallingConv(LC.Op, LC.CC); 1174 } 1175 } else { 1176 const struct { 1177 const RTLIB::Libcall Op; 1178 const char * const Name; 1179 const CallingConv::ID CC; 1180 } LibraryCalls[] = { 1181 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 
1182 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1183 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1184 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1185 1186 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1187 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1188 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1189 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1190 }; 1191 1192 for (const auto &LC : LibraryCalls) { 1193 setLibcallName(LC.Op, LC.Name); 1194 setLibcallCallingConv(LC.Op, LC.CC); 1195 } 1196 } 1197 1198 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1199 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1200 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1201 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1202 } else { 1203 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1204 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1205 } 1206 1207 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1208 // MSVCRT doesn't have powi; fall back to pow 1209 setLibcallName(RTLIB::POWI_F32, nullptr); 1210 setLibcallName(RTLIB::POWI_F64, nullptr); 1211 } 1212 1213 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1214 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1215 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1216 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1217 1218 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1219 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1220 1221 // Use the default implementation. 
1222 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1223 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1224 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1225 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1226 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1227 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1228 1229 if (Subtarget->isTargetWindows()) 1230 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1231 else 1232 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1233 1234 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1235 // the default expansion. 1236 InsertFencesForAtomic = false; 1237 if (Subtarget->hasAnyDataBarrier() && 1238 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1239 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1240 // to ldrex/strex loops already. 1241 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1242 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1243 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1244 1245 // On v8, we have particularly efficient implementations of atomic fences 1246 // if they can be combined with nearby atomic loads and stores. 1247 if (!Subtarget->hasAcquireRelease() || 1248 getTargetMachine().getOptLevel() == 0) { 1249 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1250 InsertFencesForAtomic = true; 1251 } 1252 } else { 1253 // If there's anything we can use as a barrier, go through custom lowering 1254 // for ATOMIC_FENCE. 1255 // If target has DMB in thumb, Fences can be inserted. 1256 if (Subtarget->hasDataBarrier()) 1257 InsertFencesForAtomic = true; 1258 1259 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1260 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1261 1262 // Set them all for expansion, which will force libcalls. 
1263 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1264 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1265 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1266 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1267 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1268 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1269 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1270 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1271 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1272 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1273 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1274 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1275 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1276 // Unordered/Monotonic case. 1277 if (!InsertFencesForAtomic) { 1278 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1279 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1280 } 1281 } 1282 1283 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1284 1285 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1286 if (!Subtarget->hasV6Ops()) { 1287 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1288 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1289 } 1290 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1291 1292 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1293 !Subtarget->isThumb1Only()) { 1294 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1295 // iff target supports vfp2. 1296 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1297 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1298 } 1299 1300 // We want to custom lower some of our intrinsics. 
1301 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1302 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1303 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1304 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1305 if (Subtarget->useSjLjEH()) 1306 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1307 1308 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1309 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1310 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1311 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1312 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1313 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1314 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1315 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1316 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1317 if (Subtarget->hasFullFP16()) { 1318 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1319 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1320 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1321 } 1322 1323 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1324 1325 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1326 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1327 if (Subtarget->hasFullFP16()) 1328 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1329 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1330 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1331 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1332 1333 // We don't support sin/cos/fmod/copysign/pow 1334 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1335 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1336 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1337 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1338 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1339 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1340 setOperationAction(ISD::FREM, MVT::f64, Expand); 1341 
setOperationAction(ISD::FREM, MVT::f32, Expand); 1342 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1343 !Subtarget->isThumb1Only()) { 1344 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1345 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1346 } 1347 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1348 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1349 1350 if (!Subtarget->hasVFP4Base()) { 1351 setOperationAction(ISD::FMA, MVT::f64, Expand); 1352 setOperationAction(ISD::FMA, MVT::f32, Expand); 1353 } 1354 1355 // Various VFP goodness 1356 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1357 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1358 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1359 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1360 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1361 } 1362 1363 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1364 if (!Subtarget->hasFP16()) { 1365 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1366 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1367 } 1368 1369 // Strict floating-point comparisons need custom lowering. 1370 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1371 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1372 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1373 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1374 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1375 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1376 } 1377 1378 // Use __sincos_stret if available. 1379 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1380 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1381 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1382 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1383 } 1384 1385 // FP-ARMv8 implements a lot of rounding-like FP operations. 
1386 if (Subtarget->hasFPARMv8Base()) { 1387 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1388 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1389 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1390 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1391 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1392 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1393 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1394 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1395 if (Subtarget->hasNEON()) { 1396 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1397 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1398 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1399 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1400 } 1401 1402 if (Subtarget->hasFP64()) { 1403 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1404 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1405 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1406 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1407 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1408 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1409 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1410 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1411 } 1412 } 1413 1414 // FP16 often need to be promoted to call lib functions 1415 if (Subtarget->hasFullFP16()) { 1416 setOperationAction(ISD::FREM, MVT::f16, Promote); 1417 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1418 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1419 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1420 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1421 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1422 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1423 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1424 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1425 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1426 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1427 
setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1428 1429 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1430 } 1431 1432 if (Subtarget->hasNEON()) { 1433 // vmin and vmax aren't available in a scalar form, so we can use 1434 // a NEON instruction with an undef lane instead. This has a performance 1435 // penalty on some cores, so we don't do this unless we have been 1436 // asked to by the core tuning model. 1437 if (Subtarget->useNEONForSinglePrecisionFP()) { 1438 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1439 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1440 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1441 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1442 } 1443 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1444 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1445 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1446 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1447 1448 if (Subtarget->hasFullFP16()) { 1449 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1450 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1451 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1452 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1453 1454 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1455 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1456 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1457 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1458 } 1459 } 1460 1461 // We have target-specific dag combine patterns for the following nodes: 1462 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1463 setTargetDAGCombine(ISD::ADD); 1464 setTargetDAGCombine(ISD::SUB); 1465 setTargetDAGCombine(ISD::MUL); 1466 setTargetDAGCombine(ISD::AND); 1467 setTargetDAGCombine(ISD::OR); 1468 setTargetDAGCombine(ISD::XOR); 1469 1470 if (Subtarget->hasMVEIntegerOps()) 1471 setTargetDAGCombine(ISD::VSELECT); 1472 1473 if (Subtarget->hasV6Ops()) 1474 setTargetDAGCombine(ISD::SRL); 1475 if 
(Subtarget->isThumb1Only()) 1476 setTargetDAGCombine(ISD::SHL); 1477 1478 setStackPointerRegisterToSaveRestore(ARM::SP); 1479 1480 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1481 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1482 setSchedulingPreference(Sched::RegPressure); 1483 else 1484 setSchedulingPreference(Sched::Hybrid); 1485 1486 //// temporary - rewrite interface to use type 1487 MaxStoresPerMemset = 8; 1488 MaxStoresPerMemsetOptSize = 4; 1489 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1490 MaxStoresPerMemcpyOptSize = 2; 1491 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1492 MaxStoresPerMemmoveOptSize = 2; 1493 1494 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1495 // are at least 4 bytes aligned. 1496 setMinStackArgumentAlignment(Align(4)); 1497 1498 // Prefer likely predicted branches to selects on out-of-order cores. 1499 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1500 1501 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1502 1503 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1504 1505 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1506 setTargetDAGCombine(ISD::ABS); 1507 } 1508 1509 bool ARMTargetLowering::useSoftFloat() const { 1510 return Subtarget->useSoftFloat(); 1511 } 1512 1513 // FIXME: It might make sense to define the representative register class as the 1514 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1515 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1516 // SPR's representative would be DPR_VFP2. This should work well if register 1517 // pressure tracking were modified such that a register use would increment the 1518 // pressure of the register class's representative and all of it's super 1519 // classes' representatives transitively. 
// We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  // 128-bit types occupy two D registers; wider REG_SEQUENCE-only types
  // scale the cost accordingly.
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

/// Return a human-readable name for the given target-specific
/// (ARMISD) SelectionDAG node opcode, or nullptr if it is unknown.
/// Used for debug printing of DAG nodes.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER: break;
  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL:          return "ARMISD::CALL";
  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
  case ARMISD::tSECALL:       return "ARMISD::tSECALL";
  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
  case ARMISD::SERET_FLAG:    return "ARMISD::SERET_FLAG";
  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
  case ARMISD::CMP:           return "ARMISD::CMP";
  case ARMISD::CMN:           return "ARMISD::CMN";
  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
  case ARMISD::CMPFPE:        return "ARMISD::CMPFPE";
  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
  case ARMISD::CMPFPEw0:      return "ARMISD::CMPFPEw0";
  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";

  case ARMISD::CMOV:          return "ARMISD::CMOV";
  case ARMISD::SUBS:          return "ARMISD::SUBS";

  case ARMISD::SSAT:          return "ARMISD::SSAT";
  case ARMISD::USAT:          return "ARMISD::USAT";

  case ARMISD::ASRL:          return "ARMISD::ASRL";
  case ARMISD::LSRL:          return "ARMISD::LSRL";
  case ARMISD::LSLL:          return "ARMISD::LSLL";

  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:           return "ARMISD::RRX";

  case ARMISD::ADDC:          return "ARMISD::ADDC";
  case ARMISD::ADDE:          return "ARMISD::ADDE";
  case ARMISD::SUBC:          return "ARMISD::SUBC";
  case ARMISD::SUBE:          return "ARMISD::SUBE";
  case ARMISD::LSLS:          return "ARMISD::LSLS";

  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";

  case ARMISD::LDRD:          return "ARMISD::LDRD";
  case ARMISD::STRD:          return "ARMISD::STRD";

  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";

  case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
  case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
  case ARMISD::VCMP:          return "ARMISD::VCMP";
  case ARMISD::VCMPZ:         return "ARMISD::VCMPZ";
  case ARMISD::VTST:          return "ARMISD::VTST";

  case ARMISD::VSHLs:         return "ARMISD::VSHLs";
  case ARMISD::VSHLu:         return "ARMISD::VSHLu";
  case ARMISD::VSHLIMM:       return "ARMISD::VSHLIMM";
  case ARMISD::VSHRsIMM:      return "ARMISD::VSHRsIMM";
  case ARMISD::VSHRuIMM:      return "ARMISD::VSHRuIMM";
  case ARMISD::VRSHRsIMM:     return "ARMISD::VRSHRsIMM";
  case ARMISD::VRSHRuIMM:     return "ARMISD::VRSHRuIMM";
  case ARMISD::VRSHRNIMM:     return "ARMISD::VRSHRNIMM";
  case ARMISD::VQSHLsIMM:     return "ARMISD::VQSHLsIMM";
  case ARMISD::VQSHLuIMM:     return "ARMISD::VQSHLuIMM";
  case ARMISD::VQSHLsuIMM:    return "ARMISD::VQSHLsuIMM";
  case ARMISD::VQSHRNsIMM:    return "ARMISD::VQSHRNsIMM";
  case ARMISD::VQSHRNuIMM:    return "ARMISD::VQSHRNuIMM";
  case ARMISD::VQSHRNsuIMM:   return "ARMISD::VQSHRNsuIMM";
  case ARMISD::VQRSHRNsIMM:   return "ARMISD::VQRSHRNsIMM";
  case ARMISD::VQRSHRNuIMM:   return "ARMISD::VQRSHRNuIMM";
  case ARMISD::VQRSHRNsuIMM:  return "ARMISD::VQRSHRNsuIMM";
  case ARMISD::VSLIIMM:       return "ARMISD::VSLIIMM";
  case ARMISD::VSRIIMM:       return "ARMISD::VSRIIMM";
  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP:          return "ARMISD::VDUP";
  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
  case ARMISD::VEXT:          return "ARMISD::VEXT";
  case ARMISD::VREV64:        return "ARMISD::VREV64";
  case ARMISD::VREV32:        return "ARMISD::VREV32";
  case ARMISD::VREV16:        return "ARMISD::VREV16";
  case ARMISD::VZIP:          return "ARMISD::VZIP";
  case ARMISD::VUZP:          return "ARMISD::VUZP";
  case ARMISD::VTRN:          return "ARMISD::VTRN";
  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
  case ARMISD::VMOVN:         return "ARMISD::VMOVN";
  case ARMISD::VQMOVNs:       return "ARMISD::VQMOVNs";
  case ARMISD::VQMOVNu:       return "ARMISD::VQMOVNu";
  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
  case ARMISD::VADDVs:        return "ARMISD::VADDVs";
  case ARMISD::VADDVu:        return "ARMISD::VADDVu";
  case ARMISD::VADDLVs:       return "ARMISD::VADDLVs";
  case ARMISD::VADDLVu:       return "ARMISD::VADDLVu";
  case ARMISD::VADDLVAs:      return "ARMISD::VADDLVAs";
  case ARMISD::VADDLVAu:      return "ARMISD::VADDLVAu";
  case ARMISD::VADDLVps:      return "ARMISD::VADDLVps";
  case ARMISD::VADDLVpu:      return "ARMISD::VADDLVpu";
  case ARMISD::VADDLVAps:     return "ARMISD::VADDLVAps";
  case ARMISD::VADDLVApu:     return "ARMISD::VADDLVApu";
  case ARMISD::VMLAVs:        return "ARMISD::VMLAVs";
  case ARMISD::VMLAVu:        return "ARMISD::VMLAVu";
  case ARMISD::VMLALVs:       return "ARMISD::VMLALVs";
  case ARMISD::VMLALVu:       return "ARMISD::VMLALVu";
  case ARMISD::VMLALVAs:      return "ARMISD::VMLALVAs";
  case ARMISD::VMLALVAu:      return "ARMISD::VMLALVAu";
  case ARMISD::UMAAL:         return "ARMISD::UMAAL";
  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
  case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
  case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
  case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
  case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
  case ARMISD::SMULWB:        return "ARMISD::SMULWB";
  case ARMISD::SMULWT:        return "ARMISD::SMULWT";
  case ARMISD::SMLALD:        return "ARMISD::SMLALD";
  case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
  case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
  case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
  case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
  case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
  case ARMISD::QADD16b:       return "ARMISD::QADD16b";
  case ARMISD::QSUB16b:       return "ARMISD::QSUB16b";
  case ARMISD::QADD8b:        return "ARMISD::QADD8b";
  case ARMISD::QSUB8b:        return "ARMISD::QSUB8b";
  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI:           return "ARMISD::BFI";
  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
  case ARMISD::VBSL:          return "ARMISD::VBSL";
  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
  case ARMISD::WLS:           return "ARMISD::WLS";
  case ARMISD::LE:            return "ARMISD::LE";
  case ARMISD::LOOP_DEC:      return "ARMISD::LOOP_DEC";
  case ARMISD::CSINV:         return "ARMISD::CSINV";
  case ARMISD::CSNEG:         return "ARMISD::CSNEG";
  case ARMISD::CSINC:         return "ARMISD::CSINC";
  }
  return nullptr;
}

/// Return the value type to use for the result of a SETCC: a pointer-sized
/// integer for scalars, and for vectors either an MVE i1 predicate vector or
/// the element-for-element integer type.
EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);

  // MVE has a predicate register.
  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
    return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *
ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  (void)isDivergent;
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  // MVE Q registers.
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
1799 FastISel * 1800 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1801 const TargetLibraryInfo *libInfo) const { 1802 return ARM::createFastISel(funcInfo, libInfo); 1803 } 1804 1805 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1806 unsigned NumVals = N->getNumValues(); 1807 if (!NumVals) 1808 return Sched::RegPressure; 1809 1810 for (unsigned i = 0; i != NumVals; ++i) { 1811 EVT VT = N->getValueType(i); 1812 if (VT == MVT::Glue || VT == MVT::Other) 1813 continue; 1814 if (VT.isFloatingPoint() || VT.isVector()) 1815 return Sched::ILP; 1816 } 1817 1818 if (!N->isMachineOpcode()) 1819 return Sched::RegPressure; 1820 1821 // Load are scheduled for latency even if there instruction itinerary 1822 // is not available. 1823 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1824 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1825 1826 if (MCID.getNumDefs() == 0) 1827 return Sched::RegPressure; 1828 if (!Itins->isEmpty() && 1829 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1830 return Sched::ILP; 1831 1832 return Sched::RegPressure; 1833 } 1834 1835 //===----------------------------------------------------------------------===// 1836 // Lowering Code 1837 //===----------------------------------------------------------------------===// 1838 1839 static bool isSRL16(const SDValue &Op) { 1840 if (Op.getOpcode() != ISD::SRL) 1841 return false; 1842 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1843 return Const->getZExtValue() == 16; 1844 return false; 1845 } 1846 1847 static bool isSRA16(const SDValue &Op) { 1848 if (Op.getOpcode() != ISD::SRA) 1849 return false; 1850 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1851 return Const->getZExtValue() == 16; 1852 return false; 1853 } 1854 1855 static bool isSHL16(const SDValue &Op) { 1856 if (Op.getOpcode() != ISD::SHL) 1857 return false; 1858 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1859 return 
Const->getZExtValue() == 16; 1860 return false; 1861 } 1862 1863 // Check for a signed 16-bit value. We special case SRA because it makes it 1864 // more simple when also looking for SRAs that aren't sign extending a 1865 // smaller value. Without the check, we'd need to take extra care with 1866 // checking order for some operations. 1867 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1868 if (isSRA16(Op)) 1869 return isSHL16(Op.getOperand(0)); 1870 return DAG.ComputeNumSignBits(Op) == 17; 1871 } 1872 1873 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1874 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1875 switch (CC) { 1876 default: llvm_unreachable("Unknown condition code!"); 1877 case ISD::SETNE: return ARMCC::NE; 1878 case ISD::SETEQ: return ARMCC::EQ; 1879 case ISD::SETGT: return ARMCC::GT; 1880 case ISD::SETGE: return ARMCC::GE; 1881 case ISD::SETLT: return ARMCC::LT; 1882 case ISD::SETLE: return ARMCC::LE; 1883 case ISD::SETUGT: return ARMCC::HI; 1884 case ISD::SETUGE: return ARMCC::HS; 1885 case ISD::SETULT: return ARMCC::LO; 1886 case ISD::SETULE: return ARMCC::LS; 1887 } 1888 } 1889 1890 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  // Most FP conditions map to a single ARM condition code. The two that do
  // not (SETONE, SETUEQ) are expressed as a pair of conditions; CondCode2 is
  // left as AL ("always") when a second condition is not needed.
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  // "Ordered and not equal" needs both MI and GT.
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  // "Unordered or equal" needs both EQ and VS.
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}

//===----------------------------------------------------------------------===//
//                      Calling Convention Implementation
//===----------------------------------------------------------------------===//

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
CallingConv::ID
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_AAPCS:
  case CallingConv::ARM_APCS:
  case CallingConv::GHC:
  case CallingConv::CFGuard_Check:
    return CC;
  case CallingConv::PreserveMost:
    return CallingConv::PreserveMost;
  // VFP-register conventions fall back to core-register AAPCS for variadic
  // calls, since variadic arguments are always passed in core registers.
  case CallingConv::ARM_AAPCS_VFP:
  case CallingConv::Swift:
    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  case CallingConv::C:
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  case CallingConv::Fast:
  case CallingConv::CXX_FAST_TLS:
    if (!Subtarget->isAAPCS_ABI()) {
      if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasVFP2Base() &&
               !Subtarget->isThumb1Only() && !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  }
}

/// Select the CCAssignFn used to lower outgoing call arguments.
CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
}

/// Select the CCAssignFn used to lower return values.
CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  // Map the effective (not the nominal) convention to the matching
  // tablegen-generated argument/return assignment function.
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::CFGuard_Check:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    // Note: VA is a copy, not a reference, because it is re-assigned below
    // when a value spans several RVLocs entries (f64 / v2f64 pieces).
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64: reassemble the value from two i32
      // copies, consuming one extra RVLocs entry per extra register.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // The second f64 half of a v2f64 takes two more register copies.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// LowerMemOpCallTo - Store the argument to the stack.
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}

/// Split an f64 argument into two i32 halves (via VMOVRRD) and pass them in
/// the registers described by VA/NextVA, or store the second half to the
/// stack when NextVA is a memory location.
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         ISD::ArgFlagsTy Flags) const {
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  // 'id' selects which half goes in the first register, per endianness.
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                           dl, DAG, NextVA,
                                           Flags));
  }
}

/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain.
Also add input and output parameter 2125 /// nodes. 2126 SDValue 2127 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2128 SmallVectorImpl<SDValue> &InVals) const { 2129 SelectionDAG &DAG = CLI.DAG; 2130 SDLoc &dl = CLI.DL; 2131 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2132 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2133 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2134 SDValue Chain = CLI.Chain; 2135 SDValue Callee = CLI.Callee; 2136 bool &isTailCall = CLI.IsTailCall; 2137 CallingConv::ID CallConv = CLI.CallConv; 2138 bool doesNotRet = CLI.DoesNotReturn; 2139 bool isVarArg = CLI.IsVarArg; 2140 2141 MachineFunction &MF = DAG.getMachineFunction(); 2142 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2143 MachineFunction::CallSiteInfo CSInfo; 2144 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2145 bool isThisReturn = false; 2146 bool isCmseNSCall = false; 2147 bool PreferIndirect = false; 2148 2149 // Determine whether this is a non-secure function call. 2150 if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call")) 2151 isCmseNSCall = true; 2152 2153 // Disable tail calls if they're not supported. 2154 if (!Subtarget->supportsTailCall()) 2155 isTailCall = false; 2156 2157 // For both the non-secure calls and the returns from a CMSE entry function, 2158 // the function needs to do some extra work afte r the call, or before the 2159 // return, respectively, thus it cannot end with atail call 2160 if (isCmseNSCall || AFI->isCmseNSEntryFunction()) 2161 isTailCall = false; 2162 2163 if (isa<GlobalAddressSDNode>(Callee)) { 2164 // If we're optimizing for minimum size and the function is called three or 2165 // more times in this block, we can improve codesize by calling indirectly 2166 // as BLXr has a 16-bit encoding. 
2167 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2168 if (CLI.CB) { 2169 auto *BB = CLI.CB->getParent(); 2170 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2171 count_if(GV->users(), [&BB](const User *U) { 2172 return isa<Instruction>(U) && 2173 cast<Instruction>(U)->getParent() == BB; 2174 }) > 2; 2175 } 2176 } 2177 if (isTailCall) { 2178 // Check if it's really possible to do a tail call. 2179 isTailCall = IsEligibleForTailCallOptimization( 2180 Callee, CallConv, isVarArg, isStructRet, 2181 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2182 PreferIndirect); 2183 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) 2184 report_fatal_error("failed to perform tail call elimination on a call " 2185 "site marked musttail"); 2186 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2187 // detected sibcalls. 2188 if (isTailCall) 2189 ++NumTailCalls; 2190 } 2191 2192 // Analyze operands of the call, assigning locations to each operand. 2193 SmallVector<CCValAssign, 16> ArgLocs; 2194 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2195 *DAG.getContext()); 2196 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2197 2198 // Get a count of how many bytes are to be pushed on the stack. 2199 unsigned NumBytes = CCInfo.getNextStackOffset(); 2200 2201 if (isTailCall) { 2202 // For tail calls, memory operands are available in our caller's stack. 2203 NumBytes = 0; 2204 } else { 2205 // Adjust the stack pointer for the new arguments... 2206 // These operations are automatically eliminated by the prolog/epilog pass 2207 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2208 } 2209 2210 SDValue StackPtr = 2211 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2212 2213 RegsToPassVector RegsToPass; 2214 SmallVector<SDValue, 8> MemOpChains; 2215 2216 // Walk the register/memloc assignments, inserting copies/loads. 
In the case 2217 // of tail call optimization, arguments are handled later. 2218 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2219 i != e; 2220 ++i, ++realArgIdx) { 2221 CCValAssign &VA = ArgLocs[i]; 2222 SDValue Arg = OutVals[realArgIdx]; 2223 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2224 bool isByVal = Flags.isByVal(); 2225 2226 // Promote the value if needed. 2227 switch (VA.getLocInfo()) { 2228 default: llvm_unreachable("Unknown loc info!"); 2229 case CCValAssign::Full: break; 2230 case CCValAssign::SExt: 2231 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2232 break; 2233 case CCValAssign::ZExt: 2234 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2235 break; 2236 case CCValAssign::AExt: 2237 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2238 break; 2239 case CCValAssign::BCvt: 2240 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2241 break; 2242 } 2243 2244 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2245 if (VA.needsCustom()) { 2246 if (VA.getLocVT() == MVT::v2f64) { 2247 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2248 DAG.getConstant(0, dl, MVT::i32)); 2249 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2250 DAG.getConstant(1, dl, MVT::i32)); 2251 2252 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 2253 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2254 2255 VA = ArgLocs[++i]; // skip ahead to next loc 2256 if (VA.isRegLoc()) { 2257 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 2258 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2259 } else { 2260 assert(VA.isMemLoc()); 2261 2262 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 2263 dl, DAG, VA, Flags)); 2264 } 2265 } else { 2266 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2267 StackPtr, MemOpChains, Flags); 2268 } 2269 } else if (VA.isRegLoc()) { 2270 if (realArgIdx == 0 && Flags.isReturned() && 
!Flags.isSwiftSelf() && 2271 Outs[0].VT == MVT::i32) { 2272 assert(VA.getLocVT() == MVT::i32 && 2273 "unexpected calling convention register assignment"); 2274 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2275 "unexpected use of 'returned'"); 2276 isThisReturn = true; 2277 } 2278 const TargetOptions &Options = DAG.getTarget().Options; 2279 if (Options.EmitCallSiteInfo) 2280 CSInfo.emplace_back(VA.getLocReg(), i); 2281 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2282 } else if (isByVal) { 2283 assert(VA.isMemLoc()); 2284 unsigned offset = 0; 2285 2286 // True if this byval aggregate will be split between registers 2287 // and memory. 2288 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2289 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2290 2291 if (CurByValIdx < ByValArgsCount) { 2292 2293 unsigned RegBegin, RegEnd; 2294 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2295 2296 EVT PtrVT = 2297 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2298 unsigned int i, j; 2299 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2300 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2301 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2302 SDValue Load = 2303 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), 2304 DAG.InferPtrAlign(AddArg)); 2305 MemOpChains.push_back(Load.getValue(1)); 2306 RegsToPass.push_back(std::make_pair(j, Load)); 2307 } 2308 2309 // If parameter size outsides register area, "offset" value 2310 // helps us to calculate stack slot for remained part properly. 
2311 offset = RegEnd - RegBegin; 2312 2313 CCInfo.nextInRegsParam(); 2314 } 2315 2316 if (Flags.getByValSize() > 4*offset) { 2317 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2318 unsigned LocMemOffset = VA.getLocMemOffset(); 2319 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2320 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2321 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2322 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2323 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2324 MVT::i32); 2325 SDValue AlignNode = 2326 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); 2327 2328 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2329 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2330 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2331 Ops)); 2332 } 2333 } else if (!isTailCall) { 2334 assert(VA.isMemLoc()); 2335 2336 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2337 dl, DAG, VA, Flags)); 2338 } 2339 } 2340 2341 if (!MemOpChains.empty()) 2342 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2343 2344 // Build a sequence of copy-to-reg nodes chained together with token chain 2345 // and flag operands which copy the outgoing args into the appropriate regs. 2346 SDValue InFlag; 2347 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2348 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2349 RegsToPass[i].second, InFlag); 2350 InFlag = Chain.getValue(1); 2351 } 2352 2353 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2354 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2355 // node so that legalize doesn't hack it. 
2356 bool isDirect = false; 2357 2358 const TargetMachine &TM = getTargetMachine(); 2359 const Module *Mod = MF.getFunction().getParent(); 2360 const GlobalValue *GV = nullptr; 2361 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2362 GV = G->getGlobal(); 2363 bool isStub = 2364 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2365 2366 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2367 bool isLocalARMFunc = false; 2368 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2369 2370 if (Subtarget->genLongCalls()) { 2371 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2372 "long-calls codegen is not position independent!"); 2373 // Handle a global address or an external symbol. If it's not one of 2374 // those, the target's already in a register, so we don't need to do 2375 // anything extra. 2376 if (isa<GlobalAddressSDNode>(Callee)) { 2377 // Create a constant pool entry for the callee address 2378 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2379 ARMConstantPoolValue *CPV = 2380 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2381 2382 // Get the address of the callee into a register 2383 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2384 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2385 Callee = DAG.getLoad( 2386 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2387 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2388 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2389 const char *Sym = S->getSymbol(); 2390 2391 // Create a constant pool entry for the callee address 2392 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2393 ARMConstantPoolValue *CPV = 2394 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2395 ARMPCLabelIndex, 0); 2396 // Get the address of the callee into a register 2397 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2398 CPAddr = 
DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2399 Callee = DAG.getLoad( 2400 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2401 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2402 } 2403 } else if (isa<GlobalAddressSDNode>(Callee)) { 2404 if (!PreferIndirect) { 2405 isDirect = true; 2406 bool isDef = GV->isStrongDefinitionForLinker(); 2407 2408 // ARM call to a local ARM function is predicable. 2409 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2410 // tBX takes a register source operand. 2411 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2412 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2413 Callee = DAG.getNode( 2414 ARMISD::WrapperPIC, dl, PtrVt, 2415 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2416 Callee = DAG.getLoad( 2417 PtrVt, dl, DAG.getEntryNode(), Callee, 2418 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2419 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2420 MachineMemOperand::MOInvariant); 2421 } else if (Subtarget->isTargetCOFF()) { 2422 assert(Subtarget->isTargetWindows() && 2423 "Windows is the only supported COFF target"); 2424 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2425 if (GV->hasDLLImportStorageClass()) 2426 TargetFlags = ARMII::MO_DLLIMPORT; 2427 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2428 TargetFlags = ARMII::MO_COFFSTUB; 2429 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2430 TargetFlags); 2431 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2432 Callee = 2433 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2434 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2435 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2436 } else { 2437 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2438 } 2439 } 2440 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2441 isDirect = true; 2442 // tBX takes a register source operand. 
2443 const char *Sym = S->getSymbol(); 2444 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2445 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2446 ARMConstantPoolValue *CPV = 2447 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2448 ARMPCLabelIndex, 4); 2449 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2450 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2451 Callee = DAG.getLoad( 2452 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2453 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2454 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2455 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2456 } else { 2457 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2458 } 2459 } 2460 2461 if (isCmseNSCall) { 2462 assert(!isARMFunc && !isDirect && 2463 "Cannot handle call to ARM function or direct call"); 2464 if (NumBytes > 0) { 2465 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), 2466 "call to non-secure function would " 2467 "require passing arguments on stack", 2468 dl.getDebugLoc()); 2469 DAG.getContext()->diagnose(Diag); 2470 } 2471 if (isStructRet) { 2472 DiagnosticInfoUnsupported Diag( 2473 DAG.getMachineFunction().getFunction(), 2474 "call to non-secure function would return value through pointer", 2475 dl.getDebugLoc()); 2476 DAG.getContext()->diagnose(Diag); 2477 } 2478 } 2479 2480 // FIXME: handle tail calls differently. 
2481 unsigned CallOpc; 2482 if (Subtarget->isThumb()) { 2483 if (isCmseNSCall) 2484 CallOpc = ARMISD::tSECALL; 2485 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2486 CallOpc = ARMISD::CALL_NOLINK; 2487 else 2488 CallOpc = ARMISD::CALL; 2489 } else { 2490 if (!isDirect && !Subtarget->hasV5TOps()) 2491 CallOpc = ARMISD::CALL_NOLINK; 2492 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2493 // Emit regular call when code size is the priority 2494 !Subtarget->hasMinSize()) 2495 // "mov lr, pc; b _foo" to avoid confusing the RSP 2496 CallOpc = ARMISD::CALL_NOLINK; 2497 else 2498 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2499 } 2500 2501 std::vector<SDValue> Ops; 2502 Ops.push_back(Chain); 2503 Ops.push_back(Callee); 2504 2505 // Add argument registers to the end of the list so that they are known live 2506 // into the call. 2507 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2508 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2509 RegsToPass[i].second.getValueType())); 2510 2511 // Add a register mask operand representing the call-preserved registers. 
2512 if (!isTailCall) { 2513 const uint32_t *Mask; 2514 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2515 if (isThisReturn) { 2516 // For 'this' returns, use the R0-preserving mask if applicable 2517 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2518 if (!Mask) { 2519 // Set isThisReturn to false if the calling convention is not one that 2520 // allows 'returned' to be modeled in this way, so LowerCallResult does 2521 // not try to pass 'this' straight through 2522 isThisReturn = false; 2523 Mask = ARI->getCallPreservedMask(MF, CallConv); 2524 } 2525 } else 2526 Mask = ARI->getCallPreservedMask(MF, CallConv); 2527 2528 assert(Mask && "Missing call preserved mask for calling convention"); 2529 Ops.push_back(DAG.getRegisterMask(Mask)); 2530 } 2531 2532 if (InFlag.getNode()) 2533 Ops.push_back(InFlag); 2534 2535 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2536 if (isTailCall) { 2537 MF.getFrameInfo().setHasTailCall(); 2538 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2539 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2540 return Ret; 2541 } 2542 2543 // Returns a chain and a flag for retval copy to use. 2544 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2545 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2546 InFlag = Chain.getValue(1); 2547 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2548 2549 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2550 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2551 if (!Ins.empty()) 2552 InFlag = Chain.getValue(1); 2553 2554 // Handle result values, copying them out of physregs into vregs that we 2555 // return. 2556 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 2557 InVals, isThisReturn, 2558 isThisReturn ? OutVals[0] : SDValue()); 2559 } 2560 2561 /// HandleByVal - Every parameter *after* a byval parameter is passed 2562 /// on the stack. 
Remember the next parameter register to allocate, 2563 /// and then confiscate the rest of the parameter registers to insure 2564 /// this. 2565 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, 2566 unsigned Align) const { 2567 // Byval (as with any stack) slots are always at least 4 byte aligned. 2568 Align = std::max(Align, 4U); 2569 2570 unsigned Reg = State->AllocateReg(GPRArgRegs); 2571 if (!Reg) 2572 return; 2573 2574 unsigned AlignInRegs = Align / 4; 2575 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; 2576 for (unsigned i = 0; i < Waste; ++i) 2577 Reg = State->AllocateReg(GPRArgRegs); 2578 2579 if (!Reg) 2580 return; 2581 2582 unsigned Excess = 4 * (ARM::R4 - Reg); 2583 2584 // Special case when NSAA != SP and parameter size greater than size of 2585 // all remained GPR regs. In that case we can't split parameter, we must 2586 // send it to stack. We also must set NCRN to R4, so waste all 2587 // remained registers. 2588 const unsigned NSAAOffset = State->getNextStackOffset(); 2589 if (NSAAOffset != 0 && Size > Excess) { 2590 while (State->AllocateReg(GPRArgRegs)) 2591 ; 2592 return; 2593 } 2594 2595 // First register for byval parameter is the first register that wasn't 2596 // allocated before this method call, so it would be "reg". 2597 // If parameter is small enough to be saved in range [reg, r4), then 2598 // the end (first after last) register would be reg + param-size-in-regs, 2599 // else parameter would be splitted between registers and stack, 2600 // end register would be r4 in this case. 2601 unsigned ByValRegBegin = Reg; 2602 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2603 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2604 // Note, first register is allocated in the beginning of function already, 2605 // allocate remained amount of registers we need. 
2606 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2607 State->AllocateReg(GPRArgRegs); 2608 // A byval parameter that is split between registers and memory needs its 2609 // size truncated here. 2610 // In the case where the entire structure fits in registers, we set the 2611 // size in memory to zero. 2612 Size = std::max<int>(Size - Excess, 0); 2613 } 2614 2615 /// MatchingStackOffset - Return true if the given stack call argument is 2616 /// already available in the same position (relatively) of the caller's 2617 /// incoming argument stack. 2618 static 2619 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2620 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2621 const TargetInstrInfo *TII) { 2622 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2623 int FI = std::numeric_limits<int>::max(); 2624 if (Arg.getOpcode() == ISD::CopyFromReg) { 2625 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2626 if (!Register::isVirtualRegister(VR)) 2627 return false; 2628 MachineInstr *Def = MRI->getVRegDef(VR); 2629 if (!Def) 2630 return false; 2631 if (!Flags.isByVal()) { 2632 if (!TII->isLoadFromStackSlot(*Def, FI)) 2633 return false; 2634 } else { 2635 return false; 2636 } 2637 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2638 if (Flags.isByVal()) 2639 // ByVal argument is passed in as a pointer but it's now being 2640 // dereferenced. e.g. 
2641 // define @foo(%struct.X* %A) { 2642 // tail call @bar(%struct.X* byval %A) 2643 // } 2644 return false; 2645 SDValue Ptr = Ld->getBasePtr(); 2646 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2647 if (!FINode) 2648 return false; 2649 FI = FINode->getIndex(); 2650 } else 2651 return false; 2652 2653 assert(FI != std::numeric_limits<int>::max()); 2654 if (!MFI.isFixedObjectIndex(FI)) 2655 return false; 2656 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2657 } 2658 2659 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2660 /// for tail call optimization. Targets which want to do tail call 2661 /// optimization should implement this function. 2662 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2663 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2664 bool isCalleeStructRet, bool isCallerStructRet, 2665 const SmallVectorImpl<ISD::OutputArg> &Outs, 2666 const SmallVectorImpl<SDValue> &OutVals, 2667 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2668 const bool isIndirect) const { 2669 MachineFunction &MF = DAG.getMachineFunction(); 2670 const Function &CallerF = MF.getFunction(); 2671 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2672 2673 assert(Subtarget->supportsTailCall()); 2674 2675 // Indirect tail calls cannot be optimized for Thumb1 if the args 2676 // to the call take up r0-r3. The reason is that there are no legal registers 2677 // left to hold the pointer to the function to be called. 2678 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2679 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) 2680 return false; 2681 2682 // Look for obvious safe cases to perform tail call optimization that do not 2683 // require ABI changes. This is what gcc calls sibcall. 2684 2685 // Exception-handling functions need a special set of instructions to indicate 2686 // a return to the hardware. 
Tail-calling another function would probably 2687 // break this. 2688 if (CallerF.hasFnAttribute("interrupt")) 2689 return false; 2690 2691 // Also avoid sibcall optimization if either caller or callee uses struct 2692 // return semantics. 2693 if (isCalleeStructRet || isCallerStructRet) 2694 return false; 2695 2696 // Externally-defined functions with weak linkage should not be 2697 // tail-called on ARM when the OS does not support dynamic 2698 // pre-emption of symbols, as the AAELF spec requires normal calls 2699 // to undefined weak functions to be replaced with a NOP or jump to the 2700 // next instruction. The behaviour of branch instructions in this 2701 // situation (as used for tail calls) is implementation-defined, so we 2702 // cannot rely on the linker replacing the tail call with a return. 2703 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2704 const GlobalValue *GV = G->getGlobal(); 2705 const Triple &TT = getTargetMachine().getTargetTriple(); 2706 if (GV->hasExternalWeakLinkage() && 2707 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2708 return false; 2709 } 2710 2711 // Check that the call results are passed in the same way. 2712 LLVMContext &C = *DAG.getContext(); 2713 if (!CCState::resultsCompatible( 2714 getEffectiveCallingConv(CalleeCC, isVarArg), 2715 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, 2716 CCAssignFnForReturn(CalleeCC, isVarArg), 2717 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) 2718 return false; 2719 // The callee has to preserve all registers the caller needs to preserve. 
2720 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2721 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2722 if (CalleeCC != CallerCC) { 2723 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2724 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2725 return false; 2726 } 2727 2728 // If Caller's vararg or byval argument has been split between registers and 2729 // stack, do not perform tail call, since part of the argument is in caller's 2730 // local frame. 2731 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2732 if (AFI_Caller->getArgRegsSaveSize()) 2733 return false; 2734 2735 // If the callee takes no arguments then go on to check the results of the 2736 // call. 2737 if (!Outs.empty()) { 2738 // Check if stack adjustment is needed. For now, do not do this if any 2739 // argument is passed on the stack. 2740 SmallVector<CCValAssign, 16> ArgLocs; 2741 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2742 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2743 if (CCInfo.getNextStackOffset()) { 2744 // Check if the arguments are already laid out in the right way as 2745 // the caller's fixed stack objects. 2746 MachineFrameInfo &MFI = MF.getFrameInfo(); 2747 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2748 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2749 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2750 i != e; 2751 ++i, ++realArgIdx) { 2752 CCValAssign &VA = ArgLocs[i]; 2753 EVT RegVT = VA.getLocVT(); 2754 SDValue Arg = OutVals[realArgIdx]; 2755 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2756 if (VA.getLocInfo() == CCValAssign::Indirect) 2757 return false; 2758 if (VA.needsCustom()) { 2759 // f64 and vector types are split into multiple registers or 2760 // register/stack-slot combinations. 
The types will not match 2761 // the registers; give up on memory f64 refs until we figure 2762 // out what to do about this. 2763 if (!VA.isRegLoc()) 2764 return false; 2765 if (!ArgLocs[++i].isRegLoc()) 2766 return false; 2767 if (RegVT == MVT::v2f64) { 2768 if (!ArgLocs[++i].isRegLoc()) 2769 return false; 2770 if (!ArgLocs[++i].isRegLoc()) 2771 return false; 2772 } 2773 } else if (!VA.isRegLoc()) { 2774 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2775 MFI, MRI, TII)) 2776 return false; 2777 } 2778 } 2779 } 2780 2781 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2782 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2783 return false; 2784 } 2785 2786 return true; 2787 } 2788 2789 bool 2790 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2791 MachineFunction &MF, bool isVarArg, 2792 const SmallVectorImpl<ISD::OutputArg> &Outs, 2793 LLVMContext &Context) const { 2794 SmallVector<CCValAssign, 16> RVLocs; 2795 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2796 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2797 } 2798 2799 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2800 const SDLoc &DL, SelectionDAG &DAG) { 2801 const MachineFunction &MF = DAG.getMachineFunction(); 2802 const Function &F = MF.getFunction(); 2803 2804 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2805 2806 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2807 // version of the "preferred return address". These offsets affect the return 2808 // instruction if this is a return from PL1 without hypervisor extensions. 2809 // IRQ/FIQ: +4 "subs pc, lr, #4" 2810 // SWI: 0 "subs pc, lr, #0" 2811 // ABORT: +4 "subs pc, lr, #4" 2812 // UNDEF: +4/+2 "subs pc, lr, #0" 2813 // UNDEF varies depending on where the exception came from ARM or Thumb 2814 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 
2815 2816 int64_t LROffset; 2817 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2818 IntKind == "ABORT") 2819 LROffset = 4; 2820 else if (IntKind == "SWI" || IntKind == "UNDEF") 2821 LROffset = 0; 2822 else 2823 report_fatal_error("Unsupported interrupt attribute. If present, value " 2824 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2825 2826 RetOps.insert(RetOps.begin() + 1, 2827 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2828 2829 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2830 } 2831 2832 SDValue 2833 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2834 bool isVarArg, 2835 const SmallVectorImpl<ISD::OutputArg> &Outs, 2836 const SmallVectorImpl<SDValue> &OutVals, 2837 const SDLoc &dl, SelectionDAG &DAG) const { 2838 // CCValAssign - represent the assignment of the return value to a location. 2839 SmallVector<CCValAssign, 16> RVLocs; 2840 2841 // CCState - Info about the registers and stack slots. 2842 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2843 *DAG.getContext()); 2844 2845 // Analyze outgoing return values. 2846 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2847 2848 SDValue Flag; 2849 SmallVector<SDValue, 4> RetOps; 2850 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2851 bool isLittleEndian = Subtarget->isLittle(); 2852 2853 MachineFunction &MF = DAG.getMachineFunction(); 2854 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2855 AFI->setReturnRegsCount(RVLocs.size()); 2856 2857 // Report error if cmse entry function returns structure through first ptr arg. 2858 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { 2859 // Note: using an empty SDLoc(), as the first line of the function is a 2860 // better place to report than the last line. 
2861 DiagnosticInfoUnsupported Diag( 2862 DAG.getMachineFunction().getFunction(), 2863 "secure entry function would return value through pointer", 2864 SDLoc().getDebugLoc()); 2865 DAG.getContext()->diagnose(Diag); 2866 } 2867 2868 // Copy the result values into the output registers. 2869 for (unsigned i = 0, realRVLocIdx = 0; 2870 i != RVLocs.size(); 2871 ++i, ++realRVLocIdx) { 2872 CCValAssign &VA = RVLocs[i]; 2873 assert(VA.isRegLoc() && "Can only return in registers!"); 2874 2875 SDValue Arg = OutVals[realRVLocIdx]; 2876 bool ReturnF16 = false; 2877 2878 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2879 // Half-precision return values can be returned like this: 2880 // 2881 // t11 f16 = fadd ... 2882 // t12: i16 = bitcast t11 2883 // t13: i32 = zero_extend t12 2884 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2885 // 2886 // to avoid code generation for bitcasts, we simply set Arg to the node 2887 // that produces the f16 value, t11 in this case. 2888 // 2889 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2890 SDValue ZE = Arg.getOperand(0); 2891 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2892 SDValue BC = ZE.getOperand(0); 2893 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2894 Arg = BC.getOperand(0); 2895 ReturnF16 = true; 2896 } 2897 } 2898 } 2899 } 2900 2901 switch (VA.getLocInfo()) { 2902 default: llvm_unreachable("Unknown loc info!"); 2903 case CCValAssign::Full: break; 2904 case CCValAssign::BCvt: 2905 if (!ReturnF16) 2906 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2907 break; 2908 } 2909 2910 if (VA.needsCustom()) { 2911 if (VA.getLocVT() == MVT::v2f64) { 2912 // Extract the first half and return it in two registers. 
2913 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2914 DAG.getConstant(0, dl, MVT::i32)); 2915 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2916 DAG.getVTList(MVT::i32, MVT::i32), Half); 2917 2918 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2919 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2920 Flag); 2921 Flag = Chain.getValue(1); 2922 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2923 VA = RVLocs[++i]; // skip ahead to next loc 2924 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2925 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2926 Flag); 2927 Flag = Chain.getValue(1); 2928 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2929 VA = RVLocs[++i]; // skip ahead to next loc 2930 2931 // Extract the 2nd half and fall through to handle it as an f64 value. 2932 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2933 DAG.getConstant(1, dl, MVT::i32)); 2934 } 2935 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2936 // available. 2937 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2938 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2939 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2940 fmrrd.getValue(isLittleEndian ? 0 : 1), 2941 Flag); 2942 Flag = Chain.getValue(1); 2943 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2944 VA = RVLocs[++i]; // skip ahead to next loc 2945 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2946 fmrrd.getValue(isLittleEndian ? 1 : 0), 2947 Flag); 2948 } else 2949 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2950 2951 // Guarantee that all emitted copies are 2952 // stuck together, avoiding something bad. 2953 Flag = Chain.getValue(1); 2954 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2955 ReturnF16 ? 
MVT::f16 : VA.getLocVT())); 2956 } 2957 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2958 const MCPhysReg *I = 2959 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2960 if (I) { 2961 for (; *I; ++I) { 2962 if (ARM::GPRRegClass.contains(*I)) 2963 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2964 else if (ARM::DPRRegClass.contains(*I)) 2965 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2966 else 2967 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2968 } 2969 } 2970 2971 // Update chain and glue. 2972 RetOps[0] = Chain; 2973 if (Flag.getNode()) 2974 RetOps.push_back(Flag); 2975 2976 // CPUs which aren't M-class use a special sequence to return from 2977 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2978 // though we use "subs pc, lr, #N"). 2979 // 2980 // M-class CPUs actually use a normal return sequence with a special 2981 // (hardware-provided) value in LR, so the normal code path works. 2982 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 2983 !Subtarget->isMClass()) { 2984 if (Subtarget->isThumb1Only()) 2985 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2986 return LowerInterruptReturn(RetOps, dl, DAG); 2987 } 2988 2989 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : 2990 ARMISD::RET_FLAG; 2991 return DAG.getNode(RetNode, dl, MVT::Other, RetOps); 2992 } 2993 2994 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2995 if (N->getNumValues() != 1) 2996 return false; 2997 if (!N->hasNUsesOfValue(1, 0)) 2998 return false; 2999 3000 SDValue TCChain = Chain; 3001 SDNode *Copy = *N->use_begin(); 3002 if (Copy->getOpcode() == ISD::CopyToReg) { 3003 // If the copy has a glue operand, we conservatively assume it isn't safe to 3004 // perform a tail call. 
3005 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3006 return false; 3007 TCChain = Copy->getOperand(0); 3008 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 3009 SDNode *VMov = Copy; 3010 // f64 returned in a pair of GPRs. 3011 SmallPtrSet<SDNode*, 2> Copies; 3012 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3013 UI != UE; ++UI) { 3014 if (UI->getOpcode() != ISD::CopyToReg) 3015 return false; 3016 Copies.insert(*UI); 3017 } 3018 if (Copies.size() > 2) 3019 return false; 3020 3021 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3022 UI != UE; ++UI) { 3023 SDValue UseChain = UI->getOperand(0); 3024 if (Copies.count(UseChain.getNode())) 3025 // Second CopyToReg 3026 Copy = *UI; 3027 else { 3028 // We are at the top of this chain. 3029 // If the copy has a glue operand, we conservatively assume it 3030 // isn't safe to perform a tail call. 3031 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 3032 return false; 3033 // First CopyToReg 3034 TCChain = UseChain; 3035 } 3036 } 3037 } else if (Copy->getOpcode() == ISD::BITCAST) { 3038 // f32 returned in a single GPR. 3039 if (!Copy->hasOneUse()) 3040 return false; 3041 Copy = *Copy->use_begin(); 3042 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 3043 return false; 3044 // If the copy has a glue operand, we conservatively assume it isn't safe to 3045 // perform a tail call. 
    // (f32-in-GPR case, continued) If the CopyToReg is glued we conservatively
    // assume it isn't safe to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // Every user of the final copy must be a return node; otherwise the value
  // escapes past the return and we cannot fold it into a tail call.
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

/// Return true if the call may be lowered as a tail call: the subtarget must
/// support tail calls and the call instruction itself must be marked "tail".
bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  if (!CI->isTailCall())
    return false;

  return true;
}

// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  // Split the i64 write value into its low (element 0) and high (element 1)
  // i32 halves and rebuild the WRITE_REGISTER node with both halves.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    // Create a private, internal-linkage global holding the constant and
    // lower the reference as a plain global address instead.
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                  );
    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res =
        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

/// ARM emits jump tables inline with the function body.
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}

/// Lower a BlockAddress reference through a constant-pool load; in PIC/ROPI
/// mode the loaded value is additionally adjusted with a PIC_ADD label.
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  } else {
    // PC adjustment accounts for the pipeline offset of the PC read
    // (4 bytes in Thumb, 8 in ARM mode).
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}

/// Lower a TLS address for Windows on ARM: read the TEB via an MRC coprocessor
/// read, walk TEB -> TLS array -> per-module TLS block, then add the
/// section-relative (SECREL) offset of the variable.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  // via MRC p15, 0, <Rt>, c13, c0, 2 (the thread-ID register read).
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  // Slot = _tls_index * 4 (shift left by 2).
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain,
      DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                  DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // PC adjustment for the PIC label (pipeline offset: 4 Thumb / 8 ARM).
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  // Build the TLSGD constant-pool entry, load it, and PC-relative adjust it
  // to form the argument for __tls_get_addr.
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    // First load: fetch the GOTTPOFF entry address from the constant pool.
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: read the actual thread-pointer offset through the GOT.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    // The TPOFF offset is known at link time; a single constant-pool load
    // suffices.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

/// Dispatch a GlobalTLSAddress to the platform/TLS-model specific lowering
/// routine (emulated TLS, Darwin, Windows, or the ELF GD/IE/LE models).
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());

  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}

/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    // ConstantExprs are transparent: look through them at their users.
    if (isa<ConstantExpr>(U)) {
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    // A non-instruction user, or an instruction in a different function,
    // means V escapes F.
    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

/// Try to emit a small constant global directly into the constant pool
/// instead of referencing it through an indirection. Returns the wrapped
/// constant-pool address on success, or an empty SDValue to fall back to
/// normal global-address lowering.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  // Only local, constant, unnamed_addr globals with an initializer qualify.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  unsigned PrefAlign = DAG.getDataLayout().getPreferredAlignment(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    // First promotion of this global: record it and account for the pool
    // size increase (anything beyond the 4 bytes a pointer would have cost).
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}

/// Return true if GV (looking through aliases) is a constant global variable
/// or a function, i.e. its storage is read-only.
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getBaseObject()))
      return false;
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  // Function code is read-only too.
  return isa<Function>(GV);
}

/// Dispatch GlobalAddress lowering by object-file format.
SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  switch (Subtarget->getTargetTriple().getObjectFormat()) {
  default: llvm_unreachable("unknown object format");
  case Triple::COFF:
    return LowerGlobalAddressWindows(Op, DAG);
  case Triple::ELF:
    return LowerGlobalAddressELF(Op, DAG);
  case Triple::MachO:
    return LowerGlobalAddressDarwin(Op, DAG);
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    // Non-DSO-local globals go through the GOT (GOT_PREL); local ones are
    // referenced PC-relatively via WrapperPIC.
    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                           UseGOT_PREL ? ARMII::MO_GOT : 0);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (UseGOT_PREL)
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative: address = SB (static base, R9) + SBREL offset.
    SDValue RelAddr;
    if (Subtarget->useMovt()) {
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    // No movw/movt: load the address out of the constant pool.
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  unsigned Wrapper =
      isPositionIndependent() ?
      ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);

  // Indirect symbols need an extra load through the non-lazy pointer (GOT).
  if (Subtarget->isGVIndirectSymbol(GV))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  // dllimport'ed and non-DSO-local globals are reached through a stub
  // pointer (__imp_* / .refptr.*) and need an extra load below.
  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  if (GV->hasDLLImportStorageClass())
    TargetFlags = ARMII::MO_DLLIMPORT;
  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    TargetFlags = ARMII::MO_COFFSTUB;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
                                                  TargetFlags));
  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

/// Lower llvm.eh.sjlj.setjmp to the ARM-specific setjmp pseudo.
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = DAG.getConstant(0, dl, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                     Op.getOperand(1), Val);
}

/// Lower llvm.eh.sjlj.longjmp to the ARM-specific longjmp pseudo.
SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
}

/// Lower llvm.eh.sjlj.setup.dispatch to its ARM pseudo node.
SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
                     Op.getOperand(0));
}

SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
    SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  // The intrinsic ID is operand 1 when a chain is present, else operand 0.
  unsigned IntNo =
      cast<ConstantSDNode>(
          Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
          ->getZExtValue();
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
3714 case Intrinsic::arm_gnu_eabi_mcount: { 3715 MachineFunction &MF = DAG.getMachineFunction(); 3716 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3717 SDLoc dl(Op); 3718 SDValue Chain = Op.getOperand(0); 3719 // call "\01__gnu_mcount_nc" 3720 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3721 const uint32_t *Mask = 3722 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3723 assert(Mask && "Missing call preserved mask for calling convention"); 3724 // Mark LR an implicit live-in. 3725 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3726 SDValue ReturnAddress = 3727 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3728 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; 3729 SDValue Callee = 3730 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3731 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3732 if (Subtarget->isThumb()) 3733 return SDValue( 3734 DAG.getMachineNode( 3735 ARM::tBL_PUSHLR, dl, ResultTys, 3736 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3737 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3738 0); 3739 return SDValue( 3740 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3741 {ReturnAddress, Callee, RegisterMask, Chain}), 3742 0); 3743 } 3744 } 3745 } 3746 3747 SDValue 3748 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3749 const ARMSubtarget *Subtarget) const { 3750 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3751 SDLoc dl(Op); 3752 switch (IntNo) { 3753 default: return SDValue(); // Don't custom lower most intrinsics. 
3754 case Intrinsic::thread_pointer: { 3755 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3756 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3757 } 3758 case Intrinsic::arm_cls: { 3759 const SDValue &Operand = Op.getOperand(1); 3760 const EVT VTy = Op.getValueType(); 3761 SDValue SRA = 3762 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3763 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3764 SDValue SHL = 3765 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3766 SDValue OR = 3767 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3768 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3769 return Result; 3770 } 3771 case Intrinsic::arm_cls64: { 3772 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3773 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3774 const SDValue &Operand = Op.getOperand(1); 3775 const EVT VTy = Op.getValueType(); 3776 3777 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3778 DAG.getConstant(1, dl, VTy)); 3779 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3780 DAG.getConstant(0, dl, VTy)); 3781 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3782 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3783 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3784 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3785 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3786 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3787 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3788 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3789 SDValue CheckLo = 3790 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3791 SDValue HiIsZero = 3792 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3793 SDValue AdjustedLo = 3794 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3795 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, 
AdjustedLo); 3796 SDValue Result = 3797 DAG.getSelect(dl, VTy, CheckLo, 3798 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3799 return Result; 3800 } 3801 case Intrinsic::eh_sjlj_lsda: { 3802 MachineFunction &MF = DAG.getMachineFunction(); 3803 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3804 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3805 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3806 SDValue CPAddr; 3807 bool IsPositionIndependent = isPositionIndependent(); 3808 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3809 ARMConstantPoolValue *CPV = 3810 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3811 ARMCP::CPLSDA, PCAdj); 3812 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3813 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3814 SDValue Result = DAG.getLoad( 3815 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3816 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3817 3818 if (IsPositionIndependent) { 3819 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3820 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3821 } 3822 return Result; 3823 } 3824 case Intrinsic::arm_neon_vabs: 3825 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3826 Op.getOperand(1)); 3827 case Intrinsic::arm_neon_vmulls: 3828 case Intrinsic::arm_neon_vmullu: { 3829 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3830 ? ARMISD::VMULLs : ARMISD::VMULLu; 3831 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3832 Op.getOperand(1), Op.getOperand(2)); 3833 } 3834 case Intrinsic::arm_neon_vminnm: 3835 case Intrinsic::arm_neon_vmaxnm: { 3836 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3837 ? 
ISD::FMINNUM : ISD::FMAXNUM; 3838 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3839 Op.getOperand(1), Op.getOperand(2)); 3840 } 3841 case Intrinsic::arm_neon_vminu: 3842 case Intrinsic::arm_neon_vmaxu: { 3843 if (Op.getValueType().isFloatingPoint()) 3844 return SDValue(); 3845 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3846 ? ISD::UMIN : ISD::UMAX; 3847 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3848 Op.getOperand(1), Op.getOperand(2)); 3849 } 3850 case Intrinsic::arm_neon_vmins: 3851 case Intrinsic::arm_neon_vmaxs: { 3852 // v{min,max}s is overloaded between signed integers and floats. 3853 if (!Op.getValueType().isFloatingPoint()) { 3854 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3855 ? ISD::SMIN : ISD::SMAX; 3856 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3857 Op.getOperand(1), Op.getOperand(2)); 3858 } 3859 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3860 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3861 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3862 Op.getOperand(1), Op.getOperand(2)); 3863 } 3864 case Intrinsic::arm_neon_vtbl1: 3865 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3866 Op.getOperand(1), Op.getOperand(2)); 3867 case Intrinsic::arm_neon_vtbl2: 3868 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3869 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3870 case Intrinsic::arm_mve_pred_i2v: 3871 case Intrinsic::arm_mve_pred_v2i: 3872 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 3873 Op.getOperand(1)); 3874 case Intrinsic::arm_mve_vreinterpretq: 3875 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), 3876 Op.getOperand(1)); 3877 case Intrinsic::arm_mve_lsll: 3878 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), 3879 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3880 case Intrinsic::arm_mve_asrl: 3881 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), 3882 
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}

/// Lower ISD::ATOMIC_FENCE to the appropriate ARM barrier.
/// Operand 0 is the chain, operand 1 the atomic ordering, and operand 2 the
/// synchronization scope, both encoded as constants.
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  if (SSID == SyncScope::SingleThread)
    // A single-thread fence needs no hardware barrier at all.
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  // Operand 1 carries the atomic ordering; use it to pick a barrier domain.
  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  // Emit a DMB with the chosen domain via the arm.dmb intrinsic.
  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

/// Lower ISD::PREFETCH to an ARMISD::PRELOAD node when the target has
/// preload instructions; otherwise drop it and keep only the chain.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  // Operand 2 is the read/write flag (1 = write); invert so isRead is set
  // for read prefetches.
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW; without it, write prefetches are
    // dropped (chain preserved).
    return Op.getOperand(0);

  // Operand 4 selects data vs. instruction cache prefetch.
  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}

/// Lower ISD::VASTART: store the address of the VarArgsFrameIndex slot into
/// the va_list location given by operand 1.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  // Operand 2 carries the IR pointer value; used here for the pointer info
  // attached to the store.
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

/// Reassemble an f64 formal argument that the calling convention split into
/// two 32-bit locations (VA and NextVA). The first half is always in a GPR;
/// the second half may be in a GPR or on the stack. The halves are combined
/// with ARMISD::VMOVDRR, swapped first on big-endian targets.
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Thumb1-only functions are restricted to the low GPRs.
  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // On big-endian targets the two halves arrive in the opposite order.
  if (!Subtarget->isLittle())
    std::swap(ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval).  Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: the frame index the registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases possible:
  // Case #1. Non-var-args function, and we meet first byval parameter.
  //          Setup first unallocated register as first byval register;
  //          consume all remaining registers
  //          (these two actions are performed by HandleByVal method).
  //          Then, here, we initialize stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function, that doesn't contain byval parameters.
  //          The same: consume all remaining unallocated registers,
  //          initialize stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    // A byval parameter: use the register range recorded for it.
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    // Varargs: spill every argument register that is still unallocated.
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ?
                 (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  // If any registers are actually spilled, place them immediately below the
  // caller-provided argument area (negative offset from it).
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  // Store each live-in argument register to consecutive 4-byte slots.
  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  // Tie all the stores into the chain so they cannot be reordered away.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack-frame area that the va_list pointer will point into.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no registers to be stored, just point past the last
  // argument passed via stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(),
                                  std::max(4U, TotalArgRegsSaveSize));
  AFI->setVarArgsFrameIndex(FrameIndex);
}

/// Lower the incoming (formal) arguments described by Ins, filling InVals
/// with the legalized value for each argument and returning the updated
/// chain.
SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet byval parameter.
  // We also increase this value in case of varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
  // We need to know this before we allocate the first byval or variadic
  // argument, as they will be allocated a stack slot below the CFA (Canonical
  // Frame Address, the stack pointer at entry to the function).
  // Find the lowest argument register that has to be saved, first looking at
  // byval parameters, then (below) at varargs.
  unsigned ArgRegBegin = ARM::R4;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
      break;

    CCValAssign &VA = ArgLocs[i];
    unsigned Index = VA.getValNo();
    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
    if (!Flags.isByVal())
      continue;

    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
    unsigned RBegin, REnd;
    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
    ArgRegBegin = std::min(ArgRegBegin, RBegin);

    CCInfo.nextInRegsParam();
  }
  CCInfo.rewindByValRegsInfo();

  int lastInsIndex = -1;
  if (isVarArg && MFI.hasVAStart()) {
    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    if (RegIdx != array_lengthof(GPRArgRegs))
      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  }

  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    if (Ins[VA.getValNo()].isOrigArg()) {
      // Keep CurOrigArg in sync with the IR argument this location maps to.
      std::advance(CurOrigArg,
                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
    }
    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();

      if (VA.needsCustom()) {
        // f64 and vector types are split up into multiple registers or
        // combinations of registers and stack slots.
        if (VA.getLocVT() == MVT::v2f64) {
          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                   Chain, DAG, dl);
          VA = ArgLocs[++i]; // skip ahead to next loc
          SDValue ArgValue2;
          if (VA.isMemLoc()) {
            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(
                                        DAG.getMachineFunction(), FI));
          } else {
            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                             Chain, DAG, dl);
          }
          // Rebuild the v2f64 from its two f64 halves.
          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue1,
                                 DAG.getIntPtrConstant(0, dl));
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue2,
                                 DAG.getIntPtrConstant(1, dl));
        } else
          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
      } else {
        // Select a register class matching the location type.
        const TargetRegisterClass *RC;

        if (RegVT == MVT::f16)
          RC = &ARM::HPRRegClass;
        else if (RegVT == MVT::f32)
          RC = &ARM::SPRRegClass;
        else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
          RC = &ARM::DPRRegClass;
        else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
          RC = &ARM::QPRRegClass;
        else if (RegVT == MVT::i32)
          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

        // If this value is passed in r0 and has the returned attribute (e.g.
        // C++ 'structors), record this fact for later use.
        if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
          AFI->setPreservesR0();
        }
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);
    } else { // VA.isRegLoc()
      // sanity check
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = VA.getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex)
      {
        ISD::ArgFlagsTy Flags = Ins[index].Flags;
        // FIXME: For now, all byval parameter objects are marked mutable.
        // This can be changed with more analysis.
        // In case of tail call optimization mark all arguments mutable.
        // Since they could be overwritten by lowering of arguments in case of
        // a tail call.
        if (Flags.isByVal()) {
          assert(Ins[index].isOrigArg() &&
                 "Byval arguments cannot be implicit");
          unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

          // Spill the byval registers (if any) next to the caller-provided
          // stack portion and hand back the resulting frame index.
          int FrameIndex = StoreByValRegs(
              CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
              VA.getLocMemOffset(), Flags.getByValSize());
          InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
          CCInfo.nextInRegsParam();
        } else {
          unsigned FIOffset = VA.getLocMemOffset();
          int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                         FIOffset, true);

          // Create load nodes to retrieve arguments from the stack.
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                       MachinePointerInfo::getFixedStack(
                                           DAG.getMachineFunction(), FI)));
        }
        lastInsIndex = index;
      }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart())
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                         CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);

  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands. The flag-setting node is returned and the selected
/// ARM condition code is written to ARMcc.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <->  x <= C-1  (guard against signed-min wraparound).
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  // Thumb1 has very limited immediate modes, so turning an "and" into a
  // shift can save multiple instructions.
  //
  // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  // into "((x << n) >> n)". But that isn't necessarily profitable on its
  // own. If it's the operand to an unsigned comparison with an immediate,
  // we can eliminate one of the shifts: we transform
  // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  //
  // We avoid transforming cases which aren't profitable due to encoding
  // details:
  //
  // 1. C2 fits into the immediate field of a cmp, and the transformed version
  // would not; in that case, we're essentially trading one immediate load for
  // another.
  // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  // 3. C2 is zero; we have other code for this special case.
  //
  // FIXME: Figure out profitability for Thumb2; we usually can't save an
  // instruction, since the AND is always one instruction anyway, but we could
  // use narrow instructions in some cases.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
      LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
      !isSignedIntSetCC(CC)) {
    unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
    auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
    uint64_t RHSV = RHSC->getZExtValue();
    if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
      unsigned ShiftBits = countLeadingZeros(Mask);
      if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
        SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
        LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
        RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
      }
    }
  }

  // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
  // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
  // way a cmp would.
  // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  // some tweaks to the heuristics for the previous and->shift transform.
  // FIXME: Optimize cases where the LHS isn't a shift.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                LHS.getOperand(0),
                                DAG.getConstant(ShiftAmt, dl, MVT::i32));
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default: break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool Signaling) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  // A compare against +0.0 can use the single-operand CMPFP*w0 form.
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
                      dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
                      dl, MVT::Glue, LHS);
  // FMSTAT transfers the VFP status flags to CPSR.
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1));

  // Otherwise it must be an FMSTAT of a VFP compare; duplicate the compare
  // underneath and re-wrap it in a fresh FMSTAT.
  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    // No signed overflow iff V clear after comparing the sum with an addend.
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::UADDO:
    // No unsigned overflow iff sum >= addend (HS).
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
    // We do not use it in the USUBO case as Value may not be used.
    Value = DAG.getNode(ARMISD::ADDC, dl,
                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
                .getValue(0);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::SSUBO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::USUBO:
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::UMULO:
    // We generate a UMUL_LOHI and then check if the high word is 0.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::UMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getConstant(0, dl, MVT::i32));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  case ISD::SMULO:
    // We generate a SMUL_LOHI and then check if all the bits of the high word
    // are the same as the sign bit of the low word.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::SMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                                          Value.getValue(0),
                                          DAG.getConstant(31, dl, MVT::i32)));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  } // switch (...)

  return std::make_pair(Value, OverflowCmp);
}

/// Lower a signed-overflow arithmetic node (SADDO/SSUBO and friends) into
/// the operation plus a 0/1 overflow result derived from the flags.
SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue Value, OverflowCmp;
  SDValue ARMcc;
  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  // ARMcc is the *no overflow* condition, so the CMOV yields 0 (FVal) when
  // it holds and 1 (TVal) on overflow.
  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

/// Convert a 0/1 boolean carry into the hardware carry flag by computing
/// BoolCarry - 1 with ARMISD::SUBC and returning the flag result.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  EVT CarryVT = BoolCarry.getValueType();

  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
                              DAG.getVTList(CarryVT, MVT::i32),
                              BoolCarry, DAG.getConstant(1, DL, CarryVT));
  return Carry.getValue(1);
}

/// Materialize the carry flag as a 0/1 value of type VT.
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);

  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD:ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

/// Lower an unsigned-overflow arithmetic node (UADDO/USUBO) using the
/// carry-producing ADDC/SUBC nodes.
SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

/// Lower signed saturating add/sub on i8/i16 to the DSP-extension
/// QADD8b/QSUB8b/QADD16b/QSUB16b nodes, when the target supports them.
static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  // These nodes need the DSP extension on an ARMv6+ core.
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();
  if (!VT.isSimple())
    return SDValue();

  unsigned NewOpcode;
  bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::i8:
    NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
    break;
  case MVT::i16:
    NewOpcode = IsAdd ?
                ARMISD::QADD16b : ARMISD::QSUB16b;
    break;
  }

  SDLoc dl(Op);
  // The DSP nodes operate on i32, so sign-extend the inputs and truncate
  // the (already saturated) result back to the original type.
  SDValue Add =
      DAG.getNode(NewOpcode, dl, MVT::i32,
                  DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
                  DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
}

SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  // Selecting on the overflow result of an arithmetic node: reuse the flags
  // set by the operation itself rather than materializing a boolean first.
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
        dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
        dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode())
{ 4725 EVT VT = Op.getValueType(); 4726 SDValue ARMcc = Cond.getOperand(2); 4727 SDValue CCR = Cond.getOperand(3); 4728 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4729 assert(True.getValueType() == VT); 4730 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4731 } 4732 } 4733 } 4734 4735 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4736 // undefined bits before doing a full-word comparison with zero. 4737 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4738 DAG.getConstant(1, dl, Cond.getValueType())); 4739 4740 return DAG.getSelectCC(dl, Cond, 4741 DAG.getConstant(0, dl, Cond.getValueType()), 4742 SelectTrue, SelectFalse, ISD::SETNE); 4743 } 4744 4745 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4746 bool &swpCmpOps, bool &swpVselOps) { 4747 // Start by selecting the GE condition code for opcodes that return true for 4748 // 'equality' 4749 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4750 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4751 CondCode = ARMCC::GE; 4752 4753 // and GT for opcodes that return false for 'equality'. 4754 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4755 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4756 CondCode = ARMCC::GT; 4757 4758 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4759 // to swap the compare operands. 4760 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4761 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4762 swpCmpOps = true; 4763 4764 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4765 // If we have an unordered opcode, we need to swap the operands to the VSEL 4766 // instruction (effectively negating the condition). 4767 // 4768 // This also has the effect of swapping which one of 'less' or 'greater' 4769 // returns true, so we also swap the compare operands. 
It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}

/// Build an ARMISD::CMOV selecting between FalseVal and TrueVal on the ARM
/// condition ARMcc. When the subtarget has no double-precision registers
/// (!hasFP64()), an f64 select is split into two i32 CMOVs over the VMOVRRD
/// halves; the second CMOV gets a duplicated compare because the glued flags
/// value only supports a single use.
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  if (!Subtarget->hasFP64() && VT == MVT::f64) {
    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);

    SDValue TrueLow = TrueVal.getValue(0);
    SDValue TrueHigh = TrueVal.getValue(1);
    SDValue FalseLow = FalseVal.getValue(0);
    SDValue FalseHigh = FalseVal.getValue(1);

    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
                              ARMcc, CCR, Cmp);
    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
                               ARMcc, CCR, duplicateCmp(Cmp, DAG));

    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  } else {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                       Cmp);
  }
}

static bool isGTorGE(ISD::CondCode CC) {
  return CC == ISD::SETGT || CC == ISD::SETGE;
}

static bool isLTorLE(ISD::CondCode CC) {
  return CC == ISD::SETLT || CC == ISD::SETLE;
}

// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
// All of these conditions (and their <= and >= counterparts) will do:
//   x < k ? k : x
//   x > k ? x : k
//   k < x ? x : k
//   k > x ? k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}

// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1 is
// a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K and
// usat is set to true if the conditional represents an unsigned saturation
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                    uint64_t &K, bool &usat) {
  // Outer SELECT_CC: (LHS1 CC1 RHS1) ? TrueVal1 : FalseVal1.
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner conditional must be the non-constant select arm.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                        ? &RHS1
                                                        : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                        ? &RHS2
                                                        : nullptr;
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
  // must work with sign-extended values but the select operations return
  // the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;

  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
    return false;

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The conditional holding the larger constant must be the upper-bound
  // check, and the bound + 1 must be a power of 2 for SSAT/USAT encoding.
  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
      isPowerOf2_64(PosVal + 1)) {

    // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
    if (Val1 == ~Val2)
      usat = false;
    else if (NegVal == 0)
      usat = true;
    else
      return false;

    V = V2;
    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

    return true;
  }

  return false;
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
                                                     ? &RHS
                                                     : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

/// Return true if the given FP type is not natively supported by this
/// subtarget's FPU (callers then soften the operands or emit libcalls).
bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
// Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // Broadcast the sign bit: 0 for non-negative x, all-ones for negative x.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0) == x & ~(x >> 31)
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1) == x | (x >> 31)
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  // Use the v8.1-M CSINV/CSNEG/CSINC family when both select arms are
  // constants related by bitwise-not, negation, or +/-1.
  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR checking TVal is 0, possibly inverting the condition
      // to get there. CSINC is not invertible like the other two (~(~a) == a,
      // -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
      if (TVal == 0)
        TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does now support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the swaps if the resulting condition is one VSEL encodes.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  return ISD::isNormalLoad(N);
}

/// Reinterpret an f32 compare operand (a floating-point zero or a normal
/// load) as an i32 value without going through FP registers.
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlignment(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// Split an f64 compare operand (a floating-point zero or a normal load) into
/// two i32 values (low in RetVal1, high in RetVal2) by loading both halves.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask off the sign bit so that +0.0 and -0.0 compare equal.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // f64: compare the two 32-bit halves; only the high half carries the
    // sign bit, so only it is masked.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}

/// Lower ISD::BRCOND. Folds overflow intrinsics ({s|u}{add|sub|mul}.with.
/// overflow) feeding the branch condition into a branch on the inverted
/// overflow comparison; returns SDValue() when no such lowering applies.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}

SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    // Branch when the overflow test fails, i.e. (ovf != 1) or (ovf == 0).
    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMCC::CondCodes CondCode =
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  // Some FP condition codes need two ARM conditions; chain a second branch
  // glued to the first one's flag result so both consume the same compare.
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}

SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(),
PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Entries are 4 bytes each: Addr = Table + Index * 4.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI: the table holds offsets relative to the table base.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}

/// Custom-lower a vector FP-to-int conversion: widen half-precision sources
/// to an i32-element convert and truncate back, or unroll per element when no
/// direct form exists.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}

SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  // Strict FP nodes carry the chain in operand 0.
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    SDValue Result;
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ?
ISD::FP_TO_SINT 5525 : ISD::FP_TO_UINT, 5526 Loc, Op.getValueType(), SrcVal); 5527 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5528 } 5529 5530 return Op; 5531 } 5532 5533 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5534 EVT VT = Op.getValueType(); 5535 SDLoc dl(Op); 5536 5537 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5538 if (VT.getVectorElementType() == MVT::f32) 5539 return Op; 5540 return DAG.UnrollVectorOp(Op.getNode()); 5541 } 5542 5543 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5544 Op.getOperand(0).getValueType() == MVT::v8i16) && 5545 "Invalid type for custom lowering!"); 5546 5547 const bool HasFullFP16 = 5548 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5549 5550 EVT DestVecType; 5551 if (VT == MVT::v4f32) 5552 DestVecType = MVT::v4i32; 5553 else if (VT == MVT::v4f16 && HasFullFP16) 5554 DestVecType = MVT::v4i16; 5555 else if (VT == MVT::v8f16 && HasFullFP16) 5556 DestVecType = MVT::v8i16; 5557 else 5558 return DAG.UnrollVectorOp(Op.getNode()); 5559 5560 unsigned CastOpc; 5561 unsigned Opc; 5562 switch (Op.getOpcode()) { 5563 default: llvm_unreachable("Invalid opcode!"); 5564 case ISD::SINT_TO_FP: 5565 CastOpc = ISD::SIGN_EXTEND; 5566 Opc = ISD::SINT_TO_FP; 5567 break; 5568 case ISD::UINT_TO_FP: 5569 CastOpc = ISD::ZERO_EXTEND; 5570 Opc = ISD::UINT_TO_FP; 5571 break; 5572 } 5573 5574 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5575 return DAG.getNode(Opc, dl, VT, Op); 5576 } 5577 5578 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5579 EVT VT = Op.getValueType(); 5580 if (VT.isVector()) 5581 return LowerVectorINT_TO_FP(Op, DAG); 5582 if (isUnsupportedFloatingType(VT)) { 5583 RTLIB::Libcall LC; 5584 if (Op.getOpcode() == ISD::SINT_TO_FP) 5585 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5586 Op.getValueType()); 5587 else 5588 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 
                              Op.getValueType());
    MakeLibCallOptions CallOptions;
    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                       CallOptions, SDLoc(Op)).first;
  }

  return Op;
}

/// Lower FCOPYSIGN. When the magnitude operand is already on the vector
/// register bank and NEON is available, the sign bit is merged with a
/// mask/and/or sequence entirely on that bank; otherwise the sign bit is
/// transferred with plain i32 integer bit operations.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // If the magnitude came from a GPR (BITCAST/VMOVDRR), doing the merge on
  // the NEON side would just force extra cross-bank moves.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
               Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // Modified-immediate encoding of the sign-bit mask (see ARM_AM docs).
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // Shift the 32-bit lane mask up so the set bit lands on bit 63.
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      // f64 sign operand, f32 result: bring the f64 sign bit down to bit 31.
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Res = (Tmp1 & Mask) | (Tmp0 & ~Mask): sign from Tmp1, rest from Tmp0.
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the high half (value 1 of VMOVRRD) carries the sign bit.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}

/// Lower RETURNADDR. Depth 0 reads LR directly; deeper frames load the saved
/// return address at frame pointer + 4.
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}

/// Lower FRAMEADDR. Depth 0 is the current frame register; each additional
/// level follows one saved frame pointer by loading through the chain.
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
    *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  Register FrameReg = ARI.getFrameRegister(MF);
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  // Only "sp" is recognised; any other name is a fatal error.
  Register Reg = StringSwitch<unsigned>(RegName)
                       .Case("sp", ARM::SP)
                       .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error(Twine("Invalid register name \""
                              + StringRef(RegName)  + "\"."));
}

// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
          && "ExpandREAD_REGISTER called for non-i64 type result.");

  // Re-issue the read as two i32 results plus a chain.
  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0),
                             N->getOperand(1));

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                    Read.getValue(1)));
  // NOTE(review): this returns the *incoming* chain (operand 0) as the chain
  // result rather than Read.getValue(2) — confirm that is intentional.
  Results.push_back(Read.getOperand(0));
}

/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \returns The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index.
  // Each i64 element of the source maps to DstNumElt elements of DstVT,
  // so the extract index scales by that factor.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}

/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i16 and i64 types, either
  // as the source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);

  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
    if (!Subtarget->hasFullFP16())
      return SDValue();
    // f16 bitcast i16 -> VMOVhr
    return DAG.getNode(ARMISD::VMOVhr, SDLoc(N), MVT::f16,
                       DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
  }

  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
    if (!Subtarget->hasFullFP16())
      return SDValue();
    // i16 bitcast f16 -> VMOVrh
    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16,
                       DAG.getNode(ARMISD::VMOVrh, SDLoc(N), MVT::i32, Op));
  }

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
      return Val;

    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
    SDValue Cvt;
    // Big-endian multi-element vectors need a VREV64 first so that the two
    // extracted words land in the expected order.
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
/// Zero vectors are used to represent vector negation and in those cases
/// will be implemented with the NEON VNEG instruction.  However, VNEG does
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed.  Regardless, use a canonical VMOV to create the
/// zero vector.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(VT.isVector() && "Expected a vector type");
  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}

/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ?
                                                     ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // For shift amounts < VTBits, the low result mixes bits of both halves;
  // for amounts >= VTBits, it is just the high half shifted by the excess.
  // A CMOV on (ShAmt - VTBits) >= 0 selects between the two forms.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  // For big shifts: SRA replicates the sign bit, SRL produces zero.
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // Small shift (< VTBits): high result combines ShOpHi << ShAmt with the
  // bits spilling over from ShOpLo.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  // Big shift (>= VTBits): high result is ShOpLo shifted by the excess.
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  // Low result is zero for big shifts, ShOpLo << ShAmt otherwise.
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                          ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// Lower FLT_ROUNDS_ by reading the rounding-mode field out of the FPSCR
/// via the arm_get_fpscr intrinsic.
SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};

  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({And, Chain}, dl);
}

/// Lower CTTZ / CTTZ_ZERO_UNDEF. NEON vectors are rewritten in terms of
/// CTPOP or CTLZ on the isolated least significant set bit; scalars use
/// RBIT + CLZ on v6T2 and later.
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector() && ST->hasNEON()) {

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // 0x1eff is the modified-immediate encoding of an all-ones vector.
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }
    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  }

  if (!ST->hasV6T2Ops())
    return SDValue();

  // Scalar path: cttz(x) = ctlz(bitreverse(x)).
  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}

/// Lower vector CTPOP on NEON: popcount at i8 granularity, then widen the
/// per-byte counts up to the requested element width with pairwise adds.
static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation.  That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ?
                                Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation.  For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative.  The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}

/// Lower vector SHL/SRA/SRL to ARMISD shift nodes (immediate forms where the
/// shift amount is a constant splat, otherwise register VSHL forms).
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there are also shift by a gpr, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
  if (N->getOpcode() == ISD::SHL) {
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");

  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
    unsigned VShiftOpc =
        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(
      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  unsigned VShiftOpc =
      (N->getOpcode() == ISD::SRA ?
6212 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6213 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6214 else 6215 // Else generate an lsrl on the immediate shift amount 6216 ShPartsOpc = ARMISD::LSRL; 6217 } else if (ShOpc == ISD::SRA) 6218 ShPartsOpc = ARMISD::ASRL; 6219 6220 // Lower 32 bits of the destination/source 6221 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6222 DAG.getConstant(0, dl, MVT::i32)); 6223 // Upper 32 bits of the destination/source 6224 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6225 DAG.getConstant(1, dl, MVT::i32)); 6226 6227 // Generate the shift operation as computed above 6228 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6229 ShAmt); 6230 // The upper 32 bits come from the second return value of lsll 6231 Hi = SDValue(Lo.getNode(), 1); 6232 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6233 } 6234 6235 // We only lower SRA, SRL of 1 here, all others use generic lowering. 6236 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6237 return SDValue(); 6238 6239 // If we are in thumb mode, we don't have RRX. 6240 if (ST->isThumb1Only()) 6241 return SDValue(); 6242 6243 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6244 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6245 DAG.getConstant(0, dl, MVT::i32)); 6246 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6247 DAG.getConstant(1, dl, MVT::i32)); 6248 6249 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6250 // captures the result into a carry flag. 6251 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6252 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6253 6254 // The low part is an ARMISD::RRX operand, which shifts the carry in. 
6255 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6256 6257 // Merge the pieces into a single i64 value. 6258 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6259 } 6260 6261 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6262 const ARMSubtarget *ST) { 6263 bool Invert = false; 6264 bool Swap = false; 6265 unsigned Opc = ARMCC::AL; 6266 6267 SDValue Op0 = Op.getOperand(0); 6268 SDValue Op1 = Op.getOperand(1); 6269 SDValue CC = Op.getOperand(2); 6270 EVT VT = Op.getValueType(); 6271 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6272 SDLoc dl(Op); 6273 6274 EVT CmpVT; 6275 if (ST->hasNEON()) 6276 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6277 else { 6278 assert(ST->hasMVEIntegerOps() && 6279 "No hardware support for integer vector comparison!"); 6280 6281 if (Op.getValueType().getVectorElementType() != MVT::i1) 6282 return SDValue(); 6283 6284 // Make sure we expand floating point setcc to scalar if we do not have 6285 // mve.fp, so that we can handle them from there. 6286 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6287 return SDValue(); 6288 6289 CmpVT = VT; 6290 } 6291 6292 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6293 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6294 // Special-case integer 64-bit equality comparisons. They aren't legal, 6295 // but they can be lowered with a few vector instructions. 
6296 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6297 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6298 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6299 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6300 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6301 DAG.getCondCode(ISD::SETEQ)); 6302 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6303 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6304 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6305 if (SetCCOpcode == ISD::SETNE) 6306 Merged = DAG.getNOT(dl, Merged, CmpVT); 6307 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6308 return Merged; 6309 } 6310 6311 if (CmpVT.getVectorElementType() == MVT::i64) 6312 // 64-bit comparisons are not legal in general. 6313 return SDValue(); 6314 6315 if (Op1.getValueType().isFloatingPoint()) { 6316 switch (SetCCOpcode) { 6317 default: llvm_unreachable("Illegal FP comparison"); 6318 case ISD::SETUNE: 6319 case ISD::SETNE: 6320 if (ST->hasMVEFloatOps()) { 6321 Opc = ARMCC::NE; break; 6322 } else { 6323 Invert = true; LLVM_FALLTHROUGH; 6324 } 6325 case ISD::SETOEQ: 6326 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6327 case ISD::SETOLT: 6328 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6329 case ISD::SETOGT: 6330 case ISD::SETGT: Opc = ARMCC::GT; break; 6331 case ISD::SETOLE: 6332 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6333 case ISD::SETOGE: 6334 case ISD::SETGE: Opc = ARMCC::GE; break; 6335 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6336 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6337 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6338 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6339 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6340 case ISD::SETONE: { 6341 // Expand this to (OLT | OGT). 
6342 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6343 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6344 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6345 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6346 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6347 if (Invert) 6348 Result = DAG.getNOT(dl, Result, VT); 6349 return Result; 6350 } 6351 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6352 case ISD::SETO: { 6353 // Expand this to (OLT | OGE). 6354 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6355 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6356 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6357 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6358 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6359 if (Invert) 6360 Result = DAG.getNOT(dl, Result, VT); 6361 return Result; 6362 } 6363 } 6364 } else { 6365 // Integer comparisons. 6366 switch (SetCCOpcode) { 6367 default: llvm_unreachable("Illegal integer comparison"); 6368 case ISD::SETNE: 6369 if (ST->hasMVEIntegerOps()) { 6370 Opc = ARMCC::NE; break; 6371 } else { 6372 Invert = true; LLVM_FALLTHROUGH; 6373 } 6374 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6375 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6376 case ISD::SETGT: Opc = ARMCC::GT; break; 6377 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6378 case ISD::SETGE: Opc = ARMCC::GE; break; 6379 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6380 case ISD::SETUGT: Opc = ARMCC::HI; break; 6381 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6382 case ISD::SETUGE: Opc = ARMCC::HS; break; 6383 } 6384 6385 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6386 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6387 SDValue AndOp; 6388 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6389 AndOp = Op0; 6390 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6391 AndOp = Op1; 6392 6393 // Ignore bitconvert. 
      // Look through a bitcast to reach the AND feeding the compare.
      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
        AndOp = AndOp.getOperand(0);

      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
        SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
        // VTST sets lanes where (Op0 & Op1) != 0; when the original compare
        // was the EQ form (Invert == false) the VTST result must be negated.
        if (!Invert)
          Result = DAG.getNOT(dl, Result, VT);
        return Result;
      }
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    // Zero is on the left-hand side: mirror the condition so the zero
    // effectively moves to the right-hand side.
    if (Opc == ARMCC::GE)
      Opc = ARMCC::LE;
    else if (Opc == ARMCC::GT)
      Opc = ARMCC::LT;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    // Compare-against-zero form.
    Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
                         DAG.getConstant(Opc, dl, MVT::i32));
  } else {
    Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
                         DAG.getConstant(Opc, dl, MVT::i32));
  }

  Result = DAG.getSExtOrTrunc(Result, dl, VT);

  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

/// Lower an ISD::SETCCCARRY node (integer compare that also consumes an
/// incoming carry) into an ARMISD::SUBE whose flags feed an ARMISD::CMOV
/// selecting between 0 and 1.
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  // have to invert the carry first.
  Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                      DAG.getConstant(1, DL, MVT::i32), Carry);
  // This converts the boolean value carry into the carry flag.
  Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  // Copy the flags result of the SUBE into CPSR so the CMOV below can read it.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}

/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;
  bool is128Bits = VectorVT.is128BitVector();

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK. Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Each set bit of Imm selects a 0xff byte of the 64-bit value.
    uint64_t BitMask = 0xff;
    uint64_t Val = 0;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Val |= BitMask;
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        // A byte that is neither all-ones nor all-zeros cannot be encoded.
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    if (DAG.getDataLayout().isBigEndian()) {
      // Reverse the order of elements within the vector.
      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
      unsigned Mask = (1 << BytesPerElem) - 1;
      unsigned NumElems = 8 / BytesPerElem;
      unsigned NewImm = 0;
      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
      }
      Imm = NewImm;
    }

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isVMOVModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}

/// Lower a floating-point ConstantFP node, preferring an immediate-form
/// VMOV/VMVN encoding over the default constant-pool load whenever the
/// subtarget can represent the value directly.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  EVT VT = Op.getValueType();
  bool IsDouble = (VT == MVT::f64);
  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  const APFloat &FPVal = CFP->getValueAPF();

  // Prevent floating-point constants from using literal loads
  // when execute-only is enabled.
  if (ST->genExecuteOnly()) {
    // If we can represent the constant as an immediate, don't lower it
    if (isFPImmLegal(FPVal, VT))
      return Op;
    // Otherwise, construct as integer, and move to float register
    APInt INTVal = FPVal.bitcastToAPInt();
    SDLoc DL(CFP);
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      llvm_unreachable("Unknown floating point type!");
      break;
    case MVT::f64: {
      SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
      SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
      if (!ST->isLittle())
        std::swap(Lo, Hi);
      return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
    }
    case MVT::f32:
      return DAG.getNode(ARMISD::VMOVSR, DL, VT,
                         DAG.getConstant(INTVal, DL, MVT::i32));
    }
  }

  if (!ST->hasVFP3Base())
    return SDValue();

  // Use the default (constant pool) lowering for double constants when we have
  // an SP-only FPU
  if (IsDouble && !Subtarget->hasFP64())
    return SDValue();

  // Try splatting with a VMOV.f32...
  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);

  if (ImmVal != -1) {
    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
      // We have code in place to select a valid ConstantFP already, no need to
      // do any mangling.
      return Op;
    }

    // It's a float and we are trying to use NEON operations where
    // possible. Lower it to a splat followed by an extract.
    SDLoc DL(Op);
    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // The rest of our options are NEON only, make sure that's allowed before
  // proceeding..
  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
    return SDValue();

  EVT VMovVT;
  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();

  // It wouldn't really be worth bothering for doubles except for one very
  // important value, which does happen to match: 0.0. So make sure we don't do
  // anything stupid.
  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
    return SDValue();

  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, VT, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32
  NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             VT, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  return SDValue();
}

// check if an VEXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

// Check if a two-source shuffle mask matches a VEXT instruction; on success
// Imm is the extract index and ReverseVEXT tells the caller to swap sources.
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();
  ReverseVEXT = false;

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, it may still be
    // a VEXT but the source vectors must be swapped.
    ExpectedElt += 1;
    if (ExpectedElt == NumElts * 2) {
      ExpectedElt = 0;
      ReverseVEXT = true;
    }

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  // Adjust the index value if the source operands will be swapped.
  if (ReverseVEXT)
    Imm -= NumElts;

  return true;
}

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
6805 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6806 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6807 "Only possible block sizes for VREV are: 16, 32, 64"); 6808 6809 unsigned EltSz = VT.getScalarSizeInBits(); 6810 if (EltSz == 64) 6811 return false; 6812 6813 unsigned NumElts = VT.getVectorNumElements(); 6814 unsigned BlockElts = M[0] + 1; 6815 // If the first shuffle index is UNDEF, be optimistic. 6816 if (M[0] < 0) 6817 BlockElts = BlockSize / EltSz; 6818 6819 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6820 return false; 6821 6822 for (unsigned i = 0; i < NumElts; ++i) { 6823 if (M[i] < 0) continue; // ignore UNDEF indices 6824 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6825 return false; 6826 } 6827 6828 return true; 6829 } 6830 6831 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6832 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6833 // range, then 0 is placed into the resulting vector. So pretty much any mask 6834 // of 8 elements can work here. 6835 return VT == MVT::v8i8 && M.size() == 8; 6836 } 6837 6838 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6839 unsigned Index) { 6840 if (Mask.size() == Elements * 2) 6841 return Index / Elements; 6842 return Mask[Index] == 0 ? 0 : 1; 6843 } 6844 6845 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6846 // checking that pairs of elements in the shuffle mask represent the same index 6847 // in each vector, incrementing the expected index by 2 at each step. 6848 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6849 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6850 // v2={e,f,g,h} 6851 // WhichResult gives the offset for each element in the mask based on which 6852 // of the two results it belongs to. 
6853 // 6854 // The transpose can be represented either as: 6855 // result1 = shufflevector v1, v2, result1_shuffle_mask 6856 // result2 = shufflevector v1, v2, result2_shuffle_mask 6857 // where v1/v2 and the shuffle masks have the same number of elements 6858 // (here WhichResult (see below) indicates which result is being checked) 6859 // 6860 // or as: 6861 // results = shufflevector v1, v2, shuffle_mask 6862 // where both results are returned in one vector and the shuffle mask has twice 6863 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6864 // want to check the low half and high half of the shuffle mask as if it were 6865 // the other case 6866 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6867 unsigned EltSz = VT.getScalarSizeInBits(); 6868 if (EltSz == 64) 6869 return false; 6870 6871 unsigned NumElts = VT.getVectorNumElements(); 6872 if (M.size() != NumElts && M.size() != NumElts*2) 6873 return false; 6874 6875 // If the mask is twice as long as the input vector then we need to check the 6876 // upper and lower parts of the mask with a matching value for WhichResult 6877 // FIXME: A mask with only even values will be rejected in case the first 6878 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 6879 // M[0] is used to determine WhichResult 6880 for (unsigned i = 0; i < M.size(); i += NumElts) { 6881 WhichResult = SelectPairHalf(NumElts, M, i); 6882 for (unsigned j = 0; j < NumElts; j += 2) { 6883 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6884 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 6885 return false; 6886 } 6887 } 6888 6889 if (M.size() == NumElts*2) 6890 WhichResult = 0; 6891 6892 return true; 6893 } 6894 6895 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 6896 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6897 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
6898 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6899 unsigned EltSz = VT.getScalarSizeInBits(); 6900 if (EltSz == 64) 6901 return false; 6902 6903 unsigned NumElts = VT.getVectorNumElements(); 6904 if (M.size() != NumElts && M.size() != NumElts*2) 6905 return false; 6906 6907 for (unsigned i = 0; i < M.size(); i += NumElts) { 6908 WhichResult = SelectPairHalf(NumElts, M, i); 6909 for (unsigned j = 0; j < NumElts; j += 2) { 6910 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6911 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 6912 return false; 6913 } 6914 } 6915 6916 if (M.size() == NumElts*2) 6917 WhichResult = 0; 6918 6919 return true; 6920 } 6921 6922 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 6923 // that the mask elements are either all even and in steps of size 2 or all odd 6924 // and in steps of size 2. 6925 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 6926 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 6927 // v2={e,f,g,h} 6928 // Requires similar checks to that of isVTRNMask with 6929 // respect the how results are returned. 6930 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6931 unsigned EltSz = VT.getScalarSizeInBits(); 6932 if (EltSz == 64) 6933 return false; 6934 6935 unsigned NumElts = VT.getVectorNumElements(); 6936 if (M.size() != NumElts && M.size() != NumElts*2) 6937 return false; 6938 6939 for (unsigned i = 0; i < M.size(); i += NumElts) { 6940 WhichResult = SelectPairHalf(NumElts, M, i); 6941 for (unsigned j = 0; j < NumElts; ++j) { 6942 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6943 return false; 6944 } 6945 } 6946 6947 if (M.size() == NumElts*2) 6948 WhichResult = 0; 6949 6950 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6951 if (VT.is64BitVector() && EltSz == 32) 6952 return false; 6953 6954 return true; 6955 } 6956 6957 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6958 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6959 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6960 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6961 unsigned EltSz = VT.getScalarSizeInBits(); 6962 if (EltSz == 64) 6963 return false; 6964 6965 unsigned NumElts = VT.getVectorNumElements(); 6966 if (M.size() != NumElts && M.size() != NumElts*2) 6967 return false; 6968 6969 unsigned Half = NumElts / 2; 6970 for (unsigned i = 0; i < M.size(); i += NumElts) { 6971 WhichResult = SelectPairHalf(NumElts, M, i); 6972 for (unsigned j = 0; j < NumElts; j += Half) { 6973 unsigned Idx = WhichResult; 6974 for (unsigned k = 0; k < Half; ++k) { 6975 int MIdx = M[i + j + k]; 6976 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6977 return false; 6978 Idx += 2; 6979 } 6980 } 6981 } 6982 6983 if (M.size() == NumElts*2) 6984 WhichResult = 0; 6985 6986 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6987 if (VT.is64BitVector() && EltSz == 32) 6988 return false; 6989 6990 return true; 6991 } 6992 6993 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6994 // that pairs of elements of the shufflemask represent the same index in each 6995 // vector incrementing sequentially through the vectors. 6996 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6997 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6998 // v2={e,f,g,h} 6999 // Requires similar checks to that of isVTRNMask with respect the how results 7000 // are returned. 
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    // Idx walks through the selected half of each source vector.
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  // A double-length mask builds both results at once.
  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    // Both sources are the same vector, so both entries of a pair must name
    // the same lane.
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  // Try the two-operand forms first, then the single-operand (v, undef)
  // canonical forms.
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on an vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

// Check whether the shuffle mask matches an MVE VMOVN(t/b)-style interleave
// of the two inputs; Top selects the "top" (t) variant.
static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // If Top
  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
  //   This inserts Input2 into Input1
  // else if not Top
  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
  //   This inserts Input1 into Input2
  unsigned Offset = Top ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i+=2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
      return false;
  }

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction).  Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    // Thumb1 MOV/MVN only take 8-bit immediates.
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    // ARM/Thumb2: accept any shifter-operand-encodable value (or its
    // complement, which a MVN can materialize).
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

// Lower a BUILD_VECTOR of i1 elements into an MVE predicate register: known
// constant lanes are packed into an i32 bit pattern, remaining lanes are
// inserted dynamically.
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");

  // Each boolean lane occupies BitsPerBool bits of the 16-bit predicate;
  // BoolMask is the all-ones pattern for one lane.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BoolMask;
  unsigned BitsPerBool;
  if (NumElts == 4) {
    BitsPerBool = 4;
    BoolMask = 0xf;
  } else if (NumElts == 8) {
    BitsPerBool = 2;
    BoolMask = 0x3;
  } else if (NumElts == 16) {
    BitsPerBool = 1;
    BoolMask = 0x1;
  } else
    return SDValue();

  // If this is a single value copied into all lanes (a splat), we can just sign
  // extend that single value
  SDValue FirstOp = Op.getOperand(0);
  if (!isa<ConstantSDNode>(FirstOp) &&
      std::all_of(std::next(Op->op_begin()), Op->op_end(),
                  [&FirstOp](SDUse &U) {
                    return U.get().isUndef() || U.get() == FirstOp;
                  })) {
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
                              DAG.getValueType(MVT::i1));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  }

  // First create base with bits set where known
  unsigned Bits32 = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (!isa<ConstantSDNode>(V) && !V.isUndef())
      continue;
    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
    if (BitSet)
      Bits32 |= BoolMask << (i * BitsPerBool);
  }

  // Add in unknown nodes
  SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
                             DAG.getConstant(Bits32, dl, MVT::i32));
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (isa<ConstantSDNode>(V) || V.isUndef())
      continue;
    Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
                       DAG.getConstant(i, dl, MVT::i32));
  }

  return Base;
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  // i1 vectors are MVE predicates; lower them separately.
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerBUILD_VECTOR_i1(Op, DAG, ST);

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if ((ST->hasNEON() && SplatBitSize <= 64) ||
        (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
      // Check if an immediate VMOV works.
7227 EVT VmovVT; 7228 SDValue Val = 7229 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 7230 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); 7231 7232 if (Val.getNode()) { 7233 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7234 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7235 } 7236 7237 // Try an immediate VMVN. 7238 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7239 Val = isVMOVModifiedImm( 7240 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, 7241 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7242 if (Val.getNode()) { 7243 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7244 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7245 } 7246 7247 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 7248 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7249 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7250 if (ImmVal != -1) { 7251 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7252 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7253 } 7254 } 7255 } 7256 } 7257 7258 // Scan through the operands to see if only one value is used. 7259 // 7260 // As an optimisation, even if more than one value is used it may be more 7261 // profitable to splat with one value then change some lanes. 7262 // 7263 // Heuristically we decide to do this if the vector has a "dominant" value, 7264 // defined as splatted to more than half of the lanes. 7265 unsigned NumElts = VT.getVectorNumElements(); 7266 bool isOnlyLowElement = true; 7267 bool usesOnlyOneValue = true; 7268 bool hasDominantValue = false; 7269 bool isConstant = true; 7270 7271 // Map of the number of times a particular SDValue appears in the 7272 // element list. 
7273 DenseMap<SDValue, unsigned> ValueCounts; 7274 SDValue Value; 7275 for (unsigned i = 0; i < NumElts; ++i) { 7276 SDValue V = Op.getOperand(i); 7277 if (V.isUndef()) 7278 continue; 7279 if (i > 0) 7280 isOnlyLowElement = false; 7281 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7282 isConstant = false; 7283 7284 ValueCounts.insert(std::make_pair(V, 0)); 7285 unsigned &Count = ValueCounts[V]; 7286 7287 // Is this value dominant? (takes up more than half of the lanes) 7288 if (++Count > (NumElts / 2)) { 7289 hasDominantValue = true; 7290 Value = V; 7291 } 7292 } 7293 if (ValueCounts.size() != 1) 7294 usesOnlyOneValue = false; 7295 if (!Value.getNode() && !ValueCounts.empty()) 7296 Value = ValueCounts.begin()->first; 7297 7298 if (ValueCounts.empty()) 7299 return DAG.getUNDEF(VT); 7300 7301 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7302 // Keep going if we are hitting this case. 7303 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7304 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7305 7306 unsigned EltSize = VT.getScalarSizeInBits(); 7307 7308 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7309 // i32 and try again. 7310 if (hasDominantValue && EltSize <= 32) { 7311 if (!isConstant) { 7312 SDValue N; 7313 7314 // If we are VDUPing a value that comes directly from a vector, that will 7315 // cause an unnecessary move to and from a GPR, where instead we could 7316 // just use VDUPLANE. We can only do this if the lane being extracted 7317 // is at a constant index, as the VDUP from lane instructions only have 7318 // constant-index forms. 
7319 ConstantSDNode *constIndex; 7320 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7321 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7322 // We need to create a new undef vector to use for the VDUPLANE if the 7323 // size of the vector from which we get the value is different than the 7324 // size of the vector that we need to create. We will insert the element 7325 // such that the register coalescer will remove unnecessary copies. 7326 if (VT != Value->getOperand(0).getValueType()) { 7327 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7328 VT.getVectorNumElements(); 7329 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7330 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7331 Value, DAG.getConstant(index, dl, MVT::i32)), 7332 DAG.getConstant(index, dl, MVT::i32)); 7333 } else 7334 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7335 Value->getOperand(0), Value->getOperand(1)); 7336 } else 7337 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7338 7339 if (!usesOnlyOneValue) { 7340 // The dominant value was splatted as 'N', but we now have to insert 7341 // all differing elements. 7342 for (unsigned I = 0; I < NumElts; ++I) { 7343 if (Op.getOperand(I) == Value) 7344 continue; 7345 SmallVector<SDValue, 3> Ops; 7346 Ops.push_back(N); 7347 Ops.push_back(Op.getOperand(I)); 7348 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7349 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7350 } 7351 } 7352 return N; 7353 } 7354 if (VT.getVectorElementType().isFloatingPoint()) { 7355 SmallVector<SDValue, 8> Ops; 7356 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7357 assert(FVT == MVT::f32 || FVT == MVT::f16); 7358 MVT IVT = (FVT == MVT::f32) ? 
MVT::i32 : MVT::i16; 7359 for (unsigned i = 0; i < NumElts; ++i) 7360 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7361 Op.getOperand(i))); 7362 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7363 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7364 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7365 if (Val.getNode()) 7366 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7367 } 7368 if (usesOnlyOneValue) { 7369 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7370 if (isConstant && Val.getNode()) 7371 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7372 } 7373 } 7374 7375 // If all elements are constants and the case above didn't get hit, fall back 7376 // to the default expansion, which will generate a load from the constant 7377 // pool. 7378 if (isConstant) 7379 return SDValue(); 7380 7381 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 7382 if (NumElts >= 4) { 7383 SDValue shuffle = ReconstructShuffle(Op, DAG); 7384 if (shuffle != SDValue()) 7385 return shuffle; 7386 } 7387 7388 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7389 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7390 // into two 64-bit vectors; we might discover a better way to lower it. 
7391 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7392 EVT ExtVT = VT.getVectorElementType(); 7393 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7394 SDValue Lower = 7395 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7396 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7397 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7398 SDValue Upper = DAG.getBuildVector( 7399 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7400 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7401 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7402 if (Lower && Upper) 7403 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7404 } 7405 7406 // Vectors with 32- or 64-bit elements can be built by directly assigning 7407 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7408 // will be legalized. 7409 if (EltSize >= 32) { 7410 // Do the expansion with floating-point types, since that is what the VFP 7411 // registers are defined to use, and since i64 is not legal. 7412 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7413 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7414 SmallVector<SDValue, 8> Ops; 7415 for (unsigned i = 0; i < NumElts; ++i) 7416 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7417 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7418 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7419 } 7420 7421 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7422 // know the default expansion would otherwise fall back on something even 7423 // worse. For a vector with one or two non-undef values, that's 7424 // scalar_to_vector for the elements followed by a shuffle (provided the 7425 // shuffle is valid for the target) and materialization element by element 7426 // on the stack followed by a load for everything else. 
  if (!isConstant && !usesOnlyOneValue) {
    // Build the vector lane by lane: start from UNDEF and insert each
    // non-undef operand at its lane index.
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs. Returns SDValue() if the BUILD_VECTOR
// cannot be expressed as a (possibly VEXT-adjusted) shuffle of at most two
// source vectors.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Per-source bookkeeping for each distinct vector feeding this
  // BUILD_VECTOR via EXTRACT_VECTOR_ELT.
  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    // Compared against the *original* vector, so llvm::find works on SDValue.
    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  // NumElts is re-purposed from here on: it now counts lanes of the
  // smallest-element shuffle type, not lanes of VT.
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      // Only exactly-half-width sources are handled.
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    // Only exactly-double-width sources are handled.
    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    // Second source's lanes are numbered after the first source's in the
    // concatenated mask space.
    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }


  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                            ShuffleOps[1], Mask, DAG);
  if (!Shuffle)
    return SDValue();
  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}

// Shuffle opcodes encoded in bits [29:26] of a PerfectShuffleTable entry
// (see the "(PFEntry >> 26) & 0x0F" decodes below).
enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};

/// Return true if the operation encoded in this perfect-shuffle table entry
/// is one of the forms usable with MVE (copy, reversal or lane duplicate);
/// the remaining opcodes (VEXT/VUZP/VZIP/VTRN) are rejected.
static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  switch (OpNum) {
  case OP_COPY:
  case OP_VREV:
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return true;
  }
  return false;
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
7678 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7679 if (VT.getVectorNumElements() == 4 && 7680 (VT.is128BitVector() || VT.is64BitVector())) { 7681 unsigned PFIndexes[4]; 7682 for (unsigned i = 0; i != 4; ++i) { 7683 if (M[i] < 0) 7684 PFIndexes[i] = 8; 7685 else 7686 PFIndexes[i] = M[i]; 7687 } 7688 7689 // Compute the index in the perfect shuffle table. 7690 unsigned PFTableIndex = 7691 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7692 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7693 unsigned Cost = (PFEntry >> 30); 7694 7695 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7696 return true; 7697 } 7698 7699 bool ReverseVEXT, isV_UNDEF; 7700 unsigned Imm, WhichResult; 7701 7702 unsigned EltSize = VT.getScalarSizeInBits(); 7703 if (EltSize >= 32 || 7704 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7705 ShuffleVectorInst::isIdentityMask(M) || 7706 isVREVMask(M, VT, 64) || 7707 isVREVMask(M, VT, 32) || 7708 isVREVMask(M, VT, 16)) 7709 return true; 7710 else if (Subtarget->hasNEON() && 7711 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7712 isVTBLMask(M, VT) || 7713 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7714 return true; 7715 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7716 isReverseMask(M, VT)) 7717 return true; 7718 else if (Subtarget->hasMVEIntegerOps() && 7719 (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) 7720 return true; 7721 else 7722 return false; 7723 } 7724 7725 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7726 /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  // Table entry layout: cost in bits [31:30], opcode in [29:26], and the
  // 13-bit LHS/RHS sub-entry ids in [25:13] and [12:0].
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  if (OpNum == OP_COPY) {
    // LHSID 0123 (base 9) means "identity of LHS"; 4567 means "identity of
    // RHS".
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize the two sub-shuffles this entry is built from.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    // The duplicated lane index is encoded by the opcode itself.
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    // Two-result node; pick the left or right half via getValue().
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}

/// Lower a v8i8 shuffle to a NEON VTBL1/VTBL2 table lookup.
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  SmallVector<SDValue, 8> VTBLMask;
  for (ArrayRef<int>::iterator
         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));

  // One-operand shuffle -> VTBL1; otherwise VTBL2 over both inputs.
  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
                     DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}

/// Lower a full element reversal of a v8i16/v16i8 vector as VREV64 followed
/// by a VEXT that swaps the two double words.
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                      SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue OpLHS = Op.getOperand(0);
  EVT VT = OpLHS.getValueType();

  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect an v8i16/v16i8 type");
  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
  // extract the first 8 bytes into the top double word and the last 8 bytes
  // into the bottom double word. The v8i16 case is similar.
  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
                     DAG.getConstant(ExtractNum, DL, MVT::i32));
}

/// Map an MVE predicate type to the integer vector type with the same lane
/// count (v4i1 -> v4i32, v8i1 -> v8i16, v16i1 -> v16i8).
static EVT getVectorTyFromPredicateVector(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4i1:
    return MVT::v4i32;
  case MVT::v8i1:
    return MVT::v8i16;
  case MVT::v16i1:
    return MVT::v16i8;
  default:
    llvm_unreachable("Unexpected vector predicate type");
  }
}

/// Expand an MVE predicate vector into a full integer vector whose lanes are
/// all-ones where the predicate is set and all-zeroes elsewhere.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
                                    SelectionDAG &DAG) {
  // Converting from boolean predicates to integers involves creating a vector
  // of all ones or all zeroes and selecting the lanes based upon the real
  // predicate.
  SDValue AllOnes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);

  SDValue AllZeroes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);

  // Get full vector type from predicate type
  EVT NewVT = getVectorTyFromPredicateVector(VT);

  SDValue RecastV1;
  // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
  // this to a v16i1. This cannot be done with an ordinary bitcast because the
  // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
  // since we know in hardware the sizes are really the same.
  if (VT != MVT::v16i1)
    RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  else
    RecastV1 = Pred;

  // Select either all ones or zeroes depending upon the real predicate bits.
  SDValue PredAsVector =
      DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);

  // Recast our new predicate-as-integer v16i8 vector into something
  // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
}

/// Lower a shuffle of MVE i1 predicate vectors, either with a bit-reverse
/// trick for reverse masks, or generically by promoting the predicate to an
/// integer vector, shuffling that, and comparing the result against zero.
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  ArrayRef<int> ShuffleMask = SVN->getMask();

  assert(ST->hasMVEIntegerOps() &&
         "No support for vector shuffle of boolean predicates");

  SDValue V1 = Op.getOperand(0);
  SDLoc dl(Op);
  if (isReverseMask(ShuffleMask, VT)) {
    // Reversing a predicate is reversing its bits: cast to i32, BITREVERSE,
    // then shift the (16-bit-wide) predicate back down from the top bits.
    SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
    SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
    SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
                              DAG.getConstant(16, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  }

  // Until we can come up with optimised cases for every single vector
  // shuffle in existence we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to a 8-bit integer, where
  // each predicate represents a byte. Then we fall back on a normal integer
  // vector shuffle and convert the result back into a predicate vector. In
  // many cases the generated code might be even better than scalar code
  // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  // fields in a register into 8 other arbitrary 2-bit fields!
  SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
  EVT NewVT = PredAsVector.getValueType();

  // Do the shuffle!
  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
                                          DAG.getUNDEF(NewVT), ShuffleMask);

  // Now return the result of comparing the shuffled vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

/// Try to lower a sub-32-bit-element shuffle as extracts/inserts of whole
/// 32-bit lanes (GPR moves), falling back to a reduced shuffle for the lanes
/// that don't form full moves.
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole register movs as
  // possible. This is useful for types smaller than 32bits, which would
  // often otherwise become a series for grp movs.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");
  int NumElts = VT.getVectorNumElements();
  int QuarterSize = NumElts / 4;
  // The four final parts of the vector, as i32's
  SDValue Parts[4];

  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
  // <u,u,u,u>), returning the vmov lane index
  auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
    // Detect which mov lane this would be from the first non-undef element.
    int MovIdx = -1;
    for (int i = 0; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0) {
        if (ShuffleMask[Start + i] % Length != i)
          return -1;
        MovIdx = ShuffleMask[Start + i] / Length;
        break;
      }
    }
    // If all items are undef, leave this for other combines
    if (MovIdx == -1)
      return -1;
    // Check the remaining values are the correct part of the same mov
    for (int i = 1; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0 &&
          (ShuffleMask[Start + i] / Length != MovIdx ||
           ShuffleMask[Start + i] % Length != i))
        return -1;
    }
    return MovIdx;
  };

  for (int Part = 0; Part < 4; ++Part) {
    // Does this part look like a mov
    int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
    if (Elt != -1) {
      // Lane indices 0..3 select the first input, 4..7 the second.
      SDValue Input = Op->getOperand(0);
      if (Elt >= 4) {
        Input = Op->getOperand(1);
        Elt -= 4;
      }
      SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
                                DAG.getConstant(Elt, dl, MVT::i32));
    }
  }

  // Nothing interesting found, just return
  if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
    return SDValue();

  // The other parts need to be built with the old shuffle vector, cast to a
  // v4i32 and extract_vector_elts
  if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
    SmallVector<int, 16> NewShuffleMask;
    for (int Part = 0; Part < 4; ++Part)
      for (int i = 0; i < QuarterSize; i++)
        NewShuffleMask.push_back(
            Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
    SDValue NewShuffle = DAG.getVectorShuffle(
        VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
    SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);

    for (int Part = 0; Part < 4; ++Part)
      if (!Parts[Part])
        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  BitCast, DAG.getConstant(Part, dl, MVT::i32));
  }
  // Build a vector out of the various parts and bitcast it back to the original
  // type.
  SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
  return DAG.getBitcast(VT, NewVec);
}

static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  unsigned EltSize = VT.getScalarSizeInBits();

  // MVE predicate vectors get their own dedicated lowering.
  if (ST->hasMVEIntegerOps() && EltSize == 1)
    return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same type so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
8024 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 8025 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8026 } 8027 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 8028 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 8029 // reaches it). 8030 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 8031 !isa<ConstantSDNode>(V1.getOperand(0))) { 8032 bool IsScalarToVector = true; 8033 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 8034 if (!V1.getOperand(i).isUndef()) { 8035 IsScalarToVector = false; 8036 break; 8037 } 8038 if (IsScalarToVector) 8039 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 8040 } 8041 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 8042 DAG.getConstant(Lane, dl, MVT::i32)); 8043 } 8044 8045 bool ReverseVEXT = false; 8046 unsigned Imm = 0; 8047 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 8048 if (ReverseVEXT) 8049 std::swap(V1, V2); 8050 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 8051 DAG.getConstant(Imm, dl, MVT::i32)); 8052 } 8053 8054 if (isVREVMask(ShuffleMask, VT, 64)) 8055 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 8056 if (isVREVMask(ShuffleMask, VT, 32)) 8057 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8058 if (isVREVMask(ShuffleMask, VT, 16)) 8059 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8060 8061 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8062 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8063 DAG.getConstant(Imm, dl, MVT::i32)); 8064 } 8065 8066 // Check for Neon shuffles that modify both input vectors in place. 8067 // If both results are used, i.e., if there are two shuffles with the same 8068 // source operands and with masks corresponding to both results of one of 8069 // these operations, DAG memoization will ensure that a single node is 8070 // used for both shuffles. 
8071 unsigned WhichResult = 0; 8072 bool isV_UNDEF = false; 8073 if (ST->hasNEON()) { 8074 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8075 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8076 if (isV_UNDEF) 8077 V2 = V1; 8078 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8079 .getValue(WhichResult); 8080 } 8081 } 8082 if (ST->hasMVEIntegerOps()) { 8083 if (isVMOVNMask(ShuffleMask, VT, 0)) 8084 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8085 DAG.getConstant(0, dl, MVT::i32)); 8086 if (isVMOVNMask(ShuffleMask, VT, 1)) 8087 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8088 DAG.getConstant(1, dl, MVT::i32)); 8089 } 8090 8091 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8092 // shuffles that produce a result larger than their operands with: 8093 // shuffle(concat(v1, undef), concat(v2, undef)) 8094 // -> 8095 // shuffle(concat(v1, v2), undef) 8096 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8097 // 8098 // This is useful in the general case, but there are special cases where 8099 // native shuffles produce larger results: the two-result ops. 8100 // 8101 // Look through the concat when lowering them: 8102 // shuffle(concat(v1, v2), undef) 8103 // -> 8104 // concat(VZIP(v1, v2):0, :1) 8105 // 8106 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8107 SDValue SubV1 = V1->getOperand(0); 8108 SDValue SubV2 = V1->getOperand(1); 8109 EVT SubVT = SubV1.getValueType(); 8110 8111 // We expect these to have been canonicalized to -1. 
8112 assert(llvm::all_of(ShuffleMask, [&](int i) { 8113 return i < (int)VT.getVectorNumElements(); 8114 }) && "Unexpected shuffle index into UNDEF operand!"); 8115 8116 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8117 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8118 if (isV_UNDEF) 8119 SubV2 = SubV1; 8120 assert((WhichResult == 0) && 8121 "In-place shuffle of concat can only have one result!"); 8122 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8123 SubV1, SubV2); 8124 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8125 Res.getValue(1)); 8126 } 8127 } 8128 } 8129 8130 // If the shuffle is not directly supported and it has 4 elements, use 8131 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8132 unsigned NumElts = VT.getVectorNumElements(); 8133 if (NumElts == 4) { 8134 unsigned PFIndexes[4]; 8135 for (unsigned i = 0; i != 4; ++i) { 8136 if (ShuffleMask[i] < 0) 8137 PFIndexes[i] = 8; 8138 else 8139 PFIndexes[i] = ShuffleMask[i]; 8140 } 8141 8142 // Compute the index in the perfect shuffle table. 8143 unsigned PFTableIndex = 8144 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8145 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8146 unsigned Cost = (PFEntry >> 30); 8147 8148 if (Cost <= 4) { 8149 if (ST->hasNEON()) 8150 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8151 else if (isLegalMVEShuffleOp(PFEntry)) { 8152 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8153 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8154 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8155 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8156 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8157 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8158 } 8159 } 8160 } 8161 8162 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        // Negative mask entries mean "don't care" lanes.
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        // Mask indices < NumElts select from V1, the rest from V2; the
        // & (NumElts-1) maps a V2 index back into operand-local lane space.
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (ST->hasNEON() && VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  if (ST->hasMVEIntegerOps())
    if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
      return NewOp;

  return SDValue();
}

/// Lower an INSERT_VECTOR_ELT into an MVE predicate (vNi1) vector by editing
/// the i32 form of the predicate mask: the i1 scalar is sign-extended (so a
/// set bit becomes all-ones) and bitfield-inserted over that lane's bits.
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerINSERT_VECTOR_ELT_i1 called without MVE!");

  // View the whole predicate as a plain i32 mask.
  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  // Number of predicate bits that one vector lane occupies (wider element
  // types use more bits per lane in the mask).
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  // All-ones field covering exactly the lane being replaced.
  unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
  // Sign-extend the i1 so true -> 0xFFFFFFFF, false -> 0, then BFI the
  // relevant field into the mask (~Mask selects the bits to keep).
  SDValue Ext =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
                  Op.getOperand(1), DAG.getValueType(MVT::i1));
  SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
                            DAG.getConstant(~Mask, dl, MVT::i32));
  return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
}

SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();

  if (Subtarget->hasMVEIntegerOps() &&
      Op.getValueType().getScalarSizeInBits() == 1)
    return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);

  if (getTypeAction(*DAG.getContext(), EltVT) ==
      TargetLowering::TypePromoteFloat) {
    // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
    // but the type system will try to do that if we don't intervene.
    // Reinterpret any such vector-element insertion as one with the
    // corresponding integer types.
8239 8240 SDLoc dl(Op); 8241 8242 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8243 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8244 TargetLowering::TypePromoteFloat); 8245 8246 SDValue VecIn = Op.getOperand(0); 8247 EVT VecVT = VecIn.getValueType(); 8248 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8249 VecVT.getVectorNumElements()); 8250 8251 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8252 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8253 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8254 IVecIn, IElt, Lane); 8255 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8256 } 8257 8258 return Op; 8259 } 8260 8261 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8262 const ARMSubtarget *ST) { 8263 EVT VecVT = Op.getOperand(0).getValueType(); 8264 SDLoc dl(Op); 8265 8266 assert(ST->hasMVEIntegerOps() && 8267 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8268 8269 SDValue Conv = 8270 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8271 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8272 unsigned LaneWidth = 8273 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8274 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 8275 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8276 return Shift; 8277 } 8278 8279 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8280 const ARMSubtarget *ST) { 8281 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  // i1 predicate vectors need the dedicated MVE lowering above.
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    // Sub-32-bit lanes extracted into i32 use the unsigned get-lane node.
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

/// Lower a CONCAT_VECTORS of MVE predicate (vNi1) vectors: promote both
/// operands to integer vectors, copy their lanes into one wider vector, and
/// compare that against zero to re-form a real predicate.
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  EVT Op2VT = V2.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  assert(Op1VT == Op2VT && "Operand types don't match!");
  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

  // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Extract the vector elements from Op1 and Op2 one by one and truncate them
  // to be the right size for the destination. For example, if Op1 is v4i1 then
  // the promoted vector is v4i32. The result of concatentation gives a v8i1,
  // which when promoted is v8i16. That means each i32 element from Op1 needs
  // truncating to i16 and inserting in the result.
8329 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8330 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8331 auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8332 EVT NewVT = NewV.getValueType(); 8333 EVT ConcatVT = ConVec.getValueType(); 8334 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8335 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8336 DAG.getIntPtrConstant(i, dl)); 8337 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8338 DAG.getConstant(j, dl, MVT::i32)); 8339 } 8340 return ConVec; 8341 }; 8342 unsigned j = 0; 8343 ConVec = ExractInto(NewV1, ConVec, j); 8344 ConVec = ExractInto(NewV2, ConVec, j); 8345 8346 // Now return the result of comparing the subvector with zero, 8347 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8348 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8349 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8350 } 8351 8352 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8353 const ARMSubtarget *ST) { 8354 EVT VT = Op->getValueType(0); 8355 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8356 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8357 8358 // The only time a CONCAT_VECTORS operation can have legal types is when 8359 // two 64-bit vectors are concatenated to a 128-bit vector. 
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  // Build the 128-bit result as a v2f64: each 64-bit operand is bitcast to
  // f64 and inserted into one half; undef operands are simply left out.
  SDValue Val = DAG.getUNDEF(MVT::v2f64);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  if (!Op0.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                      DAG.getIntPtrConstant(0, dl));
  if (!Op1.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                      DAG.getIntPtrConstant(1, dl));
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}

/// Lower an EXTRACT_SUBVECTOR from an MVE predicate (vNi1) vector: promote
/// the source to an integer vector, copy out the requested lane range, and
/// compare against zero to re-form a predicate of the narrower type.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1); // constant start index
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();

  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom EXTRACT_SUBVECTOR lowering");
  assert(ST->hasMVEIntegerOps() &&
         "EXTRACT_SUBVECTOR lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);

  // We now have Op1 promoted to a vector of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  // Copy lanes [Index, Index + NumElts) of the promoted source into lanes
  // [0, NumElts) of the subvector.
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                              DAG.getIntPtrConstant(i, dl));
    SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                         DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    // Pick which v4i32 lane holds the low half of each i64 element.
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ?
                                                         1 : 0;
    unsigned HiElt = 1 - LoElt;
    // All four 32-bit halves must be constants for this check.
    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      // The high half of each i64 element must equal the sign-extension of
      // its low half.
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      // For zero-extension, both high halves must be zero.
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // For other types, every element must be a constant that fits in half the
  // element width (signed or unsigned as requested).
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    // Any non-constant element disqualifies the BUILD_VECTOR.
    return false;
  }

  return true;
}

/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}

/// isZeroExtended - Check if a node is a vector value that is zero-extended
/// or a constant BUILD_VECTOR with zero-extended elements.
8480 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 8481 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 8482 return true; 8483 if (isExtendedBUILD_VECTOR(N, DAG, false)) 8484 return true; 8485 return false; 8486 } 8487 8488 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 8489 if (OrigVT.getSizeInBits() >= 64) 8490 return OrigVT; 8491 8492 assert(OrigVT.isSimple() && "Expecting a simple value type"); 8493 8494 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 8495 switch (OrigSimpleTy) { 8496 default: llvm_unreachable("Unexpected Vector Type"); 8497 case MVT::v2i8: 8498 case MVT::v2i16: 8499 return MVT::v2i32; 8500 case MVT::v4i8: 8501 return MVT::v4i16; 8502 } 8503 } 8504 8505 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 8506 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 8507 /// We insert the required extension here to get the vector to fill a D register. 8508 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 8509 const EVT &OrigTy, 8510 const EVT &ExtTy, 8511 unsigned ExtOpcode) { 8512 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 8513 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 8514 // 64-bits we need to insert a new extension so that it will be 64-bits. 8515 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 8516 if (OrigTy.getSizeInBits() >= 64) 8517 return N; 8518 8519 // Must extend size to at least 64 bits to be used as an operand for VMULL. 8520 EVT NewVT = getExtensionTo64Bits(OrigTy); 8521 8522 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 8523 } 8524 8525 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 8526 /// does not do any sign/zero extension. If the original vector is less 8527 /// than 64 bits, an appropriate extension will be added after the load to 8528 /// reach a total size of 64 bits. 
We have to add the extension separately 8529 /// because ARM does not have a sign/zero extending load for vectors. 8530 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 8531 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 8532 8533 // The load already has the right type. 8534 if (ExtendedTy == LD->getMemoryVT()) 8535 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 8536 LD->getBasePtr(), LD->getPointerInfo(), 8537 LD->getAlignment(), LD->getMemOperand()->getFlags()); 8538 8539 // We need to create a zextload/sextload. We cannot just create a load 8540 // followed by a zext/zext node because LowerMUL is also run during normal 8541 // operation legalization where we can't create illegal types. 8542 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 8543 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 8544 LD->getMemoryVT(), LD->getAlignment(), 8545 LD->getMemOperand()->getFlags()); 8546 } 8547 8548 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 8549 /// extending load, or BUILD_VECTOR with extended elements, return the 8550 /// unextended value. The unextended vector should be 64 bits so that it can 8551 /// be used as an operand to a VMULL instruction. If the original vector size 8552 /// before extension is less than 64 bits we add a an extension to resize 8553 /// the vector to 64 bits. 
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  // Explicit extension node: strip it, re-extending to 64 bits if needed.
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    // Rewire the old load's chain users to the replacement load's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    // Other users of the old load's value still need the extended result,
    // so give them an explicit extension of the new load.
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    // The caller (VMULL lowering) consumes the unextended load directly.
    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    // Keep only the low 32-bit half of each i64 element.
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

/// Returns true if N is an ADD/SUB whose operands are both sign-extended
/// (or sign-extended constant BUILD_VECTORs) with single uses.
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

/// Returns true if N is an ADD/SUB whose operands are both zero-extended
/// (or zero-extended constant BUILD_VECTORs) with single uses.
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  // Both operands sign-extended -> signed widening multiply.
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    // Both operands zero-extended -> unsigned widening multiply.
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Symmetric case: the add/sub is the second operand; swap so the
        // MLA expansion below always finds the add/sub in N0.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Re-create N0's ADD/SUB over the two widening multiplies.
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}

/// Lower a v4i8 SDIV (operands already widened to v4i16) via float
/// reciprocal-estimate arithmetic; the narrow i8 range makes a single
/// estimate (plus a tested bias) sufficient.
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}

/// Lower a v4i16 SDIV via float reciprocal-estimate arithmetic with one
/// Newton refinement step (i16 has a wider range than i8, so one step plus a
/// tested bias of 0x89 is needed).
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

/// Custom-lower ISD::SDIV for v4i16/v8i8: v8i8 is split into two v4i16
/// halves that each use the reciprocal-estimate path, then recombined.
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen each operand to v8i16, then split it into low/high v4i16 halves.
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}

/// Custom-lower ISD::UDIV for v4i16/v8i8, mirroring LowerSDIV but with
/// zero-extension and two reciprocal refinement steps for the wider
/// unsigned range.
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen each operand to v8i16, then split into low/high v4i16 halves.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    // Zero-extended u8 values fit in the positive i16 range, so the signed
    // v4i16 division path is valid for each half.
    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    // Narrow back to v8i8 with a saturating narrow (vqmovnsu).
    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

/// Lower ISD::ADDCARRY / ISD::SUBCARRY to ARMISD::ADDE / ARMISD::SUBE,
/// converting between the generic boolean carry and the ARM carry flag.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::SUBCARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}

SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  // Result struct { T sin, cos; } matching __sincos_stret's return layout.
  Type *RetTy = StructType::get(ArgTy, ArgTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    // The sret pointer is passed as the (hidden) first argument.
    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Entry.IsSRet = true;
    Args.push_back(Entry);
    // The call itself then returns void; results come back through memory.
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args))
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Non-sret ABI: the {sin, cos} pair is returned directly in registers.
  if (!ShouldUseSRet)
    return CallResult.first;

  // sret ABI: load sin from the start of the slot...
  SDValue LoadSin =
      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos =
      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}

/// Emit a call to one of the Windows RT division helpers
/// (__rt_sdiv/__rt_udiv and their 64-bit variants) for the division in Op,
/// threading the provided Chain (typically a preceding divide-by-zero check)
/// through the call.
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  const char *Name = nullptr;
  if (Signed)
    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  else
    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  ARMTargetLowering::ArgListTy Args;

  // Push the operands in reverse order (divisor, then dividend) —
  // presumably to match the __rt_* helpers' argument order; verify against
  // the Windows RT ABI if changing this.
  for (auto AI : {1, 0}) {
    ArgListEntry Arg;
    Arg.Node = Op.getOperand(AI);
    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Arg);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
                 ES, std::move(Args));

  return LowerCallTo(CLI).first;
}

// This is a code size optimisation: return the original SDIV node to
// DAGCombiner when we don't want to expand SDIV into a sequence of
// instructions, and an empty node otherwise which will cause the
// SDIV to be expanded in DAGCombine.
9044 SDValue 9045 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 9046 SelectionDAG &DAG, 9047 SmallVectorImpl<SDNode *> &Created) const { 9048 // TODO: Support SREM 9049 if (N->getOpcode() != ISD::SDIV) 9050 return SDValue(); 9051 9052 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 9053 const bool MinSize = ST.hasMinSize(); 9054 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 9055 : ST.hasDivideInARMMode(); 9056 9057 // Don't touch vector types; rewriting this may lead to scalarizing 9058 // the int divs. 9059 if (N->getOperand(0).getValueType().isVector()) 9060 return SDValue(); 9061 9062 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9063 // hwdiv support for this to be really profitable. 9064 if (!(MinSize && HasDivide)) 9065 return SDValue(); 9066 9067 // ARM mode is a bit simpler than Thumb: we can handle large power 9068 // of 2 immediates with 1 mov instruction; no further checks required, 9069 // just return the sdiv node. 9070 if (!ST.isThumb()) 9071 return SDValue(N, 0); 9072 9073 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 9074 // and thus lose the code size benefits of a MOVS that requires only 2. 9075 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 9076 // but as it's doing exactly this, it's not worth the trouble to get TTI. 
9077 if (Divisor.sgt(128)) 9078 return SDValue(); 9079 9080 return SDValue(N, 0); 9081 } 9082 9083 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 9084 bool Signed) const { 9085 assert(Op.getValueType() == MVT::i32 && 9086 "unexpected type for custom lowering DIV"); 9087 SDLoc dl(Op); 9088 9089 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 9090 DAG.getEntryNode(), Op.getOperand(1)); 9091 9092 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9093 } 9094 9095 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 9096 SDLoc DL(N); 9097 SDValue Op = N->getOperand(1); 9098 if (N->getValueType(0) == MVT::i32) 9099 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 9100 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9101 DAG.getConstant(0, DL, MVT::i32)); 9102 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9103 DAG.getConstant(1, DL, MVT::i32)); 9104 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 9105 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 9106 } 9107 9108 void ARMTargetLowering::ExpandDIV_Windows( 9109 SDValue Op, SelectionDAG &DAG, bool Signed, 9110 SmallVectorImpl<SDValue> &Results) const { 9111 const auto &DL = DAG.getDataLayout(); 9112 const auto &TLI = DAG.getTargetLoweringInfo(); 9113 9114 assert(Op.getValueType() == MVT::i64 && 9115 "unexpected type for custom lowering DIV"); 9116 SDLoc dl(Op); 9117 9118 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 9119 9120 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9121 9122 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 9123 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 9124 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 9125 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 9126 9127 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); 9128 } 

/// Lower a load of an MVE predicate type (v4i1/v8i1/v16i1). The predicate
/// is loaded as a plain integer of the same bit width and then cast into
/// v16i1 via ARMISD::PREDICATE_CAST, extracting the low subvector for the
/// narrower types.
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected a unindexed load");

  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
  // for BE).

  SDLoc dl(Op);
  // Load exactly MemVT.getSizeInBits() bits as an integer, anyext to i32.
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  // Return the predicate plus the load's output chain.
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}

/// Custom-lower an i64 load. Only handles volatile i64 loads on v5TE+
/// (non-Thumb1) targets, which are emitted as a single atomic-ish LDRD
/// machine node; other loads are left untouched (Results stays empty).
void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  assert(LD->isUnindexed() && "Loads should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile()) {
    SDLoc dl(N);
    SDValue Result = DAG.getMemIntrinsicNode(
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    // Pick which LDRD result is the low/high half based on endianness.
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  }
}

/// Lower a store of an MVE predicate type (v4i1/v8i1/v16i1). The predicate
/// is cast to an i32 with PREDICATE_CAST and stored as an integer of the
/// predicate's bit width via a truncating store.
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  assert(ST->isUnindexed() && "Expected a unindexed store");

  // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
  // unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    // Widen to v16i1: real lanes first, remaining lanes undef.
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(I, dl, MVT::i32)));
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}

/// Custom-lower a store: volatile i64 stores on v5TE+ (non-Thumb1) become a
/// single STRD machine node; MVE predicate stores are forwarded to
/// LowerPredicateStore; everything else returns an empty SDValue.
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
                          const ARMSubtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert(ST->isUnindexed() && "Stores should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && ST->isVolatile()) {
    SDNode *N = Op.getNode();
    SDLoc dl(N);

    // Split the i64 value into endian-correct low/high i32 halves.
    SDValue Lo = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
                              MVT::i32));
    SDValue Hi = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
                              MVT::i32));

    return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
                                   {ST->getChain(), Lo, Hi, ST->getBasePtr()},
                                   MemVT, ST->getMemOperand());
  } else if (Subtarget->hasMVEIntegerOps() &&
             ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
               MemVT == MVT::v16i1))) {
    return LowerPredicateStore(Op, DAG);
  }

  return SDValue();
}

/// Return true if N is a vector of all zeros — either a generic
/// all-zeros build_vector or an ARMISD::VMOVIMM with a zero immediate.
static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}

/// Lower a masked load for MVE. MVE masked loads always produce zero in the
/// masked-off lanes, so a zero passthru is free; other passthru values need
/// an explicit select after the load.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  // Zero passthru matches the hardware behavior exactly — nothing to do.
  if (isZeroVector(PassThru))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  // Rebuild the masked load with a zero passthru.
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  // A passthru that is a (reg/bit)cast of a zero vector is still zero.
  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
                            isZeroVector(PassThru->getOperand(0));
  // Undef and zero passthrus need no fixup; anything else requires a select
  // to merge the original passthru back into the masked-off lanes.
  if (!PassThru.isUndef() && !PassThruIsCastZero)
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

/// Atomic loads/stores: monotonic (and weaker) orderings are legal as plain
/// loads/stores; anything stronger is rejected here and expanded elsewhere.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();

  // Monotonic load/store is legal for all targets.
  return Op;
}

/// Expand READCYCLECOUNTER into an MRC read of the PMU cycle counter,
/// zero-extended to the i64 the generic node promises.
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //    mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getTargetConstant(15, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32),
                    DAG.getTargetConstant(9, DL, MVT::i32),
                    DAG.getTargetConstant(13, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32)
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  // High 32 bits are zero: the counter itself is only 32 bits wide.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}

/// Pack an i64 value V into an Untyped GPRPair register via a REG_SEQUENCE
/// machine node, swapping the halves on big-endian targets.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
      dl, MVT::i32);
  bool isBigEndian = DAG.getDataLayout().isBigEndian();
  if (isBigEndian)
    std::swap (VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}

/// Expand a 64-bit ATOMIC_CMP_SWAP into the CMP_SWAP_64 pseudo, packing the
/// expected/new values into GPR pairs and unpacking the Untyped result back
/// into an i64 BUILD_PAIR (endian-aware).
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> &Results,
                                      SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  // Operands: pointer, expected pair, new-value pair, chain.
  SDValue Ops[] = {N->getOperand(1),
                   createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  // Transfer the memory operand so the machine node keeps its memory info.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  bool isBigEndian = DAG.getDataLayout().isBigEndian();

  SDValue Lo =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  SDValue Hi =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
  Results.push_back(SDValue(CmpSwap, 2));
}

/// Lower STRICT_FSETCC / STRICT_FSETCCS (strict FP comparisons carrying a
/// chain). Unsupported FP types are softened to a libcall; supported types
/// become one or two VFP compares feeding conditional moves of 0/1.
SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  // FSETCCS is the signaling variant (raises on quiet NaNs too).
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;

  // If we don't have instructions of this float type then soften to a libcall
  // and use SETCC instead.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
    // Softening may fold the comparison into LHS alone; compare it != 0.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
    SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
                                 DAG.getCondCode(CC));
    return DAG.getMergeValues({Result, Chain}, dl);
  }

  // Some FP conditions need two ARM condition codes (CondCode2 != AL).
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
  // in CMPFP and CMPFPE, but instead it should be made explicit by these
  // instructions using a chain instead of glue. This would also fix the problem
  // here (and also in LowerSELECT_CC) where we generate two comparisons when
  // CondCode2 != AL.
  SDValue True = DAG.getConstant(1, dl, VT);
  SDValue False = DAG.getConstant(0, dl, VT);
  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    // Second compare/CMOV ORs in the second condition's outcome.
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
    Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
  }
  return DAG.getMergeValues({Result, Chain}, dl);
}

/// Central dispatch for all operations marked Custom in the ARM backend:
/// routes each opcode to its dedicated Lower* routine.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  case ISD::UREM: return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, DAG);
  case ISD::SDIV:
    // Windows scalar divides go through the __rt_* helper libcalls.
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG, Subtarget);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG, Subtarget);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return LowerSADDSUBSAT(Op, DAG, Subtarget);
  case ISD::LOAD:
    return LowerPredicateLoad(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG, Subtarget);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    if (Subtarget->isTargetWindows())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::STRICT_FP_EXTEND:
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
  // WIN__DBZCHK is target-emitted and needs no further lowering.
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}

/// Expand the arm_smlald/smlaldx/smlsld/smlsldx intrinsics (which return an
/// illegal i64 accumulator) into the corresponding ARMISD long-multiply
/// nodes, rebuilding the i64 result from their two i32 results. Unknown
/// intrinsics are left untouched (Results stays empty).
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opc = 0;
  if (IntNo == Intrinsic::arm_smlald)
    Opc = ARMISD::SMLALD;
  else if (IntNo == Intrinsic::arm_smlaldx)
    Opc = ARMISD::SMLALDX;
  else if (IntNo == Intrinsic::arm_smlsld)
    Opc = ARMISD::SMLSLD;
  else if (IntNo == Intrinsic::arm_smlsldx)
    Opc = ARMISD::SMLSLDX;
  else
    return;

  SDLoc dl(N);
  // Split the i64 accumulator (operand 3) into lo/hi i32 halves.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(1, dl, MVT::i32));

  SDValue LongMul = DAG.getNode(Opc, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                N->getOperand(1), N->getOperand(2),
                                Lo, Hi);
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                LongMul.getValue(0), LongMul.getValue(1)));
}

/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    // DivRem yields two results (quotient, remainder) — push both.
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
    break;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    return ReplaceLongIntrinsic(N, Results, DAG);
  case ISD::ABS:
    lowerABS(N, Results, DAG);
    return ;
  case ISD::LOAD:
    // LowerLOAD fills Results itself when it applies; Res stays empty, so
    // the common push below is a no-op.
    LowerLOAD(N, Results, DAG);
    break;
  }
  // Handlers that fell through produce at most one replacement value.
  if (Res.getNode())
    Results.push_back(Res);
}

//===----------------------------------------------------------------------===//
//                           ARM Scheduler Hooks
//===----------------------------------------------------------------------===//

/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported with SjLj");
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function &F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  // Address of the dispatch block goes in the constant pool, PC-relative.
  unsigned PCLabelId = AFI->createPICLabelUId();
  // PC bias for PC-relative addressing: 4 in Thumb, 8 in ARM.
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
      ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, Align(4));

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, Align(4));

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    // Set the low bit because of thumb mode.
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
        .addReg(NewVReg2, RegState::Kill)
        .addImm(PCLabelId);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
        .addReg(NewVReg3, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
        .addReg(ARM::CPSR, RegState::Define)
        .addImm(1)
        .add(predOps(ARMCC::AL));
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3, RegState::Kill)
        .add(predOps(ARMCC::AL));
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
        .addFrameIndex(FI)
        .addImm(36); // &jbuf[1] :: pc
    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg5, RegState::Kill)
        .addImm(0)
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else {
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addImm(0)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId)
        .add(predOps(ARMCC::AL));
    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
        .addReg(NewVReg2, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  }
}

void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo &MFI = MF->getFrameInfo();
9727 int FI = MFI.getFunctionContextIndex(); 9728 9729 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 9730 : &ARM::GPRnopcRegClass; 9731 9732 // Get a mapping of the call site numbers to all of the landing pads they're 9733 // associated with. 9734 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 9735 unsigned MaxCSNum = 0; 9736 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 9737 ++BB) { 9738 if (!BB->isEHPad()) continue; 9739 9740 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 9741 // pad. 9742 for (MachineBasicBlock::iterator 9743 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 9744 if (!II->isEHLabel()) continue; 9745 9746 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 9747 if (!MF->hasCallSiteLandingPad(Sym)) continue; 9748 9749 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 9750 for (SmallVectorImpl<unsigned>::iterator 9751 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 9752 CSI != CSE; ++CSI) { 9753 CallSiteNumToLPad[*CSI].push_back(&*BB); 9754 MaxCSNum = std::max(MaxCSNum, *CSI); 9755 } 9756 break; 9757 } 9758 } 9759 9760 // Get an ordered list of the machine basic blocks for the jump table. 9761 std::vector<MachineBasicBlock*> LPadList; 9762 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 9763 LPadList.reserve(CallSiteNumToLPad.size()); 9764 for (unsigned I = 1; I <= MaxCSNum; ++I) { 9765 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 9766 for (SmallVectorImpl<MachineBasicBlock*>::iterator 9767 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 9768 LPadList.push_back(*II); 9769 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 9770 } 9771 } 9772 9773 assert(!LPadList.empty() && 9774 "No landing pad destinations for the dispatch jump table!"); 9775 9776 // Create the jump table and associated information. 
9777 MachineJumpTableInfo *JTI = 9778 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 9779 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 9780 9781 // Create the MBBs for the dispatch code. 9782 9783 // Shove the dispatch's address into the return slot in the function context. 9784 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 9785 DispatchBB->setIsEHPad(); 9786 9787 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9788 unsigned trap_opcode; 9789 if (Subtarget->isThumb()) 9790 trap_opcode = ARM::tTRAP; 9791 else 9792 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 9793 9794 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 9795 DispatchBB->addSuccessor(TrapBB); 9796 9797 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 9798 DispatchBB->addSuccessor(DispContBB); 9799 9800 // Insert and MBBs. 9801 MF->insert(MF->end(), DispatchBB); 9802 MF->insert(MF->end(), DispContBB); 9803 MF->insert(MF->end(), TrapBB); 9804 9805 // Insert code into the entry block that creates and registers the function 9806 // context. 9807 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 9808 9809 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 9810 MachinePointerInfo::getFixedStack(*MF, FI), 9811 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); 9812 9813 MachineInstrBuilder MIB; 9814 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 9815 9816 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 9817 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 9818 9819 // Add a register mask with no preserved registers. This results in all 9820 // registers being marked as clobbered. This can't work if the dispatch block 9821 // is in a Thumb1 function and is linked with ARM code which uses the FP 9822 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 
9823 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 9824 9825 bool IsPositionIndependent = isPositionIndependent(); 9826 unsigned NumLPads = LPadList.size(); 9827 if (Subtarget->isThumb2()) { 9828 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9829 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 9830 .addFrameIndex(FI) 9831 .addImm(4) 9832 .addMemOperand(FIMMOLd) 9833 .add(predOps(ARMCC::AL)); 9834 9835 if (NumLPads < 256) { 9836 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 9837 .addReg(NewVReg1) 9838 .addImm(LPadList.size()) 9839 .add(predOps(ARMCC::AL)); 9840 } else { 9841 Register VReg1 = MRI->createVirtualRegister(TRC); 9842 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 9843 .addImm(NumLPads & 0xFFFF) 9844 .add(predOps(ARMCC::AL)); 9845 9846 unsigned VReg2 = VReg1; 9847 if ((NumLPads & 0xFFFF0000) != 0) { 9848 VReg2 = MRI->createVirtualRegister(TRC); 9849 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 9850 .addReg(VReg1) 9851 .addImm(NumLPads >> 16) 9852 .add(predOps(ARMCC::AL)); 9853 } 9854 9855 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 9856 .addReg(NewVReg1) 9857 .addReg(VReg2) 9858 .add(predOps(ARMCC::AL)); 9859 } 9860 9861 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 9862 .addMBB(TrapBB) 9863 .addImm(ARMCC::HI) 9864 .addReg(ARM::CPSR); 9865 9866 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9867 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 9868 .addJumpTableIndex(MJTI) 9869 .add(predOps(ARMCC::AL)); 9870 9871 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9872 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 9873 .addReg(NewVReg3, RegState::Kill) 9874 .addReg(NewVReg1) 9875 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 9876 .add(predOps(ARMCC::AL)) 9877 .add(condCodeOp()); 9878 9879 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 9880 .addReg(NewVReg4, RegState::Kill) 9881 .addReg(NewVReg1) 9882 .addJumpTableIndex(MJTI); 9883 } else if (Subtarget->isThumb()) { 9884 
Register NewVReg1 = MRI->createVirtualRegister(TRC); 9885 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 9886 .addFrameIndex(FI) 9887 .addImm(1) 9888 .addMemOperand(FIMMOLd) 9889 .add(predOps(ARMCC::AL)); 9890 9891 if (NumLPads < 256) { 9892 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 9893 .addReg(NewVReg1) 9894 .addImm(NumLPads) 9895 .add(predOps(ARMCC::AL)); 9896 } else { 9897 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9898 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9899 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 9900 9901 // MachineConstantPool wants an explicit alignment. 9902 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 9903 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 9904 9905 Register VReg1 = MRI->createVirtualRegister(TRC); 9906 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 9907 .addReg(VReg1, RegState::Define) 9908 .addConstantPoolIndex(Idx) 9909 .add(predOps(ARMCC::AL)); 9910 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 9911 .addReg(NewVReg1) 9912 .addReg(VReg1) 9913 .add(predOps(ARMCC::AL)); 9914 } 9915 9916 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 9917 .addMBB(TrapBB) 9918 .addImm(ARMCC::HI) 9919 .addReg(ARM::CPSR); 9920 9921 Register NewVReg2 = MRI->createVirtualRegister(TRC); 9922 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 9923 .addReg(ARM::CPSR, RegState::Define) 9924 .addReg(NewVReg1) 9925 .addImm(2) 9926 .add(predOps(ARMCC::AL)); 9927 9928 Register NewVReg3 = MRI->createVirtualRegister(TRC); 9929 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 9930 .addJumpTableIndex(MJTI) 9931 .add(predOps(ARMCC::AL)); 9932 9933 Register NewVReg4 = MRI->createVirtualRegister(TRC); 9934 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 9935 .addReg(ARM::CPSR, RegState::Define) 9936 .addReg(NewVReg2, RegState::Kill) 9937 .addReg(NewVReg3) 9938 .add(predOps(ARMCC::AL)); 9939 9940 MachineMemOperand *JTMMOLd = 9941 
MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 9942 MachineMemOperand::MOLoad, 4, Align(4)); 9943 9944 Register NewVReg5 = MRI->createVirtualRegister(TRC); 9945 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 9946 .addReg(NewVReg4, RegState::Kill) 9947 .addImm(0) 9948 .addMemOperand(JTMMOLd) 9949 .add(predOps(ARMCC::AL)); 9950 9951 unsigned NewVReg6 = NewVReg5; 9952 if (IsPositionIndependent) { 9953 NewVReg6 = MRI->createVirtualRegister(TRC); 9954 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 9955 .addReg(ARM::CPSR, RegState::Define) 9956 .addReg(NewVReg5, RegState::Kill) 9957 .addReg(NewVReg3) 9958 .add(predOps(ARMCC::AL)); 9959 } 9960 9961 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 9962 .addReg(NewVReg6, RegState::Kill) 9963 .addJumpTableIndex(MJTI); 9964 } else { 9965 Register NewVReg1 = MRI->createVirtualRegister(TRC); 9966 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 9967 .addFrameIndex(FI) 9968 .addImm(4) 9969 .addMemOperand(FIMMOLd) 9970 .add(predOps(ARMCC::AL)); 9971 9972 if (NumLPads < 256) { 9973 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 9974 .addReg(NewVReg1) 9975 .addImm(NumLPads) 9976 .add(predOps(ARMCC::AL)); 9977 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 9978 Register VReg1 = MRI->createVirtualRegister(TRC); 9979 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 9980 .addImm(NumLPads & 0xFFFF) 9981 .add(predOps(ARMCC::AL)); 9982 9983 unsigned VReg2 = VReg1; 9984 if ((NumLPads & 0xFFFF0000) != 0) { 9985 VReg2 = MRI->createVirtualRegister(TRC); 9986 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 9987 .addReg(VReg1) 9988 .addImm(NumLPads >> 16) 9989 .add(predOps(ARMCC::AL)); 9990 } 9991 9992 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 9993 .addReg(NewVReg1) 9994 .addReg(VReg2) 9995 .add(predOps(ARMCC::AL)); 9996 } else { 9997 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9998 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9999 const 
Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10000 10001 // MachineConstantPool wants an explicit alignment. 10002 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10003 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10004 10005 Register VReg1 = MRI->createVirtualRegister(TRC); 10006 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 10007 .addReg(VReg1, RegState::Define) 10008 .addConstantPoolIndex(Idx) 10009 .addImm(0) 10010 .add(predOps(ARMCC::AL)); 10011 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10012 .addReg(NewVReg1) 10013 .addReg(VReg1, RegState::Kill) 10014 .add(predOps(ARMCC::AL)); 10015 } 10016 10017 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 10018 .addMBB(TrapBB) 10019 .addImm(ARMCC::HI) 10020 .addReg(ARM::CPSR); 10021 10022 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10023 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 10024 .addReg(NewVReg1) 10025 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10026 .add(predOps(ARMCC::AL)) 10027 .add(condCodeOp()); 10028 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10029 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 10030 .addJumpTableIndex(MJTI) 10031 .add(predOps(ARMCC::AL)); 10032 10033 MachineMemOperand *JTMMOLd = 10034 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10035 MachineMemOperand::MOLoad, 4, Align(4)); 10036 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10037 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 10038 .addReg(NewVReg3, RegState::Kill) 10039 .addReg(NewVReg4) 10040 .addImm(0) 10041 .addMemOperand(JTMMOLd) 10042 .add(predOps(ARMCC::AL)); 10043 10044 if (IsPositionIndependent) { 10045 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 10046 .addReg(NewVReg5, RegState::Kill) 10047 .addReg(NewVReg4) 10048 .addJumpTableIndex(MJTI); 10049 } else { 10050 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 10051 .addReg(NewVReg5, RegState::Kill) 10052 .addJumpTableIndex(MJTI); 10053 } 10054 } 10055 
10056 // Add the jump table entries as successors to the MBB. 10057 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 10058 for (std::vector<MachineBasicBlock*>::iterator 10059 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 10060 MachineBasicBlock *CurMBB = *I; 10061 if (SeenMBBs.insert(CurMBB).second) 10062 DispContBB->addSuccessor(CurMBB); 10063 } 10064 10065 // N.B. the order the invoke BBs are processed in doesn't matter here. 10066 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 10067 SmallVector<MachineBasicBlock*, 64> MBBLPads; 10068 for (MachineBasicBlock *BB : InvokeBBs) { 10069 10070 // Remove the landing pad successor from the invoke block and replace it 10071 // with the new dispatch block. 10072 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 10073 BB->succ_end()); 10074 while (!Successors.empty()) { 10075 MachineBasicBlock *SMBB = Successors.pop_back_val(); 10076 if (SMBB->isEHPad()) { 10077 BB->removeSuccessor(SMBB); 10078 MBBLPads.push_back(SMBB); 10079 } 10080 } 10081 10082 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 10083 BB->normalizeSuccProbs(); 10084 10085 // Find the invoke call and mark all of the callee-saved registers as 10086 // 'implicit defined' so that they're spilled. This prevents code from 10087 // moving instructions to before the EH block, where they will never be 10088 // executed. 
    // Walk backwards so the first call found is the invoke at the end of the
    // block (presumably -- the loop stops at the first call it sees).
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      // Record every register the call already defines or uses.
      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      // SavedRegs is a null-terminated list of callee-saved registers.
      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        // Only GPRs appropriate to the current ISA are added.
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

/// Return the unique successor of \p MBB that is not \p Succ. Asserts (via
/// llvm_unreachable) if every successor equals \p Succ; intended for blocks
/// with exactly two distinct successors.
static
MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
                                        E = MBB->succ_end(); I != E; ++I)
    if (*I != Succ)
      return *I;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned. Returns 0 for any unsupported size.
/// Note: Thumb1 sizes < 8 return plain (non-writeback) loads; the caller
/// (emitPostLd) emits a separate add to update the address register.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned. Returns 0 for any unsupported size.
/// Thumb1 sizes < 8 return plain stores; emitPostSt adds the address update.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
/// Emit a post-increment load of \p LdSize bytes from \p AddrIn into \p Data,
/// leaving the incremented address in \p AddrOut. Instructions are inserted
/// into \p BB at \p Pos. The operand layout differs per ISA, hence the four
/// branches below; Thumb1 has no post-indexed loads, so it uses a plain load
/// followed by an explicit add.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    // NEON VLD1 with fixed post-increment writeback.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // load + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed form carries an extra (unused) register operand.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}

/// Emit a post-increment store of \p StSize bytes of \p Data to \p AddrIn,
/// leaving the incremented address in \p AddrOut. Instructions are inserted
/// into \p BB at \p Pos; mirrors emitPostLd, including the Thumb1
/// store-then-add expansion.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    // NEON VST1 with fixed post-increment writeback.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // store + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed form carries an extra (unused) register operand.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  }
}

MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Alignment = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  // Pick the widest copy unit the alignment allows: 1 or 2 for odd/halfword
  // alignment, otherwise 16/8 (NEON, when permitted) or 4.
  if (Alignment & 1) {
    UnitSize = 1;
  } else if (Alignment & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Alignment % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Alignment % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Small copy: fully unrolled sequence of post-increment load/store pairs,
    // chaining the updated address vregs from one iteration to the next.
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI.eraseFromParent(); // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  // ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd. varEnd counts bytes remaining (LoopSize) and
  // is decremented by UnitSize each iteration.
  Register varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt()) {
    // movw (+ movt when the value needs the high half).
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
        .addImm(LoopSize & 0xFFFF)
        .add(predOps(ARMCC::AL));

    if ((LoopSize & 0xFFFF0000) != 0)
      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
              varEnd)
          .addReg(Vtmp)
          .addImm(LoopSize >> 16)
          .add(predOps(ARMCC::AL));
  } else {
    // No movw/movt: load LoopSize from the constant pool instead.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
    MachineMemOperand *CPMMO =
        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                 MachineMemOperand::MOLoad, 4, Align(4));

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    // Force the optional-def operand to define CPSR so the bne below sees
    // the flags (i.e. turn SUB into SUBS).
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}

MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Direct BL; R4 is both input and output, IP/CPSR are conservatively
    // marked as dead implicit defs.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__chkstk")
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large: {
    // Long call: materialize the symbol address and BLX through a register.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
        .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
        .add(predOps(ARMCC::AL))
        .addReg(Reg, RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  // SP -= R4 (the byte adjustment returned by __chkstk).
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}

/// Expand the WIN__DBZCHK pseudo: compare the divisor (operand 0) against
/// zero and branch to a new block containing the Windows __brkdiv0 trap when
/// it is zero; otherwise fall through to the continuation block, which
/// receives the remainder of \p MBB.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  // Split everything after MI into ContBB.
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  // if (divisor == 0) goto TrapBB;
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}

// The CPSR operand of SelectItr might be missing a kill marker
// because there were multiple uses of CPSR, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
                                   MachineBasicBlock* BB,
                                   const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of CPSR.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    // A later reader means CPSR is still live past SelectItr: no kill flag.
    if (mi.readsRegister(ARM::CPSR))
      return false;
    if (mi.definesRegister(ARM::CPSR))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether CPSR is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      // Live into any successor -> CPSR is not killed here.
      if (succ->isLiveIn(ARM::CPSR))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and CPSR wasn't live
  // out. SelectMI should have a kill flag on CPSR.
  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  return true;
}

/// Expand pseudo instructions that require custom insertion: Thumb1
/// post-indexed loads, pre-indexed store pseudos, the tMOVCCr_pseudo select
/// diamond, and others handled by the cases below.
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();
  switch (MI.getOpcode()) {
  default: {
    MI.print(errs());
    llvm_unreachable("Unexpected instr type to insert");
  }

  // Thumb1 post-indexed loads are really just single-register LDMs.
  case ARM::tLDR_postidx: {
    MachineOperand Def(MI.getOperand(1));
    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
        .add(Def)               // Rn_wb
        .add(MI.getOperand(2))  // Rn
        .add(MI.getOperand(3))  // PredImm
        .add(MI.getOperand(4))  // PredReg
        .add(MI.getOperand(0))  // Rt
        .cloneMemRefs(MI);
    MI.eraseFromParent();
    return BB;
  }

  // The Thumb2 pre-indexed stores have the same MI operands, they just
  // define them differently in the .td files from the isel patterns, so
  // they need pseudos.
  case ARM::t2STR_preidx:
    MI.setDesc(TII->get(ARM::t2STR_PRE));
    return BB;
  case ARM::t2STRB_preidx:
    MI.setDesc(TII->get(ARM::t2STRB_PRE));
    return BB;
  case ARM::t2STRH_preidx:
    MI.setDesc(TII->get(ARM::t2STRH_PRE));
    return BB;

  case ARM::STRi_preidx:
  case ARM::STRBi_preidx: {
    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ?
ARM::STR_PRE_IMM 10691 : ARM::STRB_PRE_IMM; 10692 // Decode the offset. 10693 unsigned Offset = MI.getOperand(4).getImm(); 10694 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10695 Offset = ARM_AM::getAM2Offset(Offset); 10696 if (isSub) 10697 Offset = -Offset; 10698 10699 MachineMemOperand *MMO = *MI.memoperands_begin(); 10700 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10701 .add(MI.getOperand(0)) // Rn_wb 10702 .add(MI.getOperand(1)) // Rt 10703 .add(MI.getOperand(2)) // Rn 10704 .addImm(Offset) // offset (skip GPR==zero_reg) 10705 .add(MI.getOperand(5)) // pred 10706 .add(MI.getOperand(6)) 10707 .addMemOperand(MMO); 10708 MI.eraseFromParent(); 10709 return BB; 10710 } 10711 case ARM::STRr_preidx: 10712 case ARM::STRBr_preidx: 10713 case ARM::STRH_preidx: { 10714 unsigned NewOpc; 10715 switch (MI.getOpcode()) { 10716 default: llvm_unreachable("unexpected opcode!"); 10717 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10718 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10719 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10720 } 10721 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10722 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10723 MIB.add(MI.getOperand(i)); 10724 MI.eraseFromParent(); 10725 return BB; 10726 } 10727 10728 case ARM::tMOVCCr_pseudo: { 10729 // To "insert" a SELECT_CC instruction, we actually have to insert the 10730 // diamond control-flow pattern. The incoming instruction knows the 10731 // destination vreg to set, the condition code register to branch on, the 10732 // true/false values to select between, and a branch opcode to use. 10733 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10734 MachineFunction::iterator It = ++BB->getIterator(); 10735 10736 // thisMBB: 10737 // ... 10738 // TrueVal = ... 
10739 // cmpTY ccX, r1, r2 10740 // bCC copy1MBB 10741 // fallthrough --> copy0MBB 10742 MachineBasicBlock *thisMBB = BB; 10743 MachineFunction *F = BB->getParent(); 10744 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10745 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10746 F->insert(It, copy0MBB); 10747 F->insert(It, sinkMBB); 10748 10749 // Check whether CPSR is live past the tMOVCCr_pseudo. 10750 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10751 if (!MI.killsRegister(ARM::CPSR) && 10752 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10753 copy0MBB->addLiveIn(ARM::CPSR); 10754 sinkMBB->addLiveIn(ARM::CPSR); 10755 } 10756 10757 // Transfer the remainder of BB and its successor edges to sinkMBB. 10758 sinkMBB->splice(sinkMBB->begin(), BB, 10759 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10760 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10761 10762 BB->addSuccessor(copy0MBB); 10763 BB->addSuccessor(sinkMBB); 10764 10765 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10766 .addMBB(sinkMBB) 10767 .addImm(MI.getOperand(3).getImm()) 10768 .addReg(MI.getOperand(4).getReg()); 10769 10770 // copy0MBB: 10771 // %FalseValue = ... 10772 // # fallthrough to sinkMBB 10773 BB = copy0MBB; 10774 10775 // Update machine-CFG edges 10776 BB->addSuccessor(sinkMBB); 10777 10778 // sinkMBB: 10779 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10780 // ... 10781 BB = sinkMBB; 10782 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10783 .addReg(MI.getOperand(1).getReg()) 10784 .addMBB(copy0MBB) 10785 .addReg(MI.getOperand(2).getReg()) 10786 .addMBB(thisMBB); 10787 10788 MI.eraseFromParent(); // The pseudo instruction is gone now. 10789 return BB; 10790 } 10791 10792 case ARM::BCCi64: 10793 case ARM::BCCZi64: { 10794 // If there is an unconditional branch to the other successor, remove it. 
10795 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10796 10797 // Compare both parts that make up the double comparison separately for 10798 // equality. 10799 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10800 10801 Register LHS1 = MI.getOperand(1).getReg(); 10802 Register LHS2 = MI.getOperand(2).getReg(); 10803 if (RHSisZero) { 10804 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10805 .addReg(LHS1) 10806 .addImm(0) 10807 .add(predOps(ARMCC::AL)); 10808 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10809 .addReg(LHS2).addImm(0) 10810 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10811 } else { 10812 Register RHS1 = MI.getOperand(3).getReg(); 10813 Register RHS2 = MI.getOperand(4).getReg(); 10814 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10815 .addReg(LHS1) 10816 .addReg(RHS1) 10817 .add(predOps(ARMCC::AL)); 10818 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10819 .addReg(LHS2).addReg(RHS2) 10820 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10821 } 10822 10823 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10824 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10825 if (MI.getOperand(0).getImm() == ARMCC::NE) 10826 std::swap(destMBB, exitMBB); 10827 10828 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10829 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10830 if (isThumb2) 10831 BuildMI(BB, dl, TII->get(ARM::t2B)) 10832 .addMBB(exitMBB) 10833 .add(predOps(ARMCC::AL)); 10834 else 10835 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10836 10837 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10838 return BB; 10839 } 10840 10841 case ARM::Int_eh_sjlj_setjmp: 10842 case ARM::Int_eh_sjlj_setjmp_nofp: 10843 case ARM::tInt_eh_sjlj_setjmp: 10844 case ARM::t2Int_eh_sjlj_setjmp: 10845 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10846 return BB; 10847 10848 case ARM::Int_eh_sjlj_setup_dispatch: 10849 EmitSjLjDispatchBlock(MI, BB); 10850 return BB; 10851 10852 case ARM::ABS: 10853 case ARM::t2ABS: { 10854 // To insert an ABS instruction, we have to insert the 10855 // diamond control-flow pattern. The incoming instruction knows the 10856 // source vreg to test against 0, the destination vreg to set, 10857 // the condition code register to branch on, the 10858 // true/false values to select between, and a branch opcode to use. 10859 // It transforms 10860 // V1 = ABS V0 10861 // into 10862 // V2 = MOVS V0 10863 // BCC (branch to SinkBB if V0 >= 0) 10864 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 10865 // SinkBB: V1 = PHI(V2, V3) 10866 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10867 MachineFunction::iterator BBI = ++BB->getIterator(); 10868 MachineFunction *Fn = BB->getParent(); 10869 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10870 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10871 Fn->insert(BBI, RSBBB); 10872 Fn->insert(BBI, SinkBB); 10873 10874 Register ABSSrcReg = MI.getOperand(1).getReg(); 10875 Register ABSDstReg = MI.getOperand(0).getReg(); 10876 bool ABSSrcKIll = MI.getOperand(1).isKill(); 10877 bool isThumb2 = Subtarget->isThumb2(); 10878 MachineRegisterInfo &MRI = Fn->getRegInfo(); 10879 // In Thumb mode S must not be specified if source register is the SP or 10880 // PC and if destination register is the SP, so restrict register class 10881 Register NewRsbDstReg = MRI.createVirtualRegister( 10882 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 10883 10884 // Transfer the remainder of BB and its successor edges to sinkMBB. 
10885 SinkBB->splice(SinkBB->begin(), BB, 10886 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10887 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 10888 10889 BB->addSuccessor(RSBBB); 10890 BB->addSuccessor(SinkBB); 10891 10892 // fall through to SinkMBB 10893 RSBBB->addSuccessor(SinkBB); 10894 10895 // insert a cmp at the end of BB 10896 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10897 .addReg(ABSSrcReg) 10898 .addImm(0) 10899 .add(predOps(ARMCC::AL)); 10900 10901 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 10902 BuildMI(BB, dl, 10903 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 10904 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 10905 10906 // insert rsbri in RSBBB 10907 // Note: BCC and rsbri will be converted into predicated rsbmi 10908 // by if-conversion pass 10909 BuildMI(*RSBBB, RSBBB->begin(), dl, 10910 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 10911 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 10912 .addImm(0) 10913 .add(predOps(ARMCC::AL)) 10914 .add(condCodeOp()); 10915 10916 // insert PHI in SinkBB, 10917 // reuse ABSDstReg to not change uses of ABS instruction 10918 BuildMI(*SinkBB, SinkBB->begin(), dl, 10919 TII->get(ARM::PHI), ABSDstReg) 10920 .addReg(NewRsbDstReg).addMBB(RSBBB) 10921 .addReg(ABSSrcReg).addMBB(BB); 10922 10923 // remove ABS instruction 10924 MI.eraseFromParent(); 10925 10926 // return last added BB 10927 return SinkBB; 10928 } 10929 case ARM::COPY_STRUCT_BYVAL_I32: 10930 ++NumLoopByVals; 10931 return EmitStructByval(MI, BB); 10932 case ARM::WIN__CHKSTK: 10933 return EmitLowered__chkstk(MI, BB); 10934 case ARM::WIN__DBZCHK: 10935 return EmitLowered__dbzchk(MI, BB); 10936 } 10937 } 10938 10939 /// Attaches vregs to MEMCPY that it will use as scratch registers 10940 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 10941 /// instead of as a custom inserter because we need the use list from the SDNode. 
10942 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 10943 MachineInstr &MI, const SDNode *Node) { 10944 bool isThumb1 = Subtarget->isThumb1Only(); 10945 10946 DebugLoc DL = MI.getDebugLoc(); 10947 MachineFunction *MF = MI.getParent()->getParent(); 10948 MachineRegisterInfo &MRI = MF->getRegInfo(); 10949 MachineInstrBuilder MIB(*MF, MI); 10950 10951 // If the new dst/src is unused mark it as dead. 10952 if (!Node->hasAnyUseOfValue(0)) { 10953 MI.getOperand(0).setIsDead(true); 10954 } 10955 if (!Node->hasAnyUseOfValue(1)) { 10956 MI.getOperand(1).setIsDead(true); 10957 } 10958 10959 // The MEMCPY both defines and kills the scratch registers. 10960 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 10961 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 10962 : &ARM::GPRRegClass); 10963 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 10964 } 10965 } 10966 10967 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 10968 SDNode *Node) const { 10969 if (MI.getOpcode() == ARM::MEMCPY) { 10970 attachMEMCPYScratchRegs(Subtarget, MI, Node); 10971 return; 10972 } 10973 10974 const MCInstrDesc *MCID = &MI.getDesc(); 10975 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 10976 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 10977 // operand is still set to noreg. If needed, set the optional operand's 10978 // register to CPSR, and remove the redundant implicit def. 10979 // 10980 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 10981 10982 // Rename pseudo opcodes. 
10983 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 10984 unsigned ccOutIdx; 10985 if (NewOpc) { 10986 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 10987 MCID = &TII->get(NewOpc); 10988 10989 assert(MCID->getNumOperands() == 10990 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 10991 && "converted opcode should be the same except for cc_out" 10992 " (and, on Thumb1, pred)"); 10993 10994 MI.setDesc(*MCID); 10995 10996 // Add the optional cc_out operand 10997 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 10998 10999 // On Thumb1, move all input operands to the end, then add the predicate 11000 if (Subtarget->isThumb1Only()) { 11001 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 11002 MI.addOperand(MI.getOperand(1)); 11003 MI.RemoveOperand(1); 11004 } 11005 11006 // Restore the ties 11007 for (unsigned i = MI.getNumOperands(); i--;) { 11008 const MachineOperand& op = MI.getOperand(i); 11009 if (op.isReg() && op.isUse()) { 11010 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 11011 if (DefIdx != -1) 11012 MI.tieOperands(DefIdx, i); 11013 } 11014 } 11015 11016 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 11017 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 11018 ccOutIdx = 1; 11019 } else 11020 ccOutIdx = MCID->getNumOperands() - 1; 11021 } else 11022 ccOutIdx = MCID->getNumOperands() - 1; 11023 11024 // Any ARM instruction that sets the 's' bit should specify an optional 11025 // "cc_out" operand in the last operand position. 11026 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 11027 assert(!NewOpc && "Optional cc_out operand required"); 11028 return; 11029 } 11030 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 11031 // since we already have an optional CPSR def. 
11032 bool definesCPSR = false; 11033 bool deadCPSR = false; 11034 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 11035 ++i) { 11036 const MachineOperand &MO = MI.getOperand(i); 11037 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 11038 definesCPSR = true; 11039 if (MO.isDead()) 11040 deadCPSR = true; 11041 MI.RemoveOperand(i); 11042 break; 11043 } 11044 } 11045 if (!definesCPSR) { 11046 assert(!NewOpc && "Optional cc_out operand required"); 11047 return; 11048 } 11049 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 11050 if (deadCPSR) { 11051 assert(!MI.getOperand(ccOutIdx).getReg() && 11052 "expect uninitialized optional cc_out operand"); 11053 // Thumb1 instructions must have the S bit even if the CPSR is dead. 11054 if (!Subtarget->isThumb1Only()) 11055 return; 11056 } 11057 11058 // If this instruction was defined with an optional CPSR def and its dag node 11059 // had a live implicit CPSR def, then activate the optional CPSR def. 11060 MachineOperand &MO = MI.getOperand(ccOutIdx); 11061 MO.setReg(ARM::CPSR); 11062 MO.setIsDef(true); 11063 } 11064 11065 //===----------------------------------------------------------------------===// 11066 // ARM Optimization Hooks 11067 //===----------------------------------------------------------------------===// 11068 11069 // Helper function that checks if N is a null or all ones constant. 11070 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 11071 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 11072 } 11073 11074 // Return true if N is conditionally 0 or all ones. 
11075 // Detects these expressions where cc is an i1 value: 11076 // 11077 // (select cc 0, y) [AllOnes=0] 11078 // (select cc y, 0) [AllOnes=0] 11079 // (zext cc) [AllOnes=0] 11080 // (sext cc) [AllOnes=0/1] 11081 // (select cc -1, y) [AllOnes=1] 11082 // (select cc y, -1) [AllOnes=1] 11083 // 11084 // Invert is set when N is the null/all ones constant when CC is false. 11085 // OtherOp is set to the alternative value of N. 11086 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 11087 SDValue &CC, bool &Invert, 11088 SDValue &OtherOp, 11089 SelectionDAG &DAG) { 11090 switch (N->getOpcode()) { 11091 default: return false; 11092 case ISD::SELECT: { 11093 CC = N->getOperand(0); 11094 SDValue N1 = N->getOperand(1); 11095 SDValue N2 = N->getOperand(2); 11096 if (isZeroOrAllOnes(N1, AllOnes)) { 11097 Invert = false; 11098 OtherOp = N2; 11099 return true; 11100 } 11101 if (isZeroOrAllOnes(N2, AllOnes)) { 11102 Invert = true; 11103 OtherOp = N1; 11104 return true; 11105 } 11106 return false; 11107 } 11108 case ISD::ZERO_EXTEND: 11109 // (zext cc) can never be the all ones value. 11110 if (AllOnes) 11111 return false; 11112 LLVM_FALLTHROUGH; 11113 case ISD::SIGN_EXTEND: { 11114 SDLoc dl(N); 11115 EVT VT = N->getValueType(0); 11116 CC = N->getOperand(0); 11117 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 11118 return false; 11119 Invert = !AllOnes; 11120 if (AllOnes) 11121 // When looking for an AllOnes constant, N is an sext, and the 'other' 11122 // value is 0. 11123 OtherOp = DAG.getConstant(0, dl, VT); 11124 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11125 // When looking for a 0 constant, N can be zext or sext. 
11126 OtherOp = DAG.getConstant(1, dl, VT); 11127 else 11128 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 11129 VT); 11130 return true; 11131 } 11132 } 11133 } 11134 11135 // Combine a constant select operand into its use: 11136 // 11137 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11138 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11139 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 11140 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 11141 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 11142 // 11143 // The transform is rejected if the select doesn't have a constant operand that 11144 // is null, or all ones when AllOnes is set. 11145 // 11146 // Also recognize sext/zext from i1: 11147 // 11148 // (add (zext cc), x) -> (select cc (add x, 1), x) 11149 // (add (sext cc), x) -> (select cc (add x, -1), x) 11150 // 11151 // These transformations eventually create predicated instructions. 11152 // 11153 // @param N The node to transform. 11154 // @param Slct The N operand that is a select. 11155 // @param OtherOp The other N operand (x above). 11156 // @param DCI Context. 11157 // @param AllOnes Require the select constant to be all ones instead of null. 11158 // @returns The new node, or SDValue() on failure. 11159 static 11160 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 11161 TargetLowering::DAGCombinerInfo &DCI, 11162 bool AllOnes = false) { 11163 SelectionDAG &DAG = DCI.DAG; 11164 EVT VT = N->getValueType(0); 11165 SDValue NonConstantVal; 11166 SDValue CCOp; 11167 bool SwapSelectOps; 11168 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 11169 NonConstantVal, DAG)) 11170 return SDValue(); 11171 11172 // Slct is now know to be the desired identity constant when CC is true. 
11173 SDValue TrueVal = OtherOp; 11174 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 11175 OtherOp, NonConstantVal); 11176 // Unless SwapSelectOps says CC should be false. 11177 if (SwapSelectOps) 11178 std::swap(TrueVal, FalseVal); 11179 11180 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 11181 CCOp, TrueVal, FalseVal); 11182 } 11183 11184 // Attempt combineSelectAndUse on each operand of a commutative operator N. 11185 static 11186 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 11187 TargetLowering::DAGCombinerInfo &DCI) { 11188 SDValue N0 = N->getOperand(0); 11189 SDValue N1 = N->getOperand(1); 11190 if (N0.getNode()->hasOneUse()) 11191 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 11192 return Result; 11193 if (N1.getNode()->hasOneUse()) 11194 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 11195 return Result; 11196 return SDValue(); 11197 } 11198 11199 static bool IsVUZPShuffleNode(SDNode *N) { 11200 // VUZP shuffle node. 11201 if (N->getOpcode() == ARMISD::VUZP) 11202 return true; 11203 11204 // "VUZP" on i32 is an alias for VTRN. 11205 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 11206 return true; 11207 11208 return false; 11209 } 11210 11211 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 11212 TargetLowering::DAGCombinerInfo &DCI, 11213 const ARMSubtarget *Subtarget) { 11214 // Look for ADD(VUZP.0, VUZP.1). 11215 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 11216 N0 == N1) 11217 return SDValue(); 11218 11219 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 11220 if (!N->getValueType(0).is64BitVector()) 11221 return SDValue(); 11222 11223 // Generate vpadd. 
11224 SelectionDAG &DAG = DCI.DAG; 11225 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11226 SDLoc dl(N); 11227 SDNode *Unzip = N0.getNode(); 11228 EVT VT = N->getValueType(0); 11229 11230 SmallVector<SDValue, 8> Ops; 11231 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 11232 TLI.getPointerTy(DAG.getDataLayout()))); 11233 Ops.push_back(Unzip->getOperand(0)); 11234 Ops.push_back(Unzip->getOperand(1)); 11235 11236 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11237 } 11238 11239 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11240 TargetLowering::DAGCombinerInfo &DCI, 11241 const ARMSubtarget *Subtarget) { 11242 // Check for two extended operands. 11243 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 11244 N1.getOpcode() == ISD::SIGN_EXTEND) && 11245 !(N0.getOpcode() == ISD::ZERO_EXTEND && 11246 N1.getOpcode() == ISD::ZERO_EXTEND)) 11247 return SDValue(); 11248 11249 SDValue N00 = N0.getOperand(0); 11250 SDValue N10 = N1.getOperand(0); 11251 11252 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 11253 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 11254 N00 == N10) 11255 return SDValue(); 11256 11257 // We only recognize Q register paddl here; this can't be reached until 11258 // after type legalization. 11259 if (!N00.getValueType().is64BitVector() || 11260 !N0.getValueType().is128BitVector()) 11261 return SDValue(); 11262 11263 // Generate vpaddl. 11264 SelectionDAG &DAG = DCI.DAG; 11265 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11266 SDLoc dl(N); 11267 EVT VT = N->getValueType(0); 11268 11269 SmallVector<SDValue, 8> Ops; 11270 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
11271 unsigned Opcode; 11272 if (N0.getOpcode() == ISD::SIGN_EXTEND) 11273 Opcode = Intrinsic::arm_neon_vpaddls; 11274 else 11275 Opcode = Intrinsic::arm_neon_vpaddlu; 11276 Ops.push_back(DAG.getConstant(Opcode, dl, 11277 TLI.getPointerTy(DAG.getDataLayout()))); 11278 EVT ElemTy = N00.getValueType().getVectorElementType(); 11279 unsigned NumElts = VT.getVectorNumElements(); 11280 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 11281 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 11282 N00.getOperand(0), N00.getOperand(1)); 11283 Ops.push_back(Concat); 11284 11285 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11286 } 11287 11288 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 11289 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 11290 // much easier to match. 11291 static SDValue 11292 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11293 TargetLowering::DAGCombinerInfo &DCI, 11294 const ARMSubtarget *Subtarget) { 11295 // Only perform optimization if after legalize, and if NEON is available. We 11296 // also expected both operands to be BUILD_VECTORs. 11297 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 11298 || N0.getOpcode() != ISD::BUILD_VECTOR 11299 || N1.getOpcode() != ISD::BUILD_VECTOR) 11300 return SDValue(); 11301 11302 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 11303 EVT VT = N->getValueType(0); 11304 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 11305 return SDValue(); 11306 11307 // Check that the vector operands are of the right form. 11308 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 11309 // operands, where N is the size of the formed vector. 11310 // Each EXTRACT_VECTOR should have the same input vector and odd or even 11311 // index such that we have a pair wise add pattern. 
11312 11313 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. 11314 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11315 return SDValue(); 11316 SDValue Vec = N0->getOperand(0)->getOperand(0); 11317 SDNode *V = Vec.getNode(); 11318 unsigned nextIndex = 0; 11319 11320 // For each operands to the ADD which are BUILD_VECTORs, 11321 // check to see if each of their operands are an EXTRACT_VECTOR with 11322 // the same vector and appropriate index. 11323 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { 11324 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT 11325 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11326 11327 SDValue ExtVec0 = N0->getOperand(i); 11328 SDValue ExtVec1 = N1->getOperand(i); 11329 11330 // First operand is the vector, verify its the same. 11331 if (V != ExtVec0->getOperand(0).getNode() || 11332 V != ExtVec1->getOperand(0).getNode()) 11333 return SDValue(); 11334 11335 // Second is the constant, verify its correct. 11336 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); 11337 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); 11338 11339 // For the constant, we want to see all the even or all the odd. 11340 if (!C0 || !C1 || C0->getZExtValue() != nextIndex 11341 || C1->getZExtValue() != nextIndex+1) 11342 return SDValue(); 11343 11344 // Increment index. 11345 nextIndex+=2; 11346 } else 11347 return SDValue(); 11348 } 11349 11350 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure 11351 // we're using the entire input vector, otherwise there's a size/legality 11352 // mismatch somewhere. 11353 if (nextIndex != Vec.getValueType().getVectorNumElements() || 11354 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 11355 return SDValue(); 11356 11357 // Create VPADDL node. 
11358 SelectionDAG &DAG = DCI.DAG; 11359 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11360 11361 SDLoc dl(N); 11362 11363 // Build operand list. 11364 SmallVector<SDValue, 8> Ops; 11365 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 11366 TLI.getPointerTy(DAG.getDataLayout()))); 11367 11368 // Input is the vector. 11369 Ops.push_back(Vec); 11370 11371 // Get widened type and narrowed type. 11372 MVT widenType; 11373 unsigned numElem = VT.getVectorNumElements(); 11374 11375 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 11376 switch (inputLaneType.getSimpleVT().SimpleTy) { 11377 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 11378 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 11379 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 11380 default: 11381 llvm_unreachable("Invalid vector element type for padd optimization."); 11382 } 11383 11384 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 11385 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 11386 return DAG.getNode(ExtOp, dl, VT, tmp); 11387 } 11388 11389 static SDValue findMUL_LOHI(SDValue V) { 11390 if (V->getOpcode() == ISD::UMUL_LOHI || 11391 V->getOpcode() == ISD::SMUL_LOHI) 11392 return V; 11393 return SDValue(); 11394 } 11395 11396 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 11397 TargetLowering::DAGCombinerInfo &DCI, 11398 const ARMSubtarget *Subtarget) { 11399 if (!Subtarget->hasBaseDSP()) 11400 return SDValue(); 11401 11402 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 11403 // accumulates the product into a 64-bit value. 
The 16-bit values will 11404 // be sign extended somehow or SRA'd into 32-bit values 11405 // (addc (adde (mul 16bit, 16bit), lo), hi) 11406 SDValue Mul = AddcNode->getOperand(0); 11407 SDValue Lo = AddcNode->getOperand(1); 11408 if (Mul.getOpcode() != ISD::MUL) { 11409 Lo = AddcNode->getOperand(0); 11410 Mul = AddcNode->getOperand(1); 11411 if (Mul.getOpcode() != ISD::MUL) 11412 return SDValue(); 11413 } 11414 11415 SDValue SRA = AddeNode->getOperand(0); 11416 SDValue Hi = AddeNode->getOperand(1); 11417 if (SRA.getOpcode() != ISD::SRA) { 11418 SRA = AddeNode->getOperand(1); 11419 Hi = AddeNode->getOperand(0); 11420 if (SRA.getOpcode() != ISD::SRA) 11421 return SDValue(); 11422 } 11423 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 11424 if (Const->getZExtValue() != 31) 11425 return SDValue(); 11426 } else 11427 return SDValue(); 11428 11429 if (SRA.getOperand(0) != Mul) 11430 return SDValue(); 11431 11432 SelectionDAG &DAG = DCI.DAG; 11433 SDLoc dl(AddcNode); 11434 unsigned Opcode = 0; 11435 SDValue Op0; 11436 SDValue Op1; 11437 11438 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 11439 Opcode = ARMISD::SMLALBB; 11440 Op0 = Mul.getOperand(0); 11441 Op1 = Mul.getOperand(1); 11442 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 11443 Opcode = ARMISD::SMLALBT; 11444 Op0 = Mul.getOperand(0); 11445 Op1 = Mul.getOperand(1).getOperand(0); 11446 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 11447 Opcode = ARMISD::SMLALTB; 11448 Op0 = Mul.getOperand(0).getOperand(0); 11449 Op1 = Mul.getOperand(1); 11450 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 11451 Opcode = ARMISD::SMLALTT; 11452 Op0 = Mul->getOperand(0).getOperand(0); 11453 Op1 = Mul->getOperand(1).getOperand(0); 11454 } 11455 11456 if (!Op0 || !Op1) 11457 return SDValue(); 11458 11459 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 11460 Op0, Op1, Lo, Hi); 11461 // 
Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //        loAdd ->  ADDC     |
  //                 \ :carry /
  //                  V      V
  //                    ADDE   <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or
  // subtracts a constant with the exact value of 0x80000000, we recognize we
  // are dealing with a "rounded multiply and add" (or subtract) and transform
  // it into either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node: an ADDE must
  // consume the carry of an ADDC, and a SUBE the borrow of a SUBC.
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be a SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  // The non-MUL operand of the ADDE/SUBE is the high-part addend.
  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC/SUBC we are checking.

  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL, furthermore the LowAddSub must be a
  // constant addition or subtraction with the value of 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function can not handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}

static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL.
Check if Addc uses a node which has already
// been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
// as the addend, and it's handled in PerformUMLALCombine.

  // UMAAL needs the v6 DSP multiply-accumulate instructions; otherwise fall
  // back to the plain MLAL combine.
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMLAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    // UMAAL takes the two 32-bit multiply operands of the UMLAL plus both
    // 32-bit addends (the UMLAL's own addend and the extra high-part addend).
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADDs' nodes uses by the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
                                  SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
                                  SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}

/// Fold an ARMISD::UMLAL whose addend inputs come from a zero ADDC/ADDE pair
/// into a single ARMISD::UMAAL node.
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
  SDNode* AddcNode = N->getOperand(2).getNode();
  SDNode* AddeNode = N->getOperand(3).getNode();
  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
      (AddeNode->getOpcode() == ARMISD::ADDE) &&
      isNullConstant(AddeNode->getOperand(0)) &&
      isNullConstant(AddeNode->getOperand(1)) &&
      (AddeNode->getOperand(2).getNode() == AddcNode))
    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                       DAG.getVTList(MVT::i32, MVT::i32),
                       {N->getOperand(0), N->getOperand(1),
                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
  else
    return SDValue();
}

/// Target combine for ARMISD::ADDC / ARMISD::SUBC nodes.
static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG(DCI.DAG);

  if (N->getOpcode() == ARMISD::SUBC) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS->getOpcode() == ARMISD::ADDE &&
        isNullConstant(LHS->getOperand(0)) &&
        isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
      return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
    }
  }

  if (Subtarget->isThumb1Only()) {
    // On Thumb1, rewrite an add/sub with a negative immediate as the opposite
    // operation with the negated (positive) immediate. Guard against INT_MIN,
    // which cannot be negated.
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int32_t imm =
C->getSExtValue();
      if (imm < 0 && imm > std::numeric_limits<int>::min()) {
        SDLoc DL(N);
        RHS = DAG.getConstant(-imm, DL, MVT::i32);
        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
                                                           : ARMISD::ADDC;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
      }
    }
  }

  return SDValue();
}

/// Target combine for ARMISD::ADDE / ARMISD::SUBE nodes.
static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb1Only()) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int64_t imm = C->getSExtValue();
      if (imm < 0) {
        SDLoc DL(N);

        // The with-carry-in form matches bitwise not instead of the negation.
        // Effectively, the inverse interpretation of the carry flag already
        // accounts for part of the negation.
        RHS = DAG.getConstant(~imm, DL, MVT::i32);

        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
                                                           : ARMISD::ADDE;
        return DAG.getNode(Opcode, DL, N->getVTList(),
                           N->getOperand(0), RHS, N->getOperand(2));
      }
    }
  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  }
  return SDValue();
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
  //
  // We need to re-implement this optimization here as the implementation in the
  // Target-Independent DAGCombiner does not handle the kind of constant we make
  // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
  // good reason, allowing truncation there would break other targets).
  //
  // Currently, this is only done for MVE, as it's the only target that benefits
  // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::XOR)
    return SDValue();
  SDValue XOR = N->getOperand(0);

  // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
  // It is important to check with truncation allowed as the BUILD_VECTORs we
  // generate in those situations will truncate their operands.
  ConstantSDNode *Const =
      isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
                          /*AllowTruncation*/ true);
  if (!Const || !Const->isOne())
    return SDValue();

  // Rewrite into vselect(cond, rhs, lhs).
  SDValue Cond = XOR->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT Type = N->getValueType(0);
  return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
}

/// Expand an ISD::ABS node that is not legal for its value type, using the
/// generic TargetLowering expansion.
static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue res;
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  if (!TLI.expandABS(N, res, DAG))
    return SDValue();

  return res;
}

/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget){
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}

/// Fold an i64 add of an MVE VADDLV/VMLALV-style reduction into the
/// accumulating form of that reduction (e.g. VADDLVs -> VADDLVAs).
static SDValue PerformADDVecReduce(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    // NB must be the BUILD_PAIR of both i32 halves of the reduction NA is
    // added to.
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    SDLoc dl(N);
    // The accumulating opcode takes the two halves of the i64 addend first,
    // followed by the original reduction's operands.
    SmallVector<SDValue, 4> Ops;
    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                                  DCI.DAG.getConstant(0, dl, MVT::i32)));
    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                                  DCI.DAG.getConstant(1, dl, MVT::i32)));
    for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
      Ops.push_back(VecRed->getOperand(i));
    SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
                                  DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                           SDValue(Red.getNode(), 1));
  };

  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  return SDValue();
}

bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    // NOTE(review): this opcode check is redundant with the one above.
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}

static SDValue PerformSHLSimplify(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can be also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - if c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  // Rebuild as (shl (binop x, c1 >> c2), c2) so the user can fold the shift.
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}


/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}

/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
      return Result;

  if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
    return SDValue();

  // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  // so that we can readily pattern match more mve instructions which can use
  // a scalar operand.
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  SDValue VMov = N->getOperand(0);
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  SDLoc dl(N);
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}

/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
//  However, for (A + B) * (A + B),
//    vadd d2, d0, d1
//    vmul d3, d0, d2
//    vmla d3, d1, d2
//  is slower than
//    vadd d2, d0, d1
//    vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  // See the (A + B) * (A + B) note above.
  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

/// Fold a v2i64 multiply of 32-bit sign/zero extended operands into an MVE
/// VMULLs / VMULLu node.
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Returns the top-extended value if Op is a sign extend from 32 bits.
  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
    if (!Subtarget->isLittle())
      return SDValue();

    SDValue And = Op;
    if (And->getOpcode() == ISD::BITCAST)
      And = And->getOperand(0);
    if (And->getOpcode() != ISD::AND)
      return SDValue();
    SDValue Mask = And->getOperand(1);
    if (Mask->getOpcode() == ISD::BITCAST)
      Mask = Mask->getOperand(0);

    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
        Mask.getValueType() != MVT::v4i32)
      return SDValue();
    if (isAllOnesConstant(Mask->getOperand(0)) &&
        isNullConstant(Mask->getOperand(1)) &&
        isAllOnesConstant(Mask->getOperand(2)) &&
        isNullConstant(Mask->getOperand(3)))
      return And->getOperand(0);
    return SDValue();
  };

  SDLoc dl(N);
  if (SDValue Op0 = IsSignExt(N0)) {
    if (SDValue Op1 = IsSignExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
    }
  }
  if (SDValue Op0 = IsZeroExt(N0)) {
    if (SDValue Op1 = IsZeroExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
    }
  }

  return SDValue();
}

static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT ==
MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  // Masking keeps the shift amount in i32 range (countTrailingZeros of 0 is
  // the bit width, 64).
  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  // Strip the power-of-two factor; it is re-applied as a final shift below.
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}

/// Thumb1 combine for (and (shl/srl x, c2), c1): rewrite a shift-then-mask as
/// a pair of shifts to avoid materializing the mask constant c1.
static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = countLeadingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    uint32_t C3 = countTrailingZeros(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t Trailing = countTrailingZeros(C1);
    uint32_t C3 = countLeadingZeros(C1);
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = countLeadingZeros(C1);
    uint32_t C3 = countTrailingZeros(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate
  //        than c1.
  return SDValue();
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;
      // VBIC clears the bits set in its immediate, so the splat is inverted.
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}

// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  // SMULW[BT] requires ARMv6 with the DSP extensions (Thumb2 + DSP when in
  // Thumb mode).
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // The srl/shl pair may appear in either order; try the commuted form.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  // The SRL must consume the low half and the SHL the high half.
  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  // If operand 0 is not the 16-bit side, swap so OpS16 holds the candidate.
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    // SMULWT reads the top half, so drop the explicit sra 16.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  // Return the original node to tell the combiner it is now dead.
  return SDValue(OR, 0);
}

/// PerformORCombineToBFI - Try to turn an OR of masked values into an
/// ARMISD::BFI bitfield insert. Note: the caller (PerformORCombine) only
/// invokes this when N's operand 0 is an ISD::AND with one use, which is why
/// N0.getOperand(0)/getOperand(1) are accessed without an opcode check here.
static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The inserted constant must lie entirely outside the preserved bits.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down to the bitfield's LSB for the BFI encoding.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

// isValidMVECond - Return true if CC is encodable in an MVE VCMP/VCMPZ.
// The unsigned conditions HS/HI are only meaningful for integer compares.
static bool isValidMVECond(unsigned CC, bool IsFloat) {
  switch (CC) {
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::LE:
  case ARMCC::GT:
  case ARMCC::GE:
  case ARMCC::LT:
    return true;
  case ARMCC::HS:
  case ARMCC::HI:
    return !IsFloat;
  default:
    return false;
  };
}

// getVCMPCondCode - Extract the condition-code operand of a VCMP (operand 2)
// or VCMPZ (operand 1) node.
static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
  if (N->getOpcode() == ARMISD::VCMP)
    return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  else if (N->getOpcode() == ARMISD::VCMPZ)
    return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  else
    llvm_unreachable("Not a VCMP/VCMPZ!");
}

// CanInvertMVEVCMP - Can this VCMP/VCMPZ's condition be flipped to its
// opposite and still be encoded as an MVE compare?
static bool CanInvertMVEVCMP(SDValue N) {
  ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
  return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}

static SDValue PerformORCombine_i1(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates (De Morgan: or(A, B) == ~(and(~A, ~B))).
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto IsFreelyInvertable = [&](SDValue V) {
    if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
      return CanInvertMVEVCMP(V);
    return false;
  };

  // At least one operand must be freely invertable.
  if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
    return SDValue();

  SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
  SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
  SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  return DCI.DAG.getLogicalNOT(DL, And, VT);
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val =
          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
                            SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  // MVE predicate vectors get the dedicated i1 handling above.
  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DCI, Subtarget);

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

/// PerformXORCombine - Target-specific dag combine xforms for ISD::XOR.
static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->hasMVEIntegerOps()) {
    // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    const TargetLowering *TLI = Subtarget->getTargetLowering();
    if (TLI->isConstTrueVal(N1.getNode()) &&
        (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
      if (CanInvertMVEVCMP(N0)) {
        SDLoc DL(N0);
        ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));

        // Rebuild the compare with the inverted condition code. VCMPZ has no
        // second vector operand, hence the conditional push below.
        SmallVector<SDValue, 4> Ops;
        Ops.push_back(N0->getOperand(0));
        if (N0->getOpcode() == ARMISD::VCMP)
          Ops.push_back(N0->getOperand(1));
        Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
        return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
      }
    }
  }

  return SDValue();
}

// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
// their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  SDValue From = N->getOperand(1);
  // Operand 2 holds the inverted insertion mask; un-invert it here.
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  // If the Base came from a SHR #C, we can deduce that it is really testing bit
  // #C in the base of the SHR.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B may be zero.
// BitsProperlyConcatenate - True when B's set bits sit immediately below A's:
// the lowest set bit of A (countTrailingZeros) is exactly one above the
// highest set bit of B, so A|B forms one contiguous run.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA = A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
  // if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    // Check both orderings, since either BFI may hold the upper run.
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}

/// PerformBFICombine - Target-specific dag combine xforms for ARMISD::BFI.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    // Operand 2 is the inverted insertion mask; recover the inserted field's
    // LSB and width from it.
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the merged field does not start at bit 0, shift the source down so
    // the BFI reads from its low bits.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
          ISD::SRL, dl, VT, From1,
          DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlignment(), LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 std::min(4U, LD->getAlignment()),
                                 LD->getMemOperand()->getFlags());

    // Reroute the old load's chain users to the second new load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    // On big-endian targets the high word comes first, so swap the halves.
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Look through bitcasts on both operands.
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  // Both operands must be the two results (in order) of one VMOVRRD.
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}

/// PerformVMOVhrCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVhr.
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);

  // VMOVhr (VMOVrh (X)) -> X
  if (Op0->getOpcode() == ARMISD::VMOVrh)
    return Op0->getOperand(0);

  // FullFP16: half values are passed in S-registers, and we don't
  // need any of the bitcast and moves:
  //
  // t2: f32,ch = CopyFromReg t0, Register:f32 %0
  //   t5: i32 = bitcast t2
  // t18: f16 = ARMISD::VMOVhr t5
  if (Op0->getOpcode() == ISD::BITCAST) {
    SDValue Copy = Op0->getOperand(0);
    if (Copy.getValueType() == MVT::f32 &&
        Copy->getOpcode() == ISD::CopyFromReg) {
      // Re-issue the CopyFromReg directly as f16, bypassing the bitcast.
      SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
      SDValue NewCopy =
          DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), MVT::f16, Ops);
      return NewCopy;
    }
  }

  // fold (VMOVhr (load x)) -> (load (f16*)x)
  if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
    if (LN0->hasOneUse() && LN0->isUnindexed() &&
        LN0->getMemoryVT() == MVT::i16) {
      SDValue Load = DCI.DAG.getLoad(MVT::f16, SDLoc(N), LN0->getChain(),
                                     LN0->getBasePtr(), LN0->getMemOperand());
      // Replace both the value and the chain of the old load.
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
      DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
      return Load;
    }
  }

  // Only the bottom 16 bits of the source register are used.
  APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

/// PerformVMOVrhCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVrh.
static SDValue PerformVMOVrhCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (VMOVrh (load x)) -> (zextload (i16*)x)
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);

    SDValue Load =
        DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
                           LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
    // Replace both the value and the chain of the old load.
    DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
    DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
    return Load;
  }

  // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
  if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N0->getOperand(1)))
    return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
                           N0->getOperand(1));

  return SDValue();
}

/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
13158 static bool hasNormalLoadOperand(SDNode *N) { 13159 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 13160 for (unsigned i = 0; i < NumElts; ++i) { 13161 SDNode *Elt = N->getOperand(i).getNode(); 13162 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 13163 return true; 13164 } 13165 return false; 13166 } 13167 13168 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 13169 /// ISD::BUILD_VECTOR. 13170 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 13171 TargetLowering::DAGCombinerInfo &DCI, 13172 const ARMSubtarget *Subtarget) { 13173 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 13174 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 13175 // into a pair of GPRs, which is fine when the value is used as a scalar, 13176 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 13177 SelectionDAG &DAG = DCI.DAG; 13178 if (N->getNumOperands() == 2) 13179 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 13180 return RV; 13181 13182 // Load i64 elements as f64 values so that type legalization does not split 13183 // them up into i32 values. 13184 EVT VT = N->getValueType(0); 13185 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 13186 return SDValue(); 13187 SDLoc dl(N); 13188 SmallVector<SDValue, 8> Ops; 13189 unsigned NumElts = VT.getVectorNumElements(); 13190 for (unsigned i = 0; i < NumElts; ++i) { 13191 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 13192 Ops.push_back(V); 13193 // Make the DAGCombiner fold the bitcast. 13194 DCI.AddToWorklist(V.getNode()); 13195 } 13196 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 13197 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 13198 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 13199 } 13200 13201 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR in something more vector friendly, i.e., that does not
  // force to use floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to a integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands are 32-bits (64-bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    // Undef lanes need no insert; the UNDEF base already covers them.
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}

/// Target-specific dag combine xforms for ARMISD::PREDICATE_CAST.
static SDValue
PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
  if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  }

  return SDValue();
}

/// Target-specific dag combine xforms for ARMISD::VECTOR_REG_CAST.
static SDValue
PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  if (ST->isLittle())
    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);

  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  }

  return SDValue();
}

/// Target-specific dag combine xforms for ARMISD::VCMP (MVE only).
static SDValue PerformVCMPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  ARMCC::CondCodes Cond =
      (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  SDLoc dl(N);

  // vcmp X, 0, cc -> vcmpz X, cc
  if (isZeroVector(Op1))
    return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
                           N->getOperand(2));

  // The swapped condition must still be encodable as an MVE compare.
  unsigned SwappedCond = getSwappedCondition(Cond);
  if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
    // vcmp 0, X, cc -> vcmpz X, reversed(cc)
    if (isZeroVector(Op0))
      return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
                             DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
    // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
    if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
      return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
                             DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
  }

  return SDValue();
}

/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
13371 EVT VT = N->getValueType(0); 13372 SDNode *Elt = N->getOperand(1).getNode(); 13373 if (VT.getVectorElementType() != MVT::i64 || 13374 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 13375 return SDValue(); 13376 13377 SelectionDAG &DAG = DCI.DAG; 13378 SDLoc dl(N); 13379 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 13380 VT.getVectorNumElements()); 13381 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 13382 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 13383 // Make the DAGCombiner fold the bitcasts. 13384 DCI.AddToWorklist(Vec.getNode()); 13385 DCI.AddToWorklist(V.getNode()); 13386 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 13387 Vec, V, N->getOperand(2)); 13388 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 13389 } 13390 13391 static SDValue PerformExtractEltCombine(SDNode *N, 13392 TargetLowering::DAGCombinerInfo &DCI) { 13393 SDValue Op0 = N->getOperand(0); 13394 EVT VT = N->getValueType(0); 13395 SDLoc dl(N); 13396 13397 // extract (vdup x) -> x 13398 if (Op0->getOpcode() == ARMISD::VDUP) { 13399 SDValue X = Op0->getOperand(0); 13400 if (VT == MVT::f16 && X.getValueType() == MVT::i32) 13401 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); 13402 if (VT == MVT::i32 && X.getValueType() == MVT::f16) 13403 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); 13404 13405 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) 13406 X = X->getOperand(0); 13407 if (X.getValueType() == VT) 13408 return X; 13409 } 13410 13411 return SDValue(); 13412 } 13413 13414 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 13415 /// ISD::VECTOR_SHUFFLE. 13416 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 13417 // The LLVM shufflevector instruction does not require the shuffle mask 13418 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 13419 // have that requirement. 
When translating to ISD::VECTOR_SHUFFLE, if the 13420 // operands do not match the mask length, they are extended by concatenating 13421 // them with undef vectors. That is probably the right thing for other 13422 // targets, but for NEON it is better to concatenate two double-register 13423 // size vector operands into a single quad-register size vector. Do that 13424 // transformation here: 13425 // shuffle(concat(v1, undef), concat(v2, undef)) -> 13426 // shuffle(concat(v1, v2), undef) 13427 SDValue Op0 = N->getOperand(0); 13428 SDValue Op1 = N->getOperand(1); 13429 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 13430 Op1.getOpcode() != ISD::CONCAT_VECTORS || 13431 Op0.getNumOperands() != 2 || 13432 Op1.getNumOperands() != 2) 13433 return SDValue(); 13434 SDValue Concat0Op1 = Op0.getOperand(1); 13435 SDValue Concat1Op1 = Op1.getOperand(1); 13436 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 13437 return SDValue(); 13438 // Skip the transformation if any of the types are illegal. 13439 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13440 EVT VT = N->getValueType(0); 13441 if (!TLI.isTypeLegal(VT) || 13442 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 13443 !TLI.isTypeLegal(Concat1Op1.getValueType())) 13444 return SDValue(); 13445 13446 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 13447 Op0.getOperand(0), Op1.getOperand(0)); 13448 // Translate the shuffle mask. 
13449 SmallVector<int, 16> NewMask; 13450 unsigned NumElts = VT.getVectorNumElements(); 13451 unsigned HalfElts = NumElts/2; 13452 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 13453 for (unsigned n = 0; n < NumElts; ++n) { 13454 int MaskElt = SVN->getMaskElt(n); 13455 int NewElt = -1; 13456 if (MaskElt < (int)HalfElts) 13457 NewElt = MaskElt; 13458 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 13459 NewElt = HalfElts + MaskElt - NumElts; 13460 NewMask.push_back(NewElt); 13461 } 13462 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 13463 DAG.getUNDEF(VT), NewMask); 13464 } 13465 13466 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 13467 /// NEON load/store intrinsics, and generic vector load/stores, to merge 13468 /// base address updates. 13469 /// For generic load/stores, the memory type is assumed to be a vector. 13470 /// The caller is assumed to have checked legality. 13471 static SDValue CombineBaseUpdate(SDNode *N, 13472 TargetLowering::DAGCombinerInfo &DCI) { 13473 SelectionDAG &DAG = DCI.DAG; 13474 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 13475 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 13476 const bool isStore = N->getOpcode() == ISD::STORE; 13477 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 13478 SDValue Addr = N->getOperand(AddrOpIdx); 13479 MemSDNode *MemN = cast<MemSDNode>(N); 13480 SDLoc dl(N); 13481 13482 // Search for a use of the address operand that is an increment. 13483 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 13484 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 13485 SDNode *User = *UI; 13486 if (User->getOpcode() != ISD::ADD || 13487 UI.getUse().getResNo() != Addr.getResNo()) 13488 continue; 13489 13490 // Check that the add is independent of the load/store. Otherwise, folding 13491 // it would create a cycle. We can avoid searching through Addr as it's a 13492 // predecessor to both. 
13493 SmallPtrSet<const SDNode *, 32> Visited; 13494 SmallVector<const SDNode *, 16> Worklist; 13495 Visited.insert(Addr.getNode()); 13496 Worklist.push_back(N); 13497 Worklist.push_back(User); 13498 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 13499 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 13500 continue; 13501 13502 // Find the new opcode for the updating load/store. 13503 bool isLoadOp = true; 13504 bool isLaneOp = false; 13505 unsigned NewOpc = 0; 13506 unsigned NumVecs = 0; 13507 if (isIntrinsic) { 13508 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 13509 switch (IntNo) { 13510 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 13511 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 13512 NumVecs = 1; break; 13513 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 13514 NumVecs = 2; break; 13515 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 13516 NumVecs = 3; break; 13517 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 13518 NumVecs = 4; break; 13519 case Intrinsic::arm_neon_vld2dup: 13520 case Intrinsic::arm_neon_vld3dup: 13521 case Intrinsic::arm_neon_vld4dup: 13522 // TODO: Support updating VLDxDUP nodes. For now, we just skip 13523 // combining base updates for such intrinsics. 
13524 continue; 13525 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 13526 NumVecs = 2; isLaneOp = true; break; 13527 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 13528 NumVecs = 3; isLaneOp = true; break; 13529 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 13530 NumVecs = 4; isLaneOp = true; break; 13531 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 13532 NumVecs = 1; isLoadOp = false; break; 13533 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 13534 NumVecs = 2; isLoadOp = false; break; 13535 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 13536 NumVecs = 3; isLoadOp = false; break; 13537 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 13538 NumVecs = 4; isLoadOp = false; break; 13539 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 13540 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 13541 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 13542 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 13543 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 13544 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 13545 } 13546 } else { 13547 isLaneOp = true; 13548 switch (N->getOpcode()) { 13549 default: llvm_unreachable("unexpected opcode for Neon base update"); 13550 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 13551 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 13552 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 13553 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 13554 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 13555 NumVecs = 1; isLaneOp = false; break; 13556 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 13557 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 13558 } 13559 } 13560 13561 // Find the size of memory referenced by the load/store. 
13562 EVT VecTy; 13563 if (isLoadOp) { 13564 VecTy = N->getValueType(0); 13565 } else if (isIntrinsic) { 13566 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 13567 } else { 13568 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 13569 VecTy = N->getOperand(1).getValueType(); 13570 } 13571 13572 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 13573 if (isLaneOp) 13574 NumBytes /= VecTy.getVectorNumElements(); 13575 13576 // If the increment is a constant, it must match the memory ref size. 13577 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 13578 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 13579 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 13580 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 13581 // separate instructions that make it harder to use a non-constant update. 13582 continue; 13583 } 13584 13585 // OK, we found an ADD we can fold into the base update. 13586 // Now, create a _UPD node, taking care of not breaking alignment. 13587 13588 EVT AlignedVecTy = VecTy; 13589 unsigned Alignment = MemN->getAlignment(); 13590 13591 // If this is a less-than-standard-aligned load/store, change the type to 13592 // match the standard alignment. 13593 // The alignment is overlooked when selecting _UPD variants; and it's 13594 // easier to introduce bitcasts here than fix that. 13595 // There are 3 ways to get to this base-update combine: 13596 // - intrinsics: they are assumed to be properly aligned (to the standard 13597 // alignment of the memory type), so we don't need to do anything. 13598 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 13599 // intrinsics, so, likewise, there's nothing to do. 13600 // - generic load/store instructions: the alignment is specified as an 13601 // explicit operand, rather than implicitly as the standard alignment 13602 // of the memory type (like the intrisics). 
We need to change the 13603 // memory type to match the explicit alignment. That way, we don't 13604 // generate non-standard-aligned ARMISD::VLDx nodes. 13605 if (isa<LSBaseSDNode>(N)) { 13606 if (Alignment == 0) 13607 Alignment = 1; 13608 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 13609 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 13610 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 13611 assert(!isLaneOp && "Unexpected generic load/store lane."); 13612 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 13613 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 13614 } 13615 // Don't set an explicit alignment on regular load/stores that we want 13616 // to transform to VLD/VST 1_UPD nodes. 13617 // This matches the behavior of regular load/stores, which only get an 13618 // explicit alignment if the MMO alignment is larger than the standard 13619 // alignment of the memory type. 13620 // Intrinsics, however, always get an explicit alignment, set to the 13621 // alignment of the MMO. 13622 Alignment = 1; 13623 } 13624 13625 // Create the new updating load/store node. 13626 // First, create an SDVTList for the new updating node's results. 13627 EVT Tys[6]; 13628 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 13629 unsigned n; 13630 for (n = 0; n < NumResultVecs; ++n) 13631 Tys[n] = AlignedVecTy; 13632 Tys[n++] = MVT::i32; 13633 Tys[n] = MVT::Other; 13634 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 13635 13636 // Then, gather the new node's operands. 13637 SmallVector<SDValue, 8> Ops; 13638 Ops.push_back(N->getOperand(0)); // incoming chain 13639 Ops.push_back(N->getOperand(AddrOpIdx)); 13640 Ops.push_back(Inc); 13641 13642 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 13643 // Try to match the intrinsic's signature 13644 Ops.push_back(StN->getValue()); 13645 } else { 13646 // Loads (and of course intrinsics) match the intrinsics' signature, 13647 // so just add all but the alignment operand. 
13648 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 13649 Ops.push_back(N->getOperand(i)); 13650 } 13651 13652 // For all node types, the alignment operand is always the last one. 13653 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 13654 13655 // If this is a non-standard-aligned STORE, the penultimate operand is the 13656 // stored value. Bitcast it to the aligned type. 13657 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 13658 SDValue &StVal = Ops[Ops.size()-2]; 13659 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 13660 } 13661 13662 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 13663 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 13664 MemN->getMemOperand()); 13665 13666 // Update the uses. 13667 SmallVector<SDValue, 5> NewResults; 13668 for (unsigned i = 0; i < NumResultVecs; ++i) 13669 NewResults.push_back(SDValue(UpdN.getNode(), i)); 13670 13671 // If this is an non-standard-aligned LOAD, the first result is the loaded 13672 // value. Bitcast it to the expected result type. 
13673 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 13674 SDValue &LdVal = NewResults[0]; 13675 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 13676 } 13677 13678 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 13679 DCI.CombineTo(N, NewResults); 13680 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 13681 13682 break; 13683 } 13684 return SDValue(); 13685 } 13686 13687 static SDValue PerformVLDCombine(SDNode *N, 13688 TargetLowering::DAGCombinerInfo &DCI) { 13689 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13690 return SDValue(); 13691 13692 return CombineBaseUpdate(N, DCI); 13693 } 13694 13695 static SDValue PerformMVEVLDCombine(SDNode *N, 13696 TargetLowering::DAGCombinerInfo &DCI) { 13697 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13698 return SDValue(); 13699 13700 SelectionDAG &DAG = DCI.DAG; 13701 SDValue Addr = N->getOperand(2); 13702 MemSDNode *MemN = cast<MemSDNode>(N); 13703 SDLoc dl(N); 13704 13705 // For the stores, where there are multiple intrinsics we only actually want 13706 // to post-inc the last of the them. 13707 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 13708 if (IntNo == Intrinsic::arm_mve_vst2q && 13709 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1) 13710 return SDValue(); 13711 if (IntNo == Intrinsic::arm_mve_vst4q && 13712 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3) 13713 return SDValue(); 13714 13715 // Search for a use of the address operand that is an increment. 13716 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 13717 UE = Addr.getNode()->use_end(); 13718 UI != UE; ++UI) { 13719 SDNode *User = *UI; 13720 if (User->getOpcode() != ISD::ADD || 13721 UI.getUse().getResNo() != Addr.getResNo()) 13722 continue; 13723 13724 // Check that the add is independent of the load/store. Otherwise, folding 13725 // it would create a cycle. 
We can avoid searching through Addr as it's a 13726 // predecessor to both. 13727 SmallPtrSet<const SDNode *, 32> Visited; 13728 SmallVector<const SDNode *, 16> Worklist; 13729 Visited.insert(Addr.getNode()); 13730 Worklist.push_back(N); 13731 Worklist.push_back(User); 13732 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 13733 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 13734 continue; 13735 13736 // Find the new opcode for the updating load/store. 13737 bool isLoadOp = true; 13738 unsigned NewOpc = 0; 13739 unsigned NumVecs = 0; 13740 switch (IntNo) { 13741 default: 13742 llvm_unreachable("unexpected intrinsic for MVE VLDn combine"); 13743 case Intrinsic::arm_mve_vld2q: 13744 NewOpc = ARMISD::VLD2_UPD; 13745 NumVecs = 2; 13746 break; 13747 case Intrinsic::arm_mve_vld4q: 13748 NewOpc = ARMISD::VLD4_UPD; 13749 NumVecs = 4; 13750 break; 13751 case Intrinsic::arm_mve_vst2q: 13752 NewOpc = ARMISD::VST2_UPD; 13753 NumVecs = 2; 13754 isLoadOp = false; 13755 break; 13756 case Intrinsic::arm_mve_vst4q: 13757 NewOpc = ARMISD::VST4_UPD; 13758 NumVecs = 4; 13759 isLoadOp = false; 13760 break; 13761 } 13762 13763 // Find the size of memory referenced by the load/store. 13764 EVT VecTy; 13765 if (isLoadOp) { 13766 VecTy = N->getValueType(0); 13767 } else { 13768 VecTy = N->getOperand(3).getValueType(); 13769 } 13770 13771 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 13772 13773 // If the increment is a constant, it must match the memory ref size. 13774 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 13775 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 13776 if (!CInc || CInc->getZExtValue() != NumBytes) 13777 continue; 13778 13779 // Create the new updating load/store node. 13780 // First, create an SDVTList for the new updating node's results. 13781 EVT Tys[6]; 13782 unsigned NumResultVecs = (isLoadOp ? 
NumVecs : 0); 13783 unsigned n; 13784 for (n = 0; n < NumResultVecs; ++n) 13785 Tys[n] = VecTy; 13786 Tys[n++] = MVT::i32; 13787 Tys[n] = MVT::Other; 13788 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 13789 13790 // Then, gather the new node's operands. 13791 SmallVector<SDValue, 8> Ops; 13792 Ops.push_back(N->getOperand(0)); // incoming chain 13793 Ops.push_back(N->getOperand(2)); // ptr 13794 Ops.push_back(Inc); 13795 13796 for (unsigned i = 3; i < N->getNumOperands(); ++i) 13797 Ops.push_back(N->getOperand(i)); 13798 13799 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, 13800 MemN->getMemOperand()); 13801 13802 // Update the uses. 13803 SmallVector<SDValue, 5> NewResults; 13804 for (unsigned i = 0; i < NumResultVecs; ++i) 13805 NewResults.push_back(SDValue(UpdN.getNode(), i)); 13806 13807 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain 13808 DCI.CombineTo(N, NewResults); 13809 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 13810 13811 break; 13812 } 13813 13814 return SDValue(); 13815 } 13816 13817 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 13818 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 13819 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 13820 /// return true. 13821 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 13822 SelectionDAG &DAG = DCI.DAG; 13823 EVT VT = N->getValueType(0); 13824 // vldN-dup instructions only support 64-bit vectors for N > 1. 13825 if (!VT.is64BitVector()) 13826 return false; 13827 13828 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 
13829 SDNode *VLD = N->getOperand(0).getNode(); 13830 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 13831 return false; 13832 unsigned NumVecs = 0; 13833 unsigned NewOpc = 0; 13834 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 13835 if (IntNo == Intrinsic::arm_neon_vld2lane) { 13836 NumVecs = 2; 13837 NewOpc = ARMISD::VLD2DUP; 13838 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 13839 NumVecs = 3; 13840 NewOpc = ARMISD::VLD3DUP; 13841 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 13842 NumVecs = 4; 13843 NewOpc = ARMISD::VLD4DUP; 13844 } else { 13845 return false; 13846 } 13847 13848 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 13849 // numbers match the load. 13850 unsigned VLDLaneNo = 13851 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 13852 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13853 UI != UE; ++UI) { 13854 // Ignore uses of the chain result. 13855 if (UI.getUse().getResNo() == NumVecs) 13856 continue; 13857 SDNode *User = *UI; 13858 if (User->getOpcode() != ARMISD::VDUPLANE || 13859 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 13860 return false; 13861 } 13862 13863 // Create the vldN-dup node. 13864 EVT Tys[5]; 13865 unsigned n; 13866 for (n = 0; n < NumVecs; ++n) 13867 Tys[n] = VT; 13868 Tys[n] = MVT::Other; 13869 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 13870 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 13871 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 13872 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 13873 Ops, VLDMemInt->getMemoryVT(), 13874 VLDMemInt->getMemOperand()); 13875 13876 // Update the uses. 13877 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 13878 UI != UE; ++UI) { 13879 unsigned ResNo = UI.getUse().getResNo(); 13880 // Ignore uses of the chain result. 
13881 if (ResNo == NumVecs) 13882 continue; 13883 SDNode *User = *UI; 13884 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 13885 } 13886 13887 // Now the vldN-lane intrinsic is dead except for its chain result. 13888 // Update uses of the chain. 13889 std::vector<SDValue> VLDDupResults; 13890 for (unsigned n = 0; n < NumVecs; ++n) 13891 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 13892 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 13893 DCI.CombineTo(VLD, VLDDupResults); 13894 13895 return true; 13896 } 13897 13898 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 13899 /// ARMISD::VDUPLANE. 13900 static SDValue PerformVDUPLANECombine(SDNode *N, 13901 TargetLowering::DAGCombinerInfo &DCI, 13902 const ARMSubtarget *Subtarget) { 13903 SDValue Op = N->getOperand(0); 13904 EVT VT = N->getValueType(0); 13905 13906 // On MVE, we just convert the VDUPLANE to a VDUP with an extract. 13907 if (Subtarget->hasMVEIntegerOps()) { 13908 EVT ExtractVT = VT.getVectorElementType(); 13909 // We need to ensure we are creating a legal type. 13910 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) 13911 ExtractVT = MVT::i32; 13912 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, 13913 N->getOperand(0), N->getOperand(1)); 13914 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); 13915 } 13916 13917 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 13918 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 13919 if (CombineVLDDUP(N, DCI)) 13920 return SDValue(N, 0); 13921 13922 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 13923 // redundant. Ignore bit_converts for now; element sizes are checked below. 
13924 while (Op.getOpcode() == ISD::BITCAST) 13925 Op = Op.getOperand(0); 13926 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 13927 return SDValue(); 13928 13929 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 13930 unsigned EltSize = Op.getScalarValueSizeInBits(); 13931 // The canonical VMOV for a zero vector uses a 32-bit element size. 13932 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13933 unsigned EltBits; 13934 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) 13935 EltSize = 8; 13936 if (EltSize > VT.getScalarSizeInBits()) 13937 return SDValue(); 13938 13939 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 13940 } 13941 13942 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 13943 static SDValue PerformVDUPCombine(SDNode *N, 13944 TargetLowering::DAGCombinerInfo &DCI, 13945 const ARMSubtarget *Subtarget) { 13946 SelectionDAG &DAG = DCI.DAG; 13947 SDValue Op = N->getOperand(0); 13948 SDLoc dl(N); 13949 13950 if (Subtarget->hasMVEIntegerOps()) { 13951 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will 13952 // need to come from a GPR. 13953 if (Op.getValueType() == MVT::f32) 13954 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 13955 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); 13956 else if (Op.getValueType() == MVT::f16) 13957 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), 13958 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); 13959 } 13960 13961 if (!Subtarget->hasNEON()) 13962 return SDValue(); 13963 13964 // Match VDUP(LOAD) -> VLD1DUP. 13965 // We match this pattern here rather than waiting for isel because the 13966 // transform is only legal for unindexed loads. 
13967 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 13968 if (LD && Op.hasOneUse() && LD->isUnindexed() && 13969 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 13970 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 13971 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 13972 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 13973 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 13974 Ops, LD->getMemoryVT(), 13975 LD->getMemOperand()); 13976 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 13977 return VLDDup; 13978 } 13979 13980 return SDValue(); 13981 } 13982 13983 static SDValue PerformLOADCombine(SDNode *N, 13984 TargetLowering::DAGCombinerInfo &DCI) { 13985 EVT VT = N->getValueType(0); 13986 13987 // If this is a legal vector load, try to combine it into a VLD1_UPD. 13988 if (ISD::isNormalLoad(N) && VT.isVector() && 13989 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13990 return CombineBaseUpdate(N, DCI); 13991 13992 return SDValue(); 13993 } 13994 13995 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 13996 // pack all of the elements in one place. Next, store to memory in fewer 13997 // chunks. 
/// PerformTruncatingStoreCombine - Turn a vector truncating store into a
/// shuffle that packs the narrowed elements at the bottom of a wide register,
/// followed by one or more plain (non-truncating) stores of legal integer
/// chunks.
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
                                             SelectionDAG &DAG) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (!St->isTruncatingStore() || !VT.isVector())
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT StVT = St->getMemoryVT();
  unsigned NumElems = VT.getVectorNumElements();
  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromEltSz = VT.getScalarSizeInBits();
  unsigned ToEltSz = StVT.getScalarSizeInBits();

  // From, To sizes and ElemCount must be pow of two
  if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
    return SDValue();

  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  if (0 != (NumElems * FromEltSz) % ToEltSz)
    return SDValue();

  unsigned SizeRatio = FromEltSz / ToEltSz;
  assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                   NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDLoc DL(St);
  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  // Select the sub-element of each wide lane that holds the truncated value:
  // the last sub-element on big-endian, the first on little-endian.
  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
                                                      : i * SizeRatio;

  // Can't shuffle using an illegal type.
  if (!TLI.isTypeLegal(WideVecVT))
    return SDValue();

  SDValue Shuff = DAG.getVectorShuffle(
      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  // At this point all of the data is stored at the bottom of the
  // register. We now need to save it to mem.

  // Find the largest store unit
  MVT StoreType = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
      StoreType = Tp;
  }
  // Didn't find a legal store type.
  if (!TLI.isTypeLegal(StoreType))
    return SDValue();

  // Bitcast the original vector into a vector of store-size units
  EVT StoreVecVT =
      EVT::getVectorVT(*DAG.getContext(), StoreType,
                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  SmallVector<SDValue, 8> Chains;
  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue BasePtr = St->getBasePtr();

  // Perform one or more big stores into memory.
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  // Join the individual store chains back into a single chain result.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from an truncate (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
/// Split a simple, unindexed store of a TRUNCATE into several smaller
/// truncating stores (MVE), unless the truncate could instead become a VMOVN
/// via a shuffle, in which case the store is left alone.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::TRUNCATE)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  // NumElements is the number of source lanes per split piece; only the
  // i32->i16/i8 and i16->i8 truncations are handled.
  unsigned NumElements = 0;
  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
    NumElements = 4;
  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
    NumElements = 8;
  // Bail out if unsupported, if the store already fits in one piece, or if
  // the lane count does not split evenly.
  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
  // use the VMOVN over splitting the store. We are looking for patterns of:
  // !rev: 0 N 1 N+1 2 N+2 ...
  //  rev: N 0 N+1 1 N+2 2 ...
  auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
    unsigned NumElts = ToVT.getVectorNumElements();
    if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8))
      return false;

    unsigned Off0 = rev ? NumElts : 0;
    unsigned Off1 = rev ? 0 : NumElts;

    // Negative (undef) mask entries are allowed to match anything.
    for (unsigned i = 0; i < NumElts; i += 2) {
      if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
        return false;
      if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
        return false;
    }

    return true;
  };

  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
    if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
        isVMOVNOriginalMask(Shuffle->getMask(), true))
      return SDValue();

  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    // Byte offset of this piece within the original store.
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));
    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment.value(), MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  // Volatile stores must not be reordered or split.
  if (St->isVolatile())
    return SDValue();
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();

  if (Subtarget->hasNEON())
    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
      return Store;

  if (Subtarget->hasMVEIntegerOps())
    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
      return NewToken;

  // The combines below only apply to normal (non-truncating, unindexed)
  // stores.
  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    // Store the two GPR halves separately; operand order depends on
    // endianness so the memory image is unchanged.
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
14240 /// 14241 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 14242 /// vmul.f32 d16, d17, d16 14243 /// vcvt.s32.f32 d16, d16 14244 /// becomes: 14245 /// vcvt.s32.f32 d16, d16, #3 14246 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 14247 const ARMSubtarget *Subtarget) { 14248 if (!Subtarget->hasNEON()) 14249 return SDValue(); 14250 14251 SDValue Op = N->getOperand(0); 14252 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 14253 Op.getOpcode() != ISD::FMUL) 14254 return SDValue(); 14255 14256 SDValue ConstVec = Op->getOperand(1); 14257 if (!isa<BuildVectorSDNode>(ConstVec)) 14258 return SDValue(); 14259 14260 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 14261 uint32_t FloatBits = FloatTy.getSizeInBits(); 14262 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 14263 uint32_t IntBits = IntTy.getSizeInBits(); 14264 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 14265 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 14266 // These instructions only exist converting from f32 to i32. We can handle 14267 // smaller integers by generating an extra truncate, but larger ones would 14268 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 14269 // these intructions only support v2i32/v4i32 types. 14270 return SDValue(); 14271 } 14272 14273 BitVector UndefElements; 14274 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 14275 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 14276 if (C == -1 || C == 0 || C > 32) 14277 return SDValue(); 14278 14279 SDLoc dl(N); 14280 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 14281 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 14282 Intrinsic::arm_neon_vcvtfp2fxu; 14283 SDValue FixConv = DAG.getNode( 14284 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, 14285 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 14286 DAG.getConstant(C, dl, MVT::i32)); 14287 14288 if (IntBits < FloatBits) 14289 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 14290 14291 return FixConv; 14292 } 14293 14294 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 14295 /// can replace combinations of VCVT (integer to floating-point) and VDIV 14296 /// when the VDIV has a constant operand that is a power of 2. 14297 /// 14298 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 14299 /// vcvt.f32.s32 d16, d16 14300 /// vdiv.f32 d16, d17, d16 14301 /// becomes: 14302 /// vcvt.f32.s32 d16, d16, #3 14303 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 14304 const ARMSubtarget *Subtarget) { 14305 if (!Subtarget->hasNEON()) 14306 return SDValue(); 14307 14308 SDValue Op = N->getOperand(0); 14309 unsigned OpOpcode = Op.getNode()->getOpcode(); 14310 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 14311 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 14312 return SDValue(); 14313 14314 SDValue ConstVec = N->getOperand(1); 14315 if (!isa<BuildVectorSDNode>(ConstVec)) 14316 return SDValue(); 14317 14318 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 14319 uint32_t FloatBits = FloatTy.getSizeInBits(); 14320 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 14321 uint32_t IntBits = IntTy.getSizeInBits(); 14322 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 14323 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 14324 // These instructions only exist converting from i32 to f32. We can handle 14325 // smaller integers by generating an extra extend, but larger ones would 14326 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 14327 // these intructions only support v2i32/v4i32 types. 
14328 return SDValue(); 14329 } 14330 14331 BitVector UndefElements; 14332 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 14333 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 14334 if (C == -1 || C == 0 || C > 32) 14335 return SDValue(); 14336 14337 SDLoc dl(N); 14338 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 14339 SDValue ConvInput = Op.getOperand(0); 14340 if (IntBits < FloatBits) 14341 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 14342 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 14343 ConvInput); 14344 14345 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 14346 Intrinsic::arm_neon_vcvtfxu2fp; 14347 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 14348 Op.getValueType(), 14349 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 14350 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 14351 } 14352 14353 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 14354 const ARMSubtarget *ST) { 14355 if (!ST->hasMVEIntegerOps()) 14356 return SDValue(); 14357 14358 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 14359 EVT ResVT = N->getValueType(0); 14360 SDValue N0 = N->getOperand(0); 14361 SDLoc dl(N); 14362 14363 // We are looking for something that will have illegal types if left alone, 14364 // but that we can convert to a single instruction undef MVE. 
For example 14365 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 14366 // or 14367 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 14368 14369 // Cases: 14370 // VADDV u/s 8/16/32 14371 // VMLAV u/s 8/16/32 14372 // VADDLV u/s 32 14373 // VMLALV u/s 16/32 14374 14375 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 14376 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 14377 return SDValue(); 14378 SDValue A = N0->getOperand(0); 14379 if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) 14380 return A; 14381 return SDValue(); 14382 }; 14383 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 14384 SDValue &A, SDValue &B) { 14385 if (ResVT != RetTy || N0->getOpcode() != ISD::MUL) 14386 return false; 14387 SDValue ExtA = N0->getOperand(0); 14388 SDValue ExtB = N0->getOperand(1); 14389 if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) 14390 return false; 14391 A = ExtA->getOperand(0); 14392 B = ExtB->getOperand(0); 14393 if (A.getValueType() == B.getValueType() && 14394 llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) 14395 return true; 14396 return false; 14397 }; 14398 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 14399 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 14400 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 14401 SDValue(Node.getNode(), 1)); 14402 }; 14403 14404 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 14405 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 14406 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 14407 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 14408 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) 14409 return Create64bitNode(ARMISD::VADDLVs, {A}); 14410 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) 14411 return 
Create64bitNode(ARMISD::VADDLVu, {A}); 14412 14413 SDValue A, B; 14414 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 14415 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 14416 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 14417 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 14418 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) 14419 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 14420 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) 14421 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 14422 return SDValue(); 14423 } 14424 14425 static SDValue PerformVMOVNCombine(SDNode *N, 14426 TargetLowering::DAGCombinerInfo &DCI) { 14427 SDValue Op0 = N->getOperand(0); 14428 SDValue Op1 = N->getOperand(1); 14429 unsigned IsTop = N->getConstantOperandVal(2); 14430 14431 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b) 14432 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b) 14433 if ((Op1->getOpcode() == ARMISD::VQMOVNs || 14434 Op1->getOpcode() == ARMISD::VQMOVNu) && 14435 Op1->getConstantOperandVal(2) == 0) 14436 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0), 14437 Op0, Op1->getOperand(1), N->getOperand(2)); 14438 14439 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from 14440 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting 14441 // into the top or bottom lanes. 14442 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 14443 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1)); 14444 APInt Op0DemandedElts = 14445 IsTop ? 
Op1DemandedElts 14446 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1)); 14447 14448 APInt KnownUndef, KnownZero; 14449 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14450 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef, 14451 KnownZero, DCI)) 14452 return SDValue(N, 0); 14453 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef, 14454 KnownZero, DCI)) 14455 return SDValue(N, 0); 14456 14457 return SDValue(); 14458 } 14459 14460 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { 14461 SDLoc DL(N); 14462 SDValue Op0 = N->getOperand(0); 14463 SDValue Op1 = N->getOperand(1); 14464 14465 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from 14466 // uses of the intrinsics. 14467 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 14468 int ShiftAmt = C->getSExtValue(); 14469 if (ShiftAmt == 0) { 14470 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL); 14471 DAG.ReplaceAllUsesWith(N, Merge.getNode()); 14472 return SDValue(); 14473 } 14474 14475 if (ShiftAmt >= -32 && ShiftAmt < 0) { 14476 unsigned NewOpcode = 14477 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL; 14478 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1, 14479 DAG.getConstant(-ShiftAmt, DL, MVT::i32)); 14480 DAG.ReplaceAllUsesWith(N, NewShift.getNode()); 14481 return NewShift; 14482 } 14483 } 14484 14485 return SDValue(); 14486 } 14487 14488 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 14489 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, 14490 DAGCombinerInfo &DCI) const { 14491 SelectionDAG &DAG = DCI.DAG; 14492 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 14493 switch (IntNo) { 14494 default: 14495 // Don't do anything for most intrinsics. 14496 break; 14497 14498 // Vector shifts: check for immediate versions and lower them. 
14499 // Note: This is done during DAG combining instead of DAG legalizing because 14500 // the build_vectors for 64-bit vector element shift counts are generally 14501 // not legal, and it is hard to see their values after they get legalized to 14502 // loads from a constant pool. 14503 case Intrinsic::arm_neon_vshifts: 14504 case Intrinsic::arm_neon_vshiftu: 14505 case Intrinsic::arm_neon_vrshifts: 14506 case Intrinsic::arm_neon_vrshiftu: 14507 case Intrinsic::arm_neon_vrshiftn: 14508 case Intrinsic::arm_neon_vqshifts: 14509 case Intrinsic::arm_neon_vqshiftu: 14510 case Intrinsic::arm_neon_vqshiftsu: 14511 case Intrinsic::arm_neon_vqshiftns: 14512 case Intrinsic::arm_neon_vqshiftnu: 14513 case Intrinsic::arm_neon_vqshiftnsu: 14514 case Intrinsic::arm_neon_vqrshiftns: 14515 case Intrinsic::arm_neon_vqrshiftnu: 14516 case Intrinsic::arm_neon_vqrshiftnsu: { 14517 EVT VT = N->getOperand(1).getValueType(); 14518 int64_t Cnt; 14519 unsigned VShiftOpc = 0; 14520 14521 switch (IntNo) { 14522 case Intrinsic::arm_neon_vshifts: 14523 case Intrinsic::arm_neon_vshiftu: 14524 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 14525 VShiftOpc = ARMISD::VSHLIMM; 14526 break; 14527 } 14528 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 14529 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
ARMISD::VSHRsIMM 14530 : ARMISD::VSHRuIMM); 14531 break; 14532 } 14533 return SDValue(); 14534 14535 case Intrinsic::arm_neon_vrshifts: 14536 case Intrinsic::arm_neon_vrshiftu: 14537 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 14538 break; 14539 return SDValue(); 14540 14541 case Intrinsic::arm_neon_vqshifts: 14542 case Intrinsic::arm_neon_vqshiftu: 14543 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 14544 break; 14545 return SDValue(); 14546 14547 case Intrinsic::arm_neon_vqshiftsu: 14548 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 14549 break; 14550 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 14551 14552 case Intrinsic::arm_neon_vrshiftn: 14553 case Intrinsic::arm_neon_vqshiftns: 14554 case Intrinsic::arm_neon_vqshiftnu: 14555 case Intrinsic::arm_neon_vqshiftnsu: 14556 case Intrinsic::arm_neon_vqrshiftns: 14557 case Intrinsic::arm_neon_vqrshiftnu: 14558 case Intrinsic::arm_neon_vqrshiftnsu: 14559 // Narrowing shifts require an immediate right shift. 14560 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 14561 break; 14562 llvm_unreachable("invalid shift count for narrowing vector shift " 14563 "intrinsic"); 14564 14565 default: 14566 llvm_unreachable("unhandled vector shift"); 14567 } 14568 14569 switch (IntNo) { 14570 case Intrinsic::arm_neon_vshifts: 14571 case Intrinsic::arm_neon_vshiftu: 14572 // Opcode already set above. 
14573 break; 14574 case Intrinsic::arm_neon_vrshifts: 14575 VShiftOpc = ARMISD::VRSHRsIMM; 14576 break; 14577 case Intrinsic::arm_neon_vrshiftu: 14578 VShiftOpc = ARMISD::VRSHRuIMM; 14579 break; 14580 case Intrinsic::arm_neon_vrshiftn: 14581 VShiftOpc = ARMISD::VRSHRNIMM; 14582 break; 14583 case Intrinsic::arm_neon_vqshifts: 14584 VShiftOpc = ARMISD::VQSHLsIMM; 14585 break; 14586 case Intrinsic::arm_neon_vqshiftu: 14587 VShiftOpc = ARMISD::VQSHLuIMM; 14588 break; 14589 case Intrinsic::arm_neon_vqshiftsu: 14590 VShiftOpc = ARMISD::VQSHLsuIMM; 14591 break; 14592 case Intrinsic::arm_neon_vqshiftns: 14593 VShiftOpc = ARMISD::VQSHRNsIMM; 14594 break; 14595 case Intrinsic::arm_neon_vqshiftnu: 14596 VShiftOpc = ARMISD::VQSHRNuIMM; 14597 break; 14598 case Intrinsic::arm_neon_vqshiftnsu: 14599 VShiftOpc = ARMISD::VQSHRNsuIMM; 14600 break; 14601 case Intrinsic::arm_neon_vqrshiftns: 14602 VShiftOpc = ARMISD::VQRSHRNsIMM; 14603 break; 14604 case Intrinsic::arm_neon_vqrshiftnu: 14605 VShiftOpc = ARMISD::VQRSHRNuIMM; 14606 break; 14607 case Intrinsic::arm_neon_vqrshiftnsu: 14608 VShiftOpc = ARMISD::VQRSHRNsuIMM; 14609 break; 14610 } 14611 14612 SDLoc dl(N); 14613 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 14614 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 14615 } 14616 14617 case Intrinsic::arm_neon_vshiftins: { 14618 EVT VT = N->getOperand(1).getValueType(); 14619 int64_t Cnt; 14620 unsigned VShiftOpc = 0; 14621 14622 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 14623 VShiftOpc = ARMISD::VSLIIMM; 14624 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 14625 VShiftOpc = ARMISD::VSRIIMM; 14626 else { 14627 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 14628 } 14629 14630 SDLoc dl(N); 14631 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 14632 N->getOperand(1), N->getOperand(2), 14633 DAG.getConstant(Cnt, dl, MVT::i32)); 14634 } 14635 14636 case Intrinsic::arm_neon_vqrshifts: 14637 case 
Intrinsic::arm_neon_vqrshiftu: 14638 // No immediate versions of these to check for. 14639 break; 14640 14641 case Intrinsic::arm_mve_vqdmlah: 14642 case Intrinsic::arm_mve_vqdmlash: 14643 case Intrinsic::arm_mve_vqrdmlah: 14644 case Intrinsic::arm_mve_vqrdmlash: 14645 case Intrinsic::arm_mve_vmla_n_predicated: 14646 case Intrinsic::arm_mve_vmlas_n_predicated: 14647 case Intrinsic::arm_mve_vqdmlah_predicated: 14648 case Intrinsic::arm_mve_vqdmlash_predicated: 14649 case Intrinsic::arm_mve_vqrdmlah_predicated: 14650 case Intrinsic::arm_mve_vqrdmlash_predicated: { 14651 // These intrinsics all take an i32 scalar operand which is narrowed to the 14652 // size of a single lane of the vector type they return. So we don't need 14653 // any bits of that operand above that point, which allows us to eliminate 14654 // uxth/sxth. 14655 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); 14656 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 14657 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI)) 14658 return SDValue(); 14659 break; 14660 } 14661 14662 case Intrinsic::arm_mve_minv: 14663 case Intrinsic::arm_mve_maxv: 14664 case Intrinsic::arm_mve_minav: 14665 case Intrinsic::arm_mve_maxav: 14666 case Intrinsic::arm_mve_minv_predicated: 14667 case Intrinsic::arm_mve_maxv_predicated: 14668 case Intrinsic::arm_mve_minav_predicated: 14669 case Intrinsic::arm_mve_maxav_predicated: { 14670 // These intrinsics all take an i32 scalar operand which is narrowed to the 14671 // size of a single lane of the vector type they take as the other input. 
14672 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); 14673 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); 14674 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 14675 return SDValue(); 14676 break; 14677 } 14678 14679 case Intrinsic::arm_mve_addv: { 14680 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, 14681 // which allow PerformADDVecReduce to turn it into VADDLV when possible. 14682 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14683 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; 14684 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); 14685 } 14686 14687 case Intrinsic::arm_mve_addlv: 14688 case Intrinsic::arm_mve_addlv_predicated: { 14689 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR 14690 // which recombines the two outputs into an i64 14691 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14692 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? 14693 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : 14694 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps); 14695 14696 SmallVector<SDValue, 4> Ops; 14697 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) 14698 if (i != 2) // skip the unsigned flag 14699 Ops.push_back(N->getOperand(i)); 14700 14701 SDLoc dl(N); 14702 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); 14703 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), 14704 val.getValue(1)); 14705 } 14706 } 14707 14708 return SDValue(); 14709 } 14710 14711 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 14712 /// lowers them. 
/// As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
    }
  }

  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = countLeadingZeros(AndMask);
      if (MaskedBits > ShiftAmt) {
        // Replace (shl (and x, mask), amt) with a left shift that clears the
        // masked-off high bits followed by a right shift.
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
    return SDValue();

  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

// Look for a sign/zero extend of a larger than legal load. This can be split
// into two extending loads, which are simpler to deal with than an arbitrary
// sign extend.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::LOAD)
    return SDValue();
  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
  // The load must be simple, non-indexed, non-extending, and only used by
  // this extend, so both halves can safely replace it.
  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();
  EVT FromVT = LD->getValueType(0);
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
    NumElements = 4;
  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
    NumElements = 8;
  if (NumElements == 0 ||
      FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0 ||
      !isPowerOf2_32(NumElements))
    return SDValue();

  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

  ISD::LoadExtType NewExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

  // Split the load in half, each side of which is extended separately. This
  // is good enough, as legalisation will take it from there. They are either
  // already legal or they will be split further into something that is
  // legal.
  SDValue NewLoad1 = DAG.getLoad(
      ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
      LD->getPointerInfo(), NewFromVT, Alignment.value(), MMOFlags, AAInfo);
  SDValue NewLoad2 =
      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                  Alignment.value(), MMOFlags, AAInfo);

  // Merge the two new chains and redirect users of the old load's chain to
  // it before replacing the value result.
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                 SDValue(NewLoad1.getNode(), 1),
                                 SDValue(NewLoad2.getNode(), 1));
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        // ANY_EXTEND leaves the high bits unspecified, so the zeroing lane
        // get is a valid implementation of it as well.
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}

/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
/// saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  // MVE-only combine: turn min/max clamp patterns into VQMOVN saturates.
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  if (VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  // Match smin(smax(x, -C-1), C) / smax(smin(x, C), -C-1) where C is the
  // signed saturation limit for the half-width element type.
  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
    // Check one is a smin and the other is a smax
    if (Min->getOpcode() != ISD::SMIN)
      std::swap(Min, Max);
    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
      return false;

    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 15) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 7) - 1, true);

    APInt MinC, MaxC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    // ~SaturateC is -C-1, the matching lower clamp bound.
    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
        MaxC != ~SaturateC)
      return false;
    return true;
  };

  if (IsSignedSaturate(N, N0.getNode())) {
    SDLoc DL(N);
    MVT ExtVT, HalfVT;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtVT = MVT::v4i16;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtVT = MVT::v8i8;
    }

    // Create a VQMOVNB with undef top lanes, then signed extended into the top
    // half. That extend will hopefully be removed if only the bottom bits are
    // demanded (though a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
                       DAG.getValueType(ExtVT));
  }

  auto IsUnsignedSaturate = [&](SDNode *Min) {
    // For unsigned, we just need to check for <= 0xffff
    if (Min->getOpcode() != ISD::UMIN)
      return false;

    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 16) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 8) - 1, true);

    APInt MinC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    return true;
  };

  if (IsUnsignedSaturate(N)) {
    SDLoc DL(N);
    MVT HalfVT;
    unsigned ExtConst;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtConst = 0x0000FFFF;
    } else { //if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtConst = 0x00FF;
    }

    // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
    // an AND. That extend will hopefully be removed if only the bottom bits are
    // demanded (though a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
                    DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
                       DAG.getConstant(ExtConst, DL, VT));
  }

  return SDValue();
}

// Return a pointer to V's value if V is a constant power of two, otherwise
// null. The pointer refers to the node's internal APInt storage.
static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  // CN must be a single bit (power of two mask).
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  // The un-ORed operand of the CMOV must be the value being ORed into.
  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  // One BFI per set bit in CM; beyond this many, TST+OR(+IT) is no worse.
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  // Insert bit BitInX of X into each set bit position of OrCI with a BFI.
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used.
// We need to handle patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
// CC, Imm and Negate are updated to describe how the branch condition relates
// to the intrinsic's result as the wrappers are peeled off.
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    // (xor cond, 1) inverts the condition; record that and recurse.
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    // Only compares against 0 or 1 are recognised.
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isNullValue())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_set_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check that how the brcond is using the result of each
  // of the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isNullValue())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);

  // Does this CC/Imm pair mean "take the branch when the counter is zero"?
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  // Does this CC/Imm pair mean "take the branch when the counter is non-zero"?
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
          && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_set_loop_iterations) {
    SDValue Res;
    // We expect this 'instruction' to branch when the counter is zero.
    // NOTE(review): ARMISD::WLS/LE look like the low-overhead-loop start/end
    // nodes — confirm their exact semantics in ARMISelLowering.h.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = { Chain, Elements, Dest };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = { Chain, Elements, OtherTarget };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Detach the intrinsic's chain; it is fully replaced by the WLS node.
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
    return Res;
  } else {
    SDValue Size = DAG.getTargetConstant(
      cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  // Unreachable: both branches above return.
  return SDValue();
}

/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }

  return SDValue();
}

/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  if (!VT.isInteger())
    return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        // FalseVal is the constant 0 here, so this is 0 - (x - y).
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      // Record the rewritten false value for the Thumb1 combine below.
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially, the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (SUBCARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      // Deliberate assignment-in-condition: captures z's APInt when matched.
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}

// Target-specific DAG combining for ISD::BITCAST.
static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST) {
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  }

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
    Src = Src.getOperand(0);

  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  // would be generated is at least the width of the element type.
15512 EVT SrcVT = Src.getValueType(); 15513 if ((Src.getOpcode() == ARMISD::VMOVIMM || 15514 Src.getOpcode() == ARMISD::VMVNIMM || 15515 Src.getOpcode() == ARMISD::VMOVFPIMM) && 15516 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && 15517 DAG.getDataLayout().isBigEndian()) 15518 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); 15519 15520 return SDValue(); 15521 } 15522 15523 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 15524 DAGCombinerInfo &DCI) const { 15525 switch (N->getOpcode()) { 15526 default: break; 15527 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); 15528 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 15529 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 15530 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 15531 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 15532 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 15533 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 15534 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 15535 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 15536 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 15537 case ISD::BRCOND: 15538 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 15539 case ARMISD::ADDC: 15540 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 15541 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 15542 case ARMISD::BFI: return PerformBFICombine(N, DCI); 15543 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 15544 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 15545 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); 15546 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI); 15547 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 15548 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 
15549 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 15550 case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI); 15551 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 15552 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); 15553 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 15554 case ISD::FP_TO_SINT: 15555 case ISD::FP_TO_UINT: 15556 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 15557 case ISD::FDIV: 15558 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 15559 case ISD::INTRINSIC_WO_CHAIN: 15560 return PerformIntrinsicCombine(N, DCI); 15561 case ISD::SHL: 15562 case ISD::SRA: 15563 case ISD::SRL: 15564 return PerformShiftCombine(N, DCI, Subtarget); 15565 case ISD::SIGN_EXTEND: 15566 case ISD::ZERO_EXTEND: 15567 case ISD::ANY_EXTEND: 15568 return PerformExtendCombine(N, DCI.DAG, Subtarget); 15569 case ISD::SMIN: 15570 case ISD::UMIN: 15571 case ISD::SMAX: 15572 case ISD::UMAX: 15573 return PerformMinMaxCombine(N, DCI.DAG, Subtarget); 15574 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 15575 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 15576 case ISD::LOAD: return PerformLOADCombine(N, DCI); 15577 case ARMISD::VLD1DUP: 15578 case ARMISD::VLD2DUP: 15579 case ARMISD::VLD3DUP: 15580 case ARMISD::VLD4DUP: 15581 return PerformVLDCombine(N, DCI); 15582 case ARMISD::BUILD_VECTOR: 15583 return PerformARMBUILD_VECTORCombine(N, DCI); 15584 case ISD::BITCAST: 15585 return PerformBITCASTCombine(N, DCI.DAG, Subtarget); 15586 case ARMISD::PREDICATE_CAST: 15587 return PerformPREDICATE_CASTCombine(N, DCI); 15588 case ARMISD::VECTOR_REG_CAST: 15589 return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); 15590 case ARMISD::VCMP: 15591 return PerformVCMPCombine(N, DCI, Subtarget); 15592 case ISD::VECREDUCE_ADD: 15593 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); 15594 case ARMISD::VMOVN: 15595 return PerformVMOVNCombine(N, DCI); 
15596 case ARMISD::ASRL: 15597 case ARMISD::LSRL: 15598 case ARMISD::LSLL: 15599 return PerformLongShiftCombine(N, DCI.DAG); 15600 case ARMISD::SMULWB: { 15601 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 15602 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 15603 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 15604 return SDValue(); 15605 break; 15606 } 15607 case ARMISD::SMULWT: { 15608 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 15609 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 15610 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 15611 return SDValue(); 15612 break; 15613 } 15614 case ARMISD::SMLALBB: 15615 case ARMISD::QADD16b: 15616 case ARMISD::QSUB16b: { 15617 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 15618 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 15619 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 15620 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 15621 return SDValue(); 15622 break; 15623 } 15624 case ARMISD::SMLALBT: { 15625 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 15626 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 15627 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 15628 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 15629 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 15630 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 15631 return SDValue(); 15632 break; 15633 } 15634 case ARMISD::SMLALTB: { 15635 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 15636 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 15637 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 15638 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 15639 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 15640 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 15641 return SDValue(); 15642 
break; 15643 } 15644 case ARMISD::SMLALTT: { 15645 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 15646 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 15647 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 15648 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 15649 return SDValue(); 15650 break; 15651 } 15652 case ARMISD::QADD8b: 15653 case ARMISD::QSUB8b: { 15654 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 15655 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 15656 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 15657 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 15658 return SDValue(); 15659 break; 15660 } 15661 case ISD::INTRINSIC_VOID: 15662 case ISD::INTRINSIC_W_CHAIN: 15663 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 15664 case Intrinsic::arm_neon_vld1: 15665 case Intrinsic::arm_neon_vld1x2: 15666 case Intrinsic::arm_neon_vld1x3: 15667 case Intrinsic::arm_neon_vld1x4: 15668 case Intrinsic::arm_neon_vld2: 15669 case Intrinsic::arm_neon_vld3: 15670 case Intrinsic::arm_neon_vld4: 15671 case Intrinsic::arm_neon_vld2lane: 15672 case Intrinsic::arm_neon_vld3lane: 15673 case Intrinsic::arm_neon_vld4lane: 15674 case Intrinsic::arm_neon_vld2dup: 15675 case Intrinsic::arm_neon_vld3dup: 15676 case Intrinsic::arm_neon_vld4dup: 15677 case Intrinsic::arm_neon_vst1: 15678 case Intrinsic::arm_neon_vst1x2: 15679 case Intrinsic::arm_neon_vst1x3: 15680 case Intrinsic::arm_neon_vst1x4: 15681 case Intrinsic::arm_neon_vst2: 15682 case Intrinsic::arm_neon_vst3: 15683 case Intrinsic::arm_neon_vst4: 15684 case Intrinsic::arm_neon_vst2lane: 15685 case Intrinsic::arm_neon_vst3lane: 15686 case Intrinsic::arm_neon_vst4lane: 15687 return PerformVLDCombine(N, DCI); 15688 case Intrinsic::arm_mve_vld2q: 15689 case Intrinsic::arm_mve_vld4q: 15690 case Intrinsic::arm_mve_vst2q: 15691 case Intrinsic::arm_mve_vst4q: 15692 return PerformMVEVLDCombine(N, DCI); 15693 
    default: break;
    }
    break;
  }
  return SDValue();
}

/// Only f32 loads/stores are profitably performed as integer (GPR) ops:
/// the value can then live in a core register, avoiding a VFP transfer.
/// Every other opcode/type combination keeps its floating-point form.
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

/// Returns true if the target supports a misaligned memory access of type
/// \p VT; when it does, *Fast (if non-null) is additionally set to indicate
/// whether the unaligned access is also fast on this subtarget.
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LRDB, LRDH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  // Everything below here is MVE-only.
  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}

/// Pick a wide vector type for lowering memcpy/zero-memset when NEON is
/// available and implicit FP use is permitted; otherwise defer to the
/// target-independent logic by returning MVT::Other.
EVT ARMTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Op.size() >= 8 &&
               (Op.isAligned(Align(8)) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

// EVT variant of the above: only a scalar i64 -> i32 truncate is free.
bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

/// A zero-extension of a narrow integer load is free: the load itself
/// already produces zeros in the high bits.
bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions.  For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::f16:
    return Subtarget->hasFullFP16();
  }

  return false;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  // An "ext doubled" instruction widens its scalar element size exactly 2x.
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  if (!I->getType()->isVectorTy())
    return false;

  if (Subtarget->hasNEON()) {
    switch (I->getOpcode()) {
    case Instruction::Sub:
    case Instruction::Add: {
      // Sink both widening extends so they fold into vaddl/vsubl-style ops.
      if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
        return false;
      Ops.push_back(&I->getOperandUse(0));
      Ops.push_back(&I->getOperandUse(1));
      return true;
    }
    default:
      return false;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // An FMul whose single use is the subtrahend of an FSub (i.e. it will be
  // selected as a fused multiply-subtract).
  auto IsFMSMul = [&](Instruction *I) {
    if (!I->hasOneUse())
      return false;
    auto *Sub = cast<Instruction>(*I->users().begin());
    return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
  };
  // An fma call with a negated multiplicand (an FMS form).
  auto IsFMS = [&](Instruction *I) {
    if (match(I->getOperand(0), m_FNeg(m_Value())) ||
        match(I->getOperand(1), m_FNeg(m_Value())))
      return true;
    return false;
  };

  // Whether instruction I can fold a sunk splat into its operand #Operand.
  auto IsSinker = [&](Instruction *I, int Operand) {
    switch (I->getOpcode()) {
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::FAdd:
    case Instruction::ICmp:
    case Instruction::FCmp:
      return true;
    case Instruction::FMul:
      return !IsFMSMul(I);
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      // Non-commutative ops can only take the splat on the RHS.
      return Operand == 1;
    case Instruction::Call:
      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::fma:
          return !IsFMS(I);
        default:
          return false;
        }
      }
      return false;
    default:
      return false;
    }
  };

  for (auto OpIdx : enumerate(I->operands())) {
    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    // Look through a bitcast to find the underlying shuffle.
    Instruction *Shuffle = Op;
    if (Shuffle->getOpcode() == Instruction::BitCast)
      Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
    // We are looking for a splat that can be sunk.
    if (!Shuffle ||
        !match(Shuffle, m_Shuffle(
                            m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                            m_Undef(), m_ZeroMask())))
      continue;
    if (!IsSinker(I, OpIdx.index()))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
    // and vector registers
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!IsSinker(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Shuffle->getOperandUse(0));
    if (Shuffle != Op)
      Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}

/// For MVE, splats of f32/f16 are better represented as splats of the
/// same-width integer type (the bit pattern is identical); return that
/// integer type, or nullptr to leave the splat alone.
Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
  if (!Subtarget->hasMVEIntegerOps())
    return nullptr;
  Type *SVIType = SVI->getType();
  Type *ScalarType = SVIType->getScalarType();

  if (ScalarType->isFloatTy())
    return Type::getInt32Ty(SVIType->getContext());
  if (ScalarType->isHalfTy())
    return Type::getInt16Ty(SVIType->getContext());
  return nullptr;
}

bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
    if (Ld->isExpandingLoad())
      return false;
  }

  if (Subtarget->hasMVEIntegerOps())
    return true;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what.  There can be two uses by the same instruction.
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

/// Cost of using AM's scaling factor: 0 when the mode is legal (1 for a
/// negative offset on FPAO subtargets, where positive offsets execute
/// faster), or -1 when the addressing mode is not legal at all.
int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    // Vector FMA is only available with the MVE float extension.
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();
  default:
    break;
  }

  return false;
}

/// Thumb1 load/store offset legality: the immediate must be non-negative,
/// a multiple of the access size, and fit in 5 bits after scaling
/// (i.e. the LDRB/LDRH/LDR imm5 offset field).
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    // Scale == 4;
    Scale = 4;
    break;
  }

  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}

/// Thumb2 load/store offset legality for the various instruction classes
/// (MVE size*imm7, half-precision VLDR 2*imm8, VLDR/LDRD 4*imm8, and the
/// integer +imm12 / -imm8 forms).
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    case MVT::i8:
      return isUInt<7>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = - V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(V);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(V);
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
      return false;
    return isShiftedUInt<8, 2>(V);
  }
}

/// Whether AM's register scale is representable in a Thumb2 addressing mode
/// (plain r+r, or r + r << imm for the byte/half/word types).
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}

bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    // ARM mode scaled-register legality.
    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}

/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which can
/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = std::abs(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}

/// Split an ARM-mode indexed address into Base + Offset, setting isInc for
/// increment vs decrement; returns false if the pointer is not an ADD/SUB
/// or the type has no ARM indexed addressing mode.
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // If the LHS is a shifted value, prefer it as the offset so the shift
      // can fold into the addressing mode.
      ARM_AM::ShiftOpc ShOpcVal=
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}

/// Thumb2 variant: only constant offsets that fit the 8-bit pre/post-index
/// field are accepted.
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
  }

  return false;
}

/// MVE variant: the constant offset must fit a scaled imm7 field; the scale
/// depends on the (possibly re-typed) element size.
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
                                      bool isSEXTLoad, bool IsMasked, bool isLE,
                                      SDValue &Base, SDValue &Offset,
                                      bool &isInc, SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;
  if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
    return false;

  // We allow LE non-masked loads to change the type (for example use a vldrb.8
  // as opposed to a vldrw.32). This can allow extra addressing modes or
  // alignments for what is otherwise an equivalent instruction.
  bool CanChangeType = isLE && !IsMasked;

  ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  int RHSC = (int)RHS->getZExtValue();

  // Accept a constant offset representable as +/- imm7 * Scale; fills in
  // isInc/Offset on success.
  auto IsInRange = [&](int RHSC, int Limit, int Scale) {
    if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
    return false;
  };

  // Try to find a matching instruction based on s/zext, Alignment, Offset and
  // (in BE/masked) type.
  Base = Ptr->getOperand(0);
  if (VT == MVT::v4i16) {
    if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
      return true;
  } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
    if (IsInRange(RHSC, 0x80, 1))
      return true;
  } else if (Align >= 4 &&
             (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
             IsInRange(RHSC, 0x80, 4))
    return true;
  else if (Align >= 2 &&
           (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
           IsInRange(RHSC, 0x80, 2))
    return true;
  else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
    return true;
  return false;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
16486 bool 16487 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 16488 SDValue &Offset, 16489 ISD::MemIndexedMode &AM, 16490 SelectionDAG &DAG) const { 16491 if (Subtarget->isThumb1Only()) 16492 return false; 16493 16494 EVT VT; 16495 SDValue Ptr; 16496 unsigned Align; 16497 bool isSEXTLoad = false; 16498 bool IsMasked = false; 16499 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 16500 Ptr = LD->getBasePtr(); 16501 VT = LD->getMemoryVT(); 16502 Align = LD->getAlignment(); 16503 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 16504 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 16505 Ptr = ST->getBasePtr(); 16506 VT = ST->getMemoryVT(); 16507 Align = ST->getAlignment(); 16508 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 16509 Ptr = LD->getBasePtr(); 16510 VT = LD->getMemoryVT(); 16511 Align = LD->getAlignment(); 16512 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 16513 IsMasked = true; 16514 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 16515 Ptr = ST->getBasePtr(); 16516 VT = ST->getMemoryVT(); 16517 Align = ST->getAlignment(); 16518 IsMasked = true; 16519 } else 16520 return false; 16521 16522 bool isInc; 16523 bool isLegal = false; 16524 if (VT.isVector()) 16525 isLegal = Subtarget->hasMVEIntegerOps() && 16526 getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, 16527 IsMasked, Subtarget->isLittle(), Base, 16528 Offset, isInc, DAG); 16529 else { 16530 if (Subtarget->isThumb2()) 16531 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 16532 Offset, isInc, DAG); 16533 else 16534 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 16535 Offset, isInc, DAG); 16536 } 16537 if (!isLegal) 16538 return false; 16539 16540 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 16541 return true; 16542 } 16543 16544 /// getPostIndexedAddressParts - returns true by value, base pointer and 16545 /// offset pointer and addressing mode by reference if this node can be 16546 /// combined with a load / store to form a post-indexed load / store. 16547 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 16548 SDValue &Base, 16549 SDValue &Offset, 16550 ISD::MemIndexedMode &AM, 16551 SelectionDAG &DAG) const { 16552 EVT VT; 16553 SDValue Ptr; 16554 unsigned Align; 16555 bool isSEXTLoad = false, isNonExt; 16556 bool IsMasked = false; 16557 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 16558 VT = LD->getMemoryVT(); 16559 Ptr = LD->getBasePtr(); 16560 Align = LD->getAlignment(); 16561 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 16562 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 16563 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 16564 VT = ST->getMemoryVT(); 16565 Ptr = ST->getBasePtr(); 16566 Align = ST->getAlignment(); 16567 isNonExt = !ST->isTruncatingStore(); 16568 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 16569 VT = LD->getMemoryVT(); 16570 Ptr = LD->getBasePtr(); 16571 Align = LD->getAlignment(); 16572 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 16573 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 16574 IsMasked = true; 16575 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 16576 VT = ST->getMemoryVT(); 16577 Ptr = ST->getBasePtr(); 16578 Align = ST->getAlignment(); 16579 isNonExt = !ST->isTruncatingStore(); 16580 IsMasked = true; 16581 } else 16582 return false; 16583 16584 if (Subtarget->isThumb1Only()) { 16585 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 16586 // must be non-extending/truncating, i32, with an offset of 4. 
16587 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 16588 if (Op->getOpcode() != ISD::ADD || !isNonExt) 16589 return false; 16590 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 16591 if (!RHS || RHS->getZExtValue() != 4) 16592 return false; 16593 16594 Offset = Op->getOperand(1); 16595 Base = Op->getOperand(0); 16596 AM = ISD::POST_INC; 16597 return true; 16598 } 16599 16600 bool isInc; 16601 bool isLegal = false; 16602 if (VT.isVector()) 16603 isLegal = Subtarget->hasMVEIntegerOps() && 16604 getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, 16605 Subtarget->isLittle(), Base, Offset, 16606 isInc, DAG); 16607 else { 16608 if (Subtarget->isThumb2()) 16609 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 16610 isInc, DAG); 16611 else 16612 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 16613 isInc, DAG); 16614 } 16615 if (!isLegal) 16616 return false; 16617 16618 if (Ptr != Base) { 16619 // Swap base ptr and offset to catch more post-index load / store when 16620 // it's legal. In Thumb2 mode, offset must be an immediate. 16621 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 16622 !Subtarget->isThumb2()) 16623 std::swap(Base, Offset); 16624 16625 // Post-indexed load / store update the base pointer. 16626 if (Ptr != Base) 16627 return false; 16628 } 16629 16630 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 16631 return true; 16632 } 16633 16634 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 16635 KnownBits &Known, 16636 const APInt &DemandedElts, 16637 const SelectionDAG &DAG, 16638 unsigned Depth) const { 16639 unsigned BitWidth = Known.getBitWidth(); 16640 Known.resetAll(); 16641 switch (Op.getOpcode()) { 16642 default: break; 16643 case ARMISD::ADDC: 16644 case ARMISD::ADDE: 16645 case ARMISD::SUBC: 16646 case ARMISD::SUBE: 16647 // Special cases when we convert a carry to a boolean. 
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    // Keep only the bits that both arms of the CMOV agree on.
    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // Bits above the memory width of an ldrex/ldaex result are known zero.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    // Only the extracted lane is demanded from the source vector.
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
    (void)SrcSz;
    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    // Widen the lane's known bits per the signedness of the VGETLANE variant.
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  case ARMISD::VMOVrh: {
    // VMOVrh moves a 16-bit value into a 32-bit GPR; the top half is zeroed.
    KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    assert(KnownOp.getBitWidth() == 16);
    Known = KnownOp.zext(32);
    break;
  }
  }
}

bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &DemandedAPInt,
                                                TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  // ShrunkMask clears the non-demanded bits of the mask; ExpandedMask sets
  // them. Any mask between the two is behaviorally equivalent for the
  // demanded bits.
  unsigned Demanded = DemandedAPInt.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  // A candidate mask is legal if it lies between ShrunkMask and ExpandedMask.
  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}

bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  case ARMISD::ASRL:
  case ARMISD::LSRL: {
    // If this is result 0 and the other result is unused, see if the demand
    // bits allow us to shrink this long shift into a standard small shift in
    // the opposite direction.
    if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
        isa<ConstantSDNode>(Op->getOperand(2))) {
      unsigned ShAmt = Op->getConstantOperandVal(2);
      // Only the top ShAmt bits are demanded, so a plain SHL of the high
      // input word by (32 - ShAmt) produces them.
      if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
                            APInt::getAllOnesValue(32) << (32 - ShAmt)))
        return TLO.CombineTo(
            Op, TLO.DAG.getNode(
                    ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
                    TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
    }
    break;
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // Re-split the single statement into mnemonic and operands.
    AsmStr = std::string(AsmPieces[0]);
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      // Replace the asm with the llvm.bswap intrinsic on i32 operands.
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  unsigned S = Constraint.size();
  if (S == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Immediate; // Constant for movw.
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (S == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'T': return C_RegisterClass;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
    // If we don't have a value, we can't do a match,
    // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    // 'l' is a more specific class (low registers) on Thumb; elsewhere it is
    // just a general register.
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}

using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  switch (Constraint.size()) {
  case 1:
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      if (Subtarget->isThumb1Only())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      // VFP/NEON register, chosen by the operand's size.
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      // Like 'w' but restricted to the lower registers (d0-d7 / q0-q3).
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32 || VT == MVT::i32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_VFP2RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_VFP2RegClass);
      break;
    }
    break;

  case 2:
    if (Constraint[0] == 'T') {
      switch (Constraint[1]) {
      default:
        break;
      case 'e': // Even-numbered Thumb GPRs.
        return RCPair(0U, &ARM::tGPREvenRegClass);
      case 'o': // Odd-numbered Thumb GPRs.
        return RCPair(0U, &ARM::tGPROddRegClass);
      }
    }
    break;

  default:
    break;
  }

  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits.  Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          // NOTE(review): the upper bound is exclusive (CVal < 7), so 7
          // itself is rejected — long-standing behavior; confirm intended.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32.  This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    // The constant passed all the checks for its letter; emit it as a target
    // constant operand.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

// Select the RTLIB divrem libcall matching the node's signedness and simple
// value type.
static RTLIB::Libcall getDivRemLibcall(
    const SDNode *N, MVT::SimpleValueType SVT) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  RTLIB::Libcall LC;
  switch (SVT) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64: LC = isSigned ?
                                 RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}

// Build the libcall argument list from the divrem node's operands, marking
// each argument for sign- or zero-extension per the node's signedness.
static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  // On Windows the first two arguments are swapped, presumably to match the
  // operand order the Windows divmod helper expects.
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}

SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //    div = a / b
  //    rem = a - b * div
  //    return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  // No hardware divide: call the divrem runtime helper instead.
  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // The helper returns the {div, rem} pair as a two-element struct.
  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}

// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:   RetTyElement =
                                  Type::getInt8Ty(*DAG.getContext()); break;
  case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  // The helper returns a {div, rem} struct of two identical elements.
  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                                                  SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}

SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    // No stack probe requested: adjust (and optionally realign) SP inline.
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  // The Windows stack probe (WIN__CHKSTK) is passed the allocation size in
  // words (hence the shift by 2) via R4.
  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  // Read the adjusted stack pointer back after the probe.
  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}

SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ?
                                           1 : 0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  // Converting from 32 -> 64 is valid if we have FP64.
  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
    // FIXME: Remove this when we have strict fp instruction selection patterns
    if (IsStrict) {
      SDLoc Loc(Op);
      SDValue Result = DAG.getNode(ISD::FP_EXTEND,
                                   Loc, Op.getValueType(), SrcVal);
      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
    }
    return Op;
  }

  // Either we are converting from 16 -> 64, without FP16 and/or
  // FP.double-precision or without Armv8-fp. So we must do it in two
  // steps.
  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
  // without FP16. So we must do a function call.
  SDLoc Loc(Op);
  RTLIB::Libcall LC;
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  // Widen one power-of-two step at a time (16->32, then 32->64), using a
  // native extend where the subtarget supports it and a libcall otherwise.
  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
    if (Supported) {
      if (IsStrict) {
        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
                             {DstVT, MVT::Other}, {Chain, SrcVal});
        Chain = SrcVal.getValue(1);
      } else {
        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
      }
    } else {
      LC = RTLIB::getFPEXT(SrcVT, DstVT);
      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                            Loc, Chain);
    }
  }

  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}

SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  // For strict nodes operand 0 is the chain; the value comes after it.
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVT.getSizeInBits();
  (void)DstSz;
  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // Instruction from 32 -> 16 if hasFP16 is valid
  if (SrcSz == 32 && Subtarget->hasFP16())
    return Op;

  // Lib call from 32 -> 16 / 64 -> [32, 16]
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Result;
  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                        Loc, Chain);
  return IsStrict ?
DAG.getMergeValues({Result, Chain}, Loc) : Result; 17481 } 17482 17483 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 17484 SelectionDAG &DAG) const { 17485 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 17486 MVT HalfT = MVT::i32; 17487 SDLoc dl(N); 17488 SDValue Hi, Lo, Tmp; 17489 17490 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 17491 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 17492 return ; 17493 17494 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 17495 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 17496 17497 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 17498 DAG.getConstant(0, dl, HalfT)); 17499 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 17500 DAG.getConstant(1, dl, HalfT)); 17501 17502 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 17503 DAG.getConstant(OpTypeBits - 1, dl, 17504 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 17505 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 17506 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 17507 SDValue(Lo.getNode(), 1)); 17508 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 17509 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 17510 17511 Results.push_back(Lo); 17512 Results.push_back(Hi); 17513 } 17514 17515 bool 17516 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 17517 // The ARM target isn't yet aware of offsets. 17518 return false; 17519 } 17520 17521 bool ARM::isBitFieldInvertedMask(unsigned v) { 17522 if (v == 0xffffffff) 17523 return false; 17524 17525 // there can be 1's on either or both "outsides", all the "inside" 17526 // bits must be 0's 17527 return isShiftedMask_32(~v); 17528 } 17529 17530 /// isFPImmLegal - Returns true if the target can instruction select the 17531 /// specified FP immediate natively. If false, the legalizer will 17532 /// materialize the FP immediate as a load from a constant pool. 
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  // VMOV-immediate encodings require at least VFPv3.
  if (!Subtarget->hasVFP3Base())
    return false;
  // getFP*Imm return -1 when Imm has no VMOV immediate encoding.
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // memVT is expressed in 64-bit (D-register sized) units.
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // These intrinsics carry the alignment as their last argument.
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // For vld1xN the pointer is the last argument (there is no align arg).
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    // Sum the sizes of the vector operands; the trailing non-vector args
    // (lane index / alignment) terminate the scan.
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vld2q:
  case Intrinsic::arm_mve_vld4q: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
    // Each loaded vector is 128 bits = 2 x i64, hence Factor * 2 elements.
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile loads with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vst2q:
  case Intrinsic::arm_mve_vst4q: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    Type *VecTy = I.getArgOperand(1)->getType();
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile stores with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // The accessed type is the pointee of the address operand.
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // For strex the address is operand 1 (operand 0 is the stored value).
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  // Only integers that fit in a single 32-bit GPR qualify.
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

/// An EXTRACT_SUBVECTOR is cheap only when it takes the bottom subvector
/// (Index 0) or the subvector that starts at element ResVT.NumElements —
/// i.e. the top half of a vector twice the result's width.
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

/// Emit a data memory barrier for the given domain, falling back to the
/// CP15 MCR-encoded barrier on subtargets without a DMB instruction.
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      // Arguments encode "mcr p15, 0, <Rt>, c7, c10, 5" — presumably the
      // legacy CP15 data memory barrier operation; confirm against the ARM
      // ARM before touching these constants.
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// Emit the barrier (if any) required *before* an atomic instruction with the
// given ordering: nothing for monotonic/acquire, a DMB for release-or-stronger
// (seq_cst only needs one when the instruction has a store side).
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

// Emit the barrier (if any) required *after* an atomic instruction: a DMB ISH
// for any acquire-or-stronger ordering, nothing otherwise.
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  // 64-bit atomic loads become a bare ldrexd (LLOnly) on non-M profiles.
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // FP RMW has no LL/SC lowering here; route it through cmpxchg expansion.
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  // ARM mode always has ldrex/strex; Thumb needs v8-M Baseline or later.
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

// Declare the MSVC-style stack protector cookie and checker when targeting
// the MSVC environment; otherwise fall back to the generic declarations.
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    // Mark the cookie parameter (arg #1) as passed in a register.
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

// Return true (with Cost set) when a store of `extractelement VectorTy, Idx`
// can be folded into a single store of the lane.
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

// CTTZ/CTLZ are cheap to speculate once RBIT/CLZ-style bit ops exist (v6T2+).
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

// Emit an ldrex/ldaex (or the paired ldrexd/ldaexd for i64) load-linked.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  // Acquire-or-stronger orderings use the acquiring (ldaex*) forms.
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // Big-endian targets keep the halves in the opposite order.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ?
 Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  // The intrinsic works on i32; narrow back to the requested type.
  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

// Emit a clrex to release the exclusive monitor on the cmpxchg failure path
// (only needed/available from v7 onwards).
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

// Emit a strex/stlex (or strexd/stlexd for i64) store-conditional; returns
// the intrinsic's success/failure result.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  // Release-or-stronger orderings use the releasing (stlex*) forms.
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    // Big-endian targets pass the halves in the opposite order.
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  // Widen the value to the intrinsic's i32 parameter type.
  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}


bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  // One access per 128 bits, rounded up.
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

// Return true when a (Factor, VecTy) interleaved access can be lowered to
// NEON vldN/vstN or MVE vld2q/vld4q intrinsics.
bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  // MVE only has 2- and 4-way deinterleaving ops.
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
/// Into:
/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  // Build one NEON vldN (with explicit alignment) or MVE vld2q/vld4q call at
  // the given address.
  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///          <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  // Build one NEON vstN (with explicit alignment) or MVE vst2q/vst4q sequence
  // storing the given lane sub-vectors at the given address.
  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      for (auto S : Shuffles)
        Ops.push_back(S);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      for (auto S : Shuffles)
        Ops.push_back(S);
      // The MVE intrinsic stores one register per call; the trailing operand
      // selects which of the Factor registers to write.
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        // First mask element is undef: scan the rest of this lane's chunk for
        // a defined element and derive the start index from it.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

// Base element categories for AAPCS homogeneous-aggregate classification.
enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

// Recursively classify Ty: returns true iff it is an aggregate whose leaves
// are all of one base type (float, double, 64-bit or 128-bit vector),
// accumulating the leaf count into Members.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
18432 } 18433 } 18434 18435 return (Members > 0 && Members <= 4); 18436 } 18437 18438 /// Return the correct alignment for the current calling convention. 18439 Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, 18440 DataLayout DL) const { 18441 const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); 18442 if (!ArgTy->isVectorTy()) 18443 return ABITypeAlign; 18444 18445 // Avoid over-aligning vector parameters. It would require realigning the 18446 // stack and waste space for no real benefit. 18447 return std::min(ABITypeAlign, DL.getStackAlignment()); 18448 } 18449 18450 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 18451 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 18452 /// passing according to AAPCS rules. 18453 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 18454 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 18455 if (getEffectiveCallingConv(CallConv, isVarArg) != 18456 CallingConv::ARM_AAPCS_VFP) 18457 return false; 18458 18459 HABaseType Base = HA_UNKNOWN; 18460 uint64_t Members = 0; 18461 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 18462 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 18463 18464 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 18465 return IsHA || IsIntArray; 18466 } 18467 18468 Register ARMTargetLowering::getExceptionPointerRegister( 18469 const Constant *PersonalityFn) const { 18470 // Platforms which do not use SjLj EH may return values in these registers 18471 // via the personality function. 18472 return Subtarget->useSjLjEH() ? Register() : ARM::R0; 18473 } 18474 18475 Register ARMTargetLowering::getExceptionSelectorRegister( 18476 const Constant *PersonalityFn) const { 18477 // Platforms which do not use SjLj EH may return values in these registers 18478 // via the personality function. 18479 return Subtarget->useSjLjEH() ? 
Register() : ARM::R1; 18480 } 18481 18482 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 18483 // Update IsSplitCSR in ARMFunctionInfo. 18484 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 18485 AFI->setIsSplitCSR(true); 18486 } 18487 18488 void ARMTargetLowering::insertCopiesSplitCSR( 18489 MachineBasicBlock *Entry, 18490 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 18491 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 18492 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 18493 if (!IStart) 18494 return; 18495 18496 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 18497 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 18498 MachineBasicBlock::iterator MBBI = Entry->begin(); 18499 for (const MCPhysReg *I = IStart; *I; ++I) { 18500 const TargetRegisterClass *RC = nullptr; 18501 if (ARM::GPRRegClass.contains(*I)) 18502 RC = &ARM::GPRRegClass; 18503 else if (ARM::DPRRegClass.contains(*I)) 18504 RC = &ARM::DPRRegClass; 18505 else 18506 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 18507 18508 Register NewVR = MRI->createVirtualRegister(RC); 18509 // Create copy from CSR to a virtual register. 18510 // FIXME: this currently does not emit CFI pseudo-instructions, it works 18511 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 18512 // nounwind. If we want to generalize this later, we may need to emit 18513 // CFI pseudo-instructions. 18514 assert(Entry->getParent()->getFunction().hasFnAttribute( 18515 Attribute::NoUnwind) && 18516 "Function should be nounwind in insertCopiesSplitCSR!"); 18517 Entry->addLiveIn(*I); 18518 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 18519 .addReg(*I); 18520 18521 // Insert the copy-back instructions right before the terminator. 
18522 for (auto *Exit : Exits) 18523 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 18524 TII->get(TargetOpcode::COPY), *I) 18525 .addReg(NewVR); 18526 } 18527 } 18528 18529 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { 18530 MF.getFrameInfo().computeMaxCallFrameSize(MF); 18531 TargetLoweringBase::finalizeLowering(MF); 18532 } 18533