1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that ARM uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "ARMISelLowering.h" 15 #include "ARMBaseInstrInfo.h" 16 #include "ARMBaseRegisterInfo.h" 17 #include "ARMCallingConv.h" 18 #include "ARMConstantPoolValue.h" 19 #include "ARMMachineFunctionInfo.h" 20 #include "ARMPerfectShuffle.h" 21 #include "ARMRegisterInfo.h" 22 #include "ARMSelectionDAGInfo.h" 23 #include "ARMSubtarget.h" 24 #include "MCTargetDesc/ARMAddressingModes.h" 25 #include "MCTargetDesc/ARMBaseInfo.h" 26 #include "Utils/ARMBaseInfo.h" 27 #include "llvm/ADT/APFloat.h" 28 #include "llvm/ADT/APInt.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/BitVector.h" 31 #include "llvm/ADT/DenseMap.h" 32 #include "llvm/ADT/STLExtras.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringExtras.h" 37 #include "llvm/ADT/StringRef.h" 38 #include "llvm/ADT/StringSwitch.h" 39 #include "llvm/ADT/Triple.h" 40 #include "llvm/ADT/Twine.h" 41 #include "llvm/Analysis/VectorUtils.h" 42 #include "llvm/CodeGen/CallingConvLower.h" 43 #include "llvm/CodeGen/ISDOpcodes.h" 44 #include "llvm/CodeGen/IntrinsicLowering.h" 45 #include "llvm/CodeGen/MachineBasicBlock.h" 46 #include "llvm/CodeGen/MachineConstantPool.h" 47 #include "llvm/CodeGen/MachineFrameInfo.h" 48 #include "llvm/CodeGen/MachineFunction.h" 49 #include "llvm/CodeGen/MachineInstr.h" 50 #include "llvm/CodeGen/MachineInstrBuilder.h" 51 #include 
"llvm/CodeGen/MachineJumpTableInfo.h" 52 #include "llvm/CodeGen/MachineMemOperand.h" 53 #include "llvm/CodeGen/MachineOperand.h" 54 #include "llvm/CodeGen/MachineRegisterInfo.h" 55 #include "llvm/CodeGen/RuntimeLibcalls.h" 56 #include "llvm/CodeGen/SelectionDAG.h" 57 #include "llvm/CodeGen/SelectionDAGNodes.h" 58 #include "llvm/CodeGen/TargetInstrInfo.h" 59 #include "llvm/CodeGen/TargetLowering.h" 60 #include "llvm/CodeGen/TargetOpcodes.h" 61 #include "llvm/CodeGen/TargetRegisterInfo.h" 62 #include "llvm/CodeGen/TargetSubtargetInfo.h" 63 #include "llvm/CodeGen/ValueTypes.h" 64 #include "llvm/IR/Attributes.h" 65 #include "llvm/IR/CallingConv.h" 66 #include "llvm/IR/Constant.h" 67 #include "llvm/IR/Constants.h" 68 #include "llvm/IR/DataLayout.h" 69 #include "llvm/IR/DebugLoc.h" 70 #include "llvm/IR/DerivedTypes.h" 71 #include "llvm/IR/Function.h" 72 #include "llvm/IR/GlobalAlias.h" 73 #include "llvm/IR/GlobalValue.h" 74 #include "llvm/IR/GlobalVariable.h" 75 #include "llvm/IR/IRBuilder.h" 76 #include "llvm/IR/InlineAsm.h" 77 #include "llvm/IR/Instruction.h" 78 #include "llvm/IR/Instructions.h" 79 #include "llvm/IR/IntrinsicInst.h" 80 #include "llvm/IR/Intrinsics.h" 81 #include "llvm/IR/IntrinsicsARM.h" 82 #include "llvm/IR/Module.h" 83 #include "llvm/IR/PatternMatch.h" 84 #include "llvm/IR/Type.h" 85 #include "llvm/IR/User.h" 86 #include "llvm/IR/Value.h" 87 #include "llvm/MC/MCInstrDesc.h" 88 #include "llvm/MC/MCInstrItineraries.h" 89 #include "llvm/MC/MCRegisterInfo.h" 90 #include "llvm/MC/MCSchedule.h" 91 #include "llvm/Support/AtomicOrdering.h" 92 #include "llvm/Support/BranchProbability.h" 93 #include "llvm/Support/Casting.h" 94 #include "llvm/Support/CodeGen.h" 95 #include "llvm/Support/CommandLine.h" 96 #include "llvm/Support/Compiler.h" 97 #include "llvm/Support/Debug.h" 98 #include "llvm/Support/ErrorHandling.h" 99 #include "llvm/Support/KnownBits.h" 100 #include "llvm/Support/MachineValueType.h" 101 #include "llvm/Support/MathExtras.h" 102 #include 
"llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

// Statistics reported under -stats for this backend.
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

// Debugging aid: interworking (ARM<->Thumb calls) is on by default and should
// only be disabled when diagnosing problems.
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
  cl::desc("Enable / disable ARM interworking (for debugging only)"),
  cl::init(true));

// Promotion of unnamed_addr constants into constant pools; off by default.
static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
// Per-constant size cap (in bytes) for the promotion above.
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
// Aggregate size cap (in bytes) across all promoted constants.
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

// Upper bound on the interleave factor used when forming MVE VLDn.
static cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
  cl::desc("Maximum interleave factor for MVE VLDn to generate."),
  cl::init(2));

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

/// Register the legalization actions common to every NEON vector type.
/// \p PromotedLdStVT is the type that loads/stores of \p VT are promoted to
/// (bitcast through), and \p PromotedBitwiseVT the type used for the
/// promoted AND/OR/XOR forms of \p VT.
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  // Route loads/stores of VT through the promoted type when they differ.
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  // int<->fp conversions are only handled natively for i32 elements; all
  // other element types are broken apart by the legalizer.
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  // Vector shifts get custom lowering for integer vector types.
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations to PromotedBitwiseVT.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  // Integer abs/min/max are legal, except for the 64-bit element types.
  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  // Saturating add/sub are legal for all integer vector types.
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

/// Register \p VT as a 64-bit NEON type living in the D registers.
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

/// Register \p VT as a 128-bit NEON type living in D-register pairs.
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

/// Mark every ISD operation on \p VT as Expand, then re-legalize the handful
/// of trivial operations that are always supported.
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

/// Set the action for all three extending-load flavours (any/zero/sign)
/// from \p From to \p To at once.
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

/// Register all the 128-bit MVE vector types and their legalization actions.
/// \p HasMVEFP selects whether the FP data-processing subset (MVE.fp) is
/// available; without it the float vector types are only supported at the
/// bitcast/load/store level.
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);

    // Without MVE.fp, int<->float conversions on these types must be
    // broken down.
    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    // Without MVE.fp, only the trivial bitcast/load/store level is kept.
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only doing FP data processing on the FP
  // vector types is inhibited at integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  }
  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v4i8 to v4i16 or v4i32, and from v8i8 to
  // v8i16.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
  }
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
423 const ARMSubtarget &STI) 424 : TargetLowering(TM), Subtarget(&STI) { 425 RegInfo = Subtarget->getRegisterInfo(); 426 Itins = Subtarget->getInstrItineraryData(); 427 428 setBooleanContents(ZeroOrOneBooleanContent); 429 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 430 431 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 432 !Subtarget->isTargetWatchOS()) { 433 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 434 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 435 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 436 IsHFTarget ? CallingConv::ARM_AAPCS_VFP 437 : CallingConv::ARM_AAPCS); 438 } 439 440 if (Subtarget->isTargetMachO()) { 441 // Uses VFP for Thumb libfuncs if available. 442 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 443 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 444 static const struct { 445 const RTLIB::Libcall Op; 446 const char * const Name; 447 const ISD::CondCode Cond; 448 } LibraryCalls[] = { 449 // Single-precision floating-point arithmetic. 450 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 451 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 452 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 453 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 454 455 // Double-precision floating-point arithmetic. 456 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 457 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 458 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 459 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 460 461 // Single-precision comparisons. 
462 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 463 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 464 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 465 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 466 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 467 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 468 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 469 470 // Double-precision comparisons. 471 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 472 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 473 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 474 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 475 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 476 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 477 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 478 479 // Floating-point to integer conversions. 480 // i64 conversions are done via library routines even when generating VFP 481 // instructions, so use the same ones. 482 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 483 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 484 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 485 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 486 487 // Conversions between floating types. 488 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 489 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 490 491 // Integer to floating-point conversions. 492 // i64 conversions are done via library routines even when generating VFP 493 // instructions, so use the same ones. 494 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 495 // e.g., __floatunsidf vs. __floatunssidfvfp. 
496 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 497 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 498 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 499 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 500 }; 501 502 for (const auto &LC : LibraryCalls) { 503 setLibcallName(LC.Op, LC.Name); 504 if (LC.Cond != ISD::SETCC_INVALID) 505 setCmpLibcallCC(LC.Op, LC.Cond); 506 } 507 } 508 } 509 510 // These libcalls are not available in 32-bit. 511 setLibcallName(RTLIB::SHL_I128, nullptr); 512 setLibcallName(RTLIB::SRL_I128, nullptr); 513 setLibcallName(RTLIB::SRA_I128, nullptr); 514 515 // RTLIB 516 if (Subtarget->isAAPCS_ABI() && 517 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 518 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 519 static const struct { 520 const RTLIB::Libcall Op; 521 const char * const Name; 522 const CallingConv::ID CC; 523 const ISD::CondCode Cond; 524 } LibraryCalls[] = { 525 // Double-precision floating-point arithmetic helper functions 526 // RTABI chapter 4.1.2, Table 2 527 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 528 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 529 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 530 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 531 532 // Double-precision floating-point comparison helper functions 533 // RTABI chapter 4.1.2, Table 3 534 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 535 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 536 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 537 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 538 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 539 { RTLIB::OGT_F64, 
"__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 540 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 541 542 // Single-precision floating-point arithmetic helper functions 543 // RTABI chapter 4.1.2, Table 4 544 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 545 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 546 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 547 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 548 549 // Single-precision floating-point comparison helper functions 550 // RTABI chapter 4.1.2, Table 5 551 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 552 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 553 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 554 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 555 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 556 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 557 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 558 559 // Floating-point to integer conversions. 
560 // RTABI chapter 4.1.2, Table 6 561 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 562 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 563 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 564 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 565 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 566 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 567 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 568 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 569 570 // Conversions between floating types. 571 // RTABI chapter 4.1.2, Table 7 572 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 573 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 574 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 575 576 // Integer to floating-point conversions. 
577 // RTABI chapter 4.1.2, Table 8 578 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 579 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 580 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 581 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 582 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 583 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 584 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 585 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 586 587 // Long long helper functions 588 // RTABI chapter 4.2, Table 9 589 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 590 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 591 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 592 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 593 594 // Integer division functions 595 // RTABI chapter 4.3.1 596 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 597 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 598 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 599 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 600 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 601 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 602 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 603 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 604 }; 605 606 for (const auto &LC : LibraryCalls) { 607 
setLibcallName(LC.Op, LC.Name); 608 setLibcallCallingConv(LC.Op, LC.CC); 609 if (LC.Cond != ISD::SETCC_INVALID) 610 setCmpLibcallCC(LC.Op, LC.Cond); 611 } 612 613 // EABI dependent RTLIB 614 if (TM.Options.EABIVersion == EABI::EABI4 || 615 TM.Options.EABIVersion == EABI::EABI5) { 616 static const struct { 617 const RTLIB::Libcall Op; 618 const char *const Name; 619 const CallingConv::ID CC; 620 const ISD::CondCode Cond; 621 } MemOpsLibraryCalls[] = { 622 // Memory operations 623 // RTABI chapter 4.3.4 624 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 627 }; 628 629 for (const auto &LC : MemOpsLibraryCalls) { 630 setLibcallName(LC.Op, LC.Name); 631 setLibcallCallingConv(LC.Op, LC.CC); 632 if (LC.Cond != ISD::SETCC_INVALID) 633 setCmpLibcallCC(LC.Op, LC.Cond); 634 } 635 } 636 } 637 638 if (Subtarget->isTargetWindows()) { 639 static const struct { 640 const RTLIB::Libcall Op; 641 const char * const Name; 642 const CallingConv::ID CC; 643 } LibraryCalls[] = { 644 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 645 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 646 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 647 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 648 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 649 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 650 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 651 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 652 }; 653 654 for (const auto &LC : LibraryCalls) { 655 setLibcallName(LC.Op, LC.Name); 656 setLibcallCallingConv(LC.Op, LC.CC); 657 } 658 } 659 660 // Use divmod compiler-rt calls for iOS 5.0 and later. 
661 if (Subtarget->isTargetMachO() && 662 !(Subtarget->isTargetIOS() && 663 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 664 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 665 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 666 } 667 668 // The half <-> float conversion functions are always soft-float on 669 // non-watchos platforms, but are needed for some targets which use a 670 // hard-float calling convention by default. 671 if (!Subtarget->isTargetWatchABI()) { 672 if (Subtarget->isAAPCS_ABI()) { 673 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 674 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 675 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 676 } else { 677 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 678 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 679 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 680 } 681 } 682 683 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 684 // a __gnu_ prefix (which is the default). 
685 if (Subtarget->isTargetAEABI()) { 686 static const struct { 687 const RTLIB::Libcall Op; 688 const char * const Name; 689 const CallingConv::ID CC; 690 } LibraryCalls[] = { 691 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 692 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 693 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 694 }; 695 696 for (const auto &LC : LibraryCalls) { 697 setLibcallName(LC.Op, LC.Name); 698 setLibcallCallingConv(LC.Op, LC.CC); 699 } 700 } 701 702 if (Subtarget->isThumb1Only()) 703 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 704 else 705 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 706 707 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 708 Subtarget->hasFPRegs()) { 709 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 710 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 711 if (!Subtarget->hasVFP2Base()) 712 setAllExpand(MVT::f32); 713 if (!Subtarget->hasFP64()) 714 setAllExpand(MVT::f64); 715 } 716 717 if (Subtarget->hasFullFP16()) { 718 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 719 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 720 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 721 722 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 723 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 724 } 725 726 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 727 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 728 setTruncStoreAction(VT, InnerVT, Expand); 729 addAllExtLoads(VT, InnerVT, Expand); 730 } 731 732 setOperationAction(ISD::MULHS, VT, Expand); 733 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 734 setOperationAction(ISD::MULHU, VT, Expand); 735 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 736 737 setOperationAction(ISD::BSWAP, VT, Expand); 738 } 739 740 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 741 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 742 743 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 
744 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 745 746 if (Subtarget->hasMVEIntegerOps()) 747 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 748 749 // Combine low-overhead loop intrinsics so that we can lower i1 types. 750 if (Subtarget->hasLOB()) { 751 setTargetDAGCombine(ISD::BRCOND); 752 setTargetDAGCombine(ISD::BR_CC); 753 } 754 755 if (Subtarget->hasNEON()) { 756 addDRTypeForNEON(MVT::v2f32); 757 addDRTypeForNEON(MVT::v8i8); 758 addDRTypeForNEON(MVT::v4i16); 759 addDRTypeForNEON(MVT::v2i32); 760 addDRTypeForNEON(MVT::v1i64); 761 762 addQRTypeForNEON(MVT::v4f32); 763 addQRTypeForNEON(MVT::v2f64); 764 addQRTypeForNEON(MVT::v16i8); 765 addQRTypeForNEON(MVT::v8i16); 766 addQRTypeForNEON(MVT::v4i32); 767 addQRTypeForNEON(MVT::v2i64); 768 769 if (Subtarget->hasFullFP16()) { 770 addQRTypeForNEON(MVT::v8f16); 771 addDRTypeForNEON(MVT::v4f16); 772 } 773 } 774 775 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 776 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 777 // none of Neon, MVE or VFP supports any arithmetic operations on it. 778 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 779 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 780 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 781 // FIXME: Code duplication: FDIV and FREM are expanded always, see 782 // ARMTargetLowering::addTypeForNEON method for details. 783 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 784 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 785 // FIXME: Create unittest. 786 // In another words, find a way when "copysign" appears in DAG with vector 787 // operands. 788 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 789 // FIXME: Code duplication: SETCC has custom operation action, see 790 // ARMTargetLowering::addTypeForNEON method for details. 791 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 792 // FIXME: Create unittest for FNEG and for FABS. 
793 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 794 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 795 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 796 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 797 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 798 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 799 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 800 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 801 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 802 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 803 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 804 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 805 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 806 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 807 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 808 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 809 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 810 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 811 } 812 813 if (Subtarget->hasNEON()) { 814 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 815 // supported for v4f32. 816 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 817 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 818 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 819 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 820 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 821 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 822 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 823 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 824 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 825 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 826 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 827 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 828 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 829 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 830 831 // Mark v2f32 intrinsics. 
832 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 833 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 834 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 835 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 836 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 837 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 838 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 839 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 840 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 841 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 842 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 843 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 844 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 845 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 846 847 // Neon does not support some operations on v1i64 and v2i64 types. 848 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 849 // Custom handling for some quad-vector types to detect VMULL. 850 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 851 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 852 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 853 // Custom handling for some vector types to avoid expensive expansions 854 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 855 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 856 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 857 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 858 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 859 // a destination type that is wider than the source, and nor does 860 // it have a FP_TO_[SU]INT instruction with a narrower destination than 861 // source. 
862 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 863 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 864 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 865 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 866 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 867 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 868 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 869 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 870 871 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 872 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 873 874 // NEON does not have single instruction CTPOP for vectors with element 875 // types wider than 8-bits. However, custom lowering can leverage the 876 // v8i8/v16i8 vcnt instruction. 877 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 878 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 879 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 880 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 881 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 882 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 883 884 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 885 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 886 887 // NEON does not have single instruction CTTZ for vectors. 
888 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 889 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 890 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 891 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 892 893 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 894 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 895 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 896 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 897 898 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 899 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 900 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 901 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 902 903 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 904 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 905 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 906 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 907 908 // NEON only has FMA instructions as of VFP4. 909 if (!Subtarget->hasVFP4Base()) { 910 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 911 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 912 } 913 914 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 915 setTargetDAGCombine(ISD::SHL); 916 setTargetDAGCombine(ISD::SRL); 917 setTargetDAGCombine(ISD::SRA); 918 setTargetDAGCombine(ISD::FP_TO_SINT); 919 setTargetDAGCombine(ISD::FP_TO_UINT); 920 setTargetDAGCombine(ISD::FDIV); 921 setTargetDAGCombine(ISD::LOAD); 922 923 // It is legal to extload from v4i8 to v4i16 or v4i32. 
924 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 925 MVT::v2i32}) { 926 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 927 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 928 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 929 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 930 } 931 } 932 } 933 934 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 935 setTargetDAGCombine(ISD::BUILD_VECTOR); 936 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 937 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 938 setTargetDAGCombine(ISD::STORE); 939 setTargetDAGCombine(ISD::SIGN_EXTEND); 940 setTargetDAGCombine(ISD::ZERO_EXTEND); 941 setTargetDAGCombine(ISD::ANY_EXTEND); 942 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 943 setTargetDAGCombine(ISD::INTRINSIC_VOID); 944 setTargetDAGCombine(ISD::VECREDUCE_ADD); 945 setTargetDAGCombine(ISD::ADD); 946 } 947 948 if (!Subtarget->hasFP64()) { 949 // When targeting a floating-point unit with only single-precision 950 // operations, f64 is legal for the few double-precision instructions which 951 // are present However, no double-precision operations other than moves, 952 // loads and stores are provided by the hardware. 
953 setOperationAction(ISD::FADD, MVT::f64, Expand); 954 setOperationAction(ISD::FSUB, MVT::f64, Expand); 955 setOperationAction(ISD::FMUL, MVT::f64, Expand); 956 setOperationAction(ISD::FMA, MVT::f64, Expand); 957 setOperationAction(ISD::FDIV, MVT::f64, Expand); 958 setOperationAction(ISD::FREM, MVT::f64, Expand); 959 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 960 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 961 setOperationAction(ISD::FNEG, MVT::f64, Expand); 962 setOperationAction(ISD::FABS, MVT::f64, Expand); 963 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 964 setOperationAction(ISD::FSIN, MVT::f64, Expand); 965 setOperationAction(ISD::FCOS, MVT::f64, Expand); 966 setOperationAction(ISD::FPOW, MVT::f64, Expand); 967 setOperationAction(ISD::FLOG, MVT::f64, Expand); 968 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 969 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 970 setOperationAction(ISD::FEXP, MVT::f64, Expand); 971 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 972 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 973 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 974 setOperationAction(ISD::FRINT, MVT::f64, Expand); 975 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 976 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 977 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 978 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 979 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 980 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 981 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 982 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 983 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 984 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 985 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 986 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); 987 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 988 
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 989 } 990 991 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 992 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 993 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 994 if (Subtarget->hasFullFP16()) { 995 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 996 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 997 } 998 } 999 1000 if (!Subtarget->hasFP16()) { 1001 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1002 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1003 } 1004 1005 computeRegisterProperties(Subtarget->getRegisterInfo()); 1006 1007 // ARM does not have floating-point extending loads. 1008 for (MVT VT : MVT::fp_valuetypes()) { 1009 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1010 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1011 } 1012 1013 // ... or truncating stores 1014 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1015 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1016 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1017 1018 // ARM does not have i1 sign extending load. 1019 for (MVT VT : MVT::integer_valuetypes()) 1020 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1021 1022 // ARM supports all 4 flavors of integer indexed load / store. 1023 if (!Subtarget->isThumb1Only()) { 1024 for (unsigned im = (unsigned)ISD::PRE_INC; 1025 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1026 setIndexedLoadAction(im, MVT::i1, Legal); 1027 setIndexedLoadAction(im, MVT::i8, Legal); 1028 setIndexedLoadAction(im, MVT::i16, Legal); 1029 setIndexedLoadAction(im, MVT::i32, Legal); 1030 setIndexedStoreAction(im, MVT::i1, Legal); 1031 setIndexedStoreAction(im, MVT::i8, Legal); 1032 setIndexedStoreAction(im, MVT::i16, Legal); 1033 setIndexedStoreAction(im, MVT::i32, Legal); 1034 } 1035 } else { 1036 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
1037 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1038 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1039 } 1040 1041 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1042 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1043 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1044 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1045 1046 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1047 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1048 if (Subtarget->hasDSP()) { 1049 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1050 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1051 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1052 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1053 } 1054 if (Subtarget->hasBaseDSP()) { 1055 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1056 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1057 } 1058 1059 // i64 operation support. 1060 setOperationAction(ISD::MUL, MVT::i64, Expand); 1061 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1062 if (Subtarget->isThumb1Only()) { 1063 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1064 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1065 } 1066 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1067 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1068 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1069 1070 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1071 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1072 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1073 setOperationAction(ISD::SRL, MVT::i64, Custom); 1074 setOperationAction(ISD::SRA, MVT::i64, Custom); 1075 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1076 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1077 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1078 setOperationAction(ISD::STORE, MVT::i64, Custom); 1079 1080 // MVE lowers 64 bit shifts to lsll and lsrl 1081 // assuming that ISD::SRL and SRA of 
i64 are already marked custom 1082 if (Subtarget->hasMVEIntegerOps()) 1083 setOperationAction(ISD::SHL, MVT::i64, Custom); 1084 1085 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1086 if (Subtarget->isThumb1Only()) { 1087 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1088 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1089 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1090 } 1091 1092 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1093 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1094 1095 // ARM does not have ROTL. 1096 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1097 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1098 setOperationAction(ISD::ROTL, VT, Expand); 1099 setOperationAction(ISD::ROTR, VT, Expand); 1100 } 1101 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1102 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1103 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1104 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1105 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1106 } 1107 1108 // @llvm.readcyclecounter requires the Performance Monitors extension. 1109 // Default to the 0 expansion on unsupported platforms. 1110 // FIXME: Technically there are older ARM CPUs that have 1111 // implementation-specific ways of obtaining this information. 1112 if (Subtarget->hasPerfMon()) 1113 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1114 1115 // Only ARMv6 has BSWAP. 1116 if (!Subtarget->hasV6Ops()) 1117 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1118 1119 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1120 : Subtarget->hasDivideInARMMode(); 1121 if (!hasDivide) { 1122 // These are expanded into libcalls if the cpu doesn't have HW divider. 
1123 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1124 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1125 } 1126 1127 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1128 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1129 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1130 1131 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1132 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1133 } 1134 1135 setOperationAction(ISD::SREM, MVT::i32, Expand); 1136 setOperationAction(ISD::UREM, MVT::i32, Expand); 1137 1138 // Register based DivRem for AEABI (RTABI 4.2) 1139 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1140 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1141 Subtarget->isTargetWindows()) { 1142 setOperationAction(ISD::SREM, MVT::i64, Custom); 1143 setOperationAction(ISD::UREM, MVT::i64, Custom); 1144 HasStandaloneRem = false; 1145 1146 if (Subtarget->isTargetWindows()) { 1147 const struct { 1148 const RTLIB::Libcall Op; 1149 const char * const Name; 1150 const CallingConv::ID CC; 1151 } LibraryCalls[] = { 1152 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1153 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1154 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1155 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1156 1157 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1158 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1159 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1160 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1161 }; 1162 1163 for (const auto &LC : LibraryCalls) { 1164 setLibcallName(LC.Op, LC.Name); 1165 setLibcallCallingConv(LC.Op, LC.CC); 1166 } 1167 } else { 1168 const struct { 1169 const RTLIB::Libcall Op; 1170 const char * const Name; 1171 const CallingConv::ID CC; 1172 } LibraryCalls[] = { 1173 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 
1174 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1175 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1176 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1177 1178 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1179 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1180 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1181 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1182 }; 1183 1184 for (const auto &LC : LibraryCalls) { 1185 setLibcallName(LC.Op, LC.Name); 1186 setLibcallCallingConv(LC.Op, LC.CC); 1187 } 1188 } 1189 1190 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1191 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1192 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1193 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1194 } else { 1195 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1196 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1197 } 1198 1199 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1200 // MSVCRT doesn't have powi; fall back to pow 1201 setLibcallName(RTLIB::POWI_F32, nullptr); 1202 setLibcallName(RTLIB::POWI_F64, nullptr); 1203 } 1204 1205 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1206 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1207 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1208 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1209 1210 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1211 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1212 1213 // Use the default implementation. 
1214 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1215 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1216 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1217 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1218 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1219 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1220 1221 if (Subtarget->isTargetWindows()) 1222 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1223 else 1224 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1225 1226 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1227 // the default expansion. 1228 InsertFencesForAtomic = false; 1229 if (Subtarget->hasAnyDataBarrier() && 1230 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1231 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1232 // to ldrex/strex loops already. 1233 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1234 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1235 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1236 1237 // On v8, we have particularly efficient implementations of atomic fences 1238 // if they can be combined with nearby atomic loads and stores. 1239 if (!Subtarget->hasAcquireRelease() || 1240 getTargetMachine().getOptLevel() == 0) { 1241 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1242 InsertFencesForAtomic = true; 1243 } 1244 } else { 1245 // If there's anything we can use as a barrier, go through custom lowering 1246 // for ATOMIC_FENCE. 1247 // If target has DMB in thumb, Fences can be inserted. 1248 if (Subtarget->hasDataBarrier()) 1249 InsertFencesForAtomic = true; 1250 1251 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1252 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1253 1254 // Set them all for expansion, which will force libcalls. 
1255 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1256 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1257 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1258 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1259 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1260 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1261 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1262 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1263 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1264 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1265 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1266 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1267 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1268 // Unordered/Monotonic case. 1269 if (!InsertFencesForAtomic) { 1270 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1271 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1272 } 1273 } 1274 1275 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1276 1277 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1278 if (!Subtarget->hasV6Ops()) { 1279 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1280 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1281 } 1282 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1283 1284 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1285 !Subtarget->isThumb1Only()) { 1286 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1287 // iff target supports vfp2. 1288 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1289 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1290 } 1291 1292 // We want to custom lower some of our intrinsics. 
1293 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1294 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1295 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1296 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1297 if (Subtarget->useSjLjEH()) 1298 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1299 1300 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1301 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1302 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1303 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1304 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1305 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1306 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1307 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1308 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1309 if (Subtarget->hasFullFP16()) { 1310 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1311 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1312 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1313 } 1314 1315 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1316 1317 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1318 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1319 if (Subtarget->hasFullFP16()) 1320 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1321 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1322 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1323 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1324 1325 // We don't support sin/cos/fmod/copysign/pow 1326 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1327 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1328 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1329 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1330 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1331 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1332 setOperationAction(ISD::FREM, MVT::f64, Expand); 1333 
setOperationAction(ISD::FREM, MVT::f32, Expand); 1334 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1335 !Subtarget->isThumb1Only()) { 1336 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1337 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1338 } 1339 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1340 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1341 1342 if (!Subtarget->hasVFP4Base()) { 1343 setOperationAction(ISD::FMA, MVT::f64, Expand); 1344 setOperationAction(ISD::FMA, MVT::f32, Expand); 1345 } 1346 1347 // Various VFP goodness 1348 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1349 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1350 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1351 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1352 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1353 } 1354 1355 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1356 if (!Subtarget->hasFP16()) { 1357 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1358 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1359 } 1360 1361 // Strict floating-point comparisons need custom lowering. 1362 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1363 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1364 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1365 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1366 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1367 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1368 } 1369 1370 // Use __sincos_stret if available. 1371 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1372 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1373 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1374 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1375 } 1376 1377 // FP-ARMv8 implements a lot of rounding-like FP operations. 
1378 if (Subtarget->hasFPARMv8Base()) { 1379 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1380 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1381 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1382 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1383 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1384 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1385 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1386 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1387 if (Subtarget->hasNEON()) { 1388 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1389 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1390 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1391 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1392 } 1393 1394 if (Subtarget->hasFP64()) { 1395 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1396 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1397 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1398 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1399 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1400 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1401 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1402 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1403 } 1404 } 1405 1406 // FP16 often need to be promoted to call lib functions 1407 if (Subtarget->hasFullFP16()) { 1408 setOperationAction(ISD::FREM, MVT::f16, Promote); 1409 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1410 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1411 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1412 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1413 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1414 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1415 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1416 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1417 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1418 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1419 
setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1420 1421 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1422 } 1423 1424 if (Subtarget->hasNEON()) { 1425 // vmin and vmax aren't available in a scalar form, so we can use 1426 // a NEON instruction with an undef lane instead. This has a performance 1427 // penalty on some cores, so we don't do this unless we have been 1428 // asked to by the core tuning model. 1429 if (Subtarget->useNEONForSinglePrecisionFP()) { 1430 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1431 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1432 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1433 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1434 } 1435 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1436 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1437 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1438 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1439 1440 if (Subtarget->hasFullFP16()) { 1441 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1442 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1443 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1444 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1445 1446 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1447 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1448 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1449 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1450 } 1451 } 1452 1453 // We have target-specific dag combine patterns for the following nodes: 1454 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1455 setTargetDAGCombine(ISD::ADD); 1456 setTargetDAGCombine(ISD::SUB); 1457 setTargetDAGCombine(ISD::MUL); 1458 setTargetDAGCombine(ISD::AND); 1459 setTargetDAGCombine(ISD::OR); 1460 setTargetDAGCombine(ISD::XOR); 1461 1462 if (Subtarget->hasV6Ops()) 1463 setTargetDAGCombine(ISD::SRL); 1464 if (Subtarget->isThumb1Only()) 1465 setTargetDAGCombine(ISD::SHL); 1466 1467 
setStackPointerRegisterToSaveRestore(ARM::SP); 1468 1469 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1470 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1471 setSchedulingPreference(Sched::RegPressure); 1472 else 1473 setSchedulingPreference(Sched::Hybrid); 1474 1475 //// temporary - rewrite interface to use type 1476 MaxStoresPerMemset = 8; 1477 MaxStoresPerMemsetOptSize = 4; 1478 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1479 MaxStoresPerMemcpyOptSize = 2; 1480 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1481 MaxStoresPerMemmoveOptSize = 2; 1482 1483 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1484 // are at least 4 bytes aligned. 1485 setMinStackArgumentAlignment(Align(4)); 1486 1487 // Prefer likely predicted branches to selects on out-of-order cores. 1488 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1489 1490 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1491 1492 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1493 1494 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1495 setTargetDAGCombine(ISD::ABS); 1496 } 1497 1498 bool ARMTargetLowering::useSoftFloat() const { 1499 return Subtarget->useSoftFloat(); 1500 } 1501 1502 // FIXME: It might make sense to define the representative register class as the 1503 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1504 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1505 // SPR's representative would be DPR_VFP2. This should work well if register 1506 // pressure tracking were modified such that a register use would increment the 1507 // pressure of the register class's representative and all of it's super 1508 // classes' representatives transitively. 
// We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.

/// Return the representative register class (and a relative register-pressure
/// cost) for the given value type. All FP and vector types are mapped onto
/// DPR; wider (pseudo) vector types pay a proportionally higher cost since
/// they occupy several consecutive D registers.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    // Scalar integer and other types: defer to the generic implementation.
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    // 128-bit vectors occupy a Q register, i.e. two D registers.
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    // Pseudo type spanning a QQ pair (four D registers).
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    // Pseudo type spanning a QQQQ quad (eight D registers).
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

/// Return a human-readable name for the given target-specific (ARMISD) node
/// opcode, or nullptr if the opcode is not recognized. Used for DAG dumps.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER: break;
  case ARMISD::Wrapper: return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL: return "ARMISD::CALL";
  case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND: return "ARMISD::BRCOND";
  case ARMISD::BR_JT: return "ARMISD::BR_JT";
  case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
  // Comparison nodes.
  case ARMISD::CMP: return "ARMISD::CMP";
  case ARMISD::CMN: return "ARMISD::CMN";
  case ARMISD::CMPZ: return "ARMISD::CMPZ";
  case ARMISD::CMPFP: return "ARMISD::CMPFP";
  case ARMISD::CMPFPE: return "ARMISD::CMPFPE";
  case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
  case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0";
  case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT: return "ARMISD::FMSTAT";

  case ARMISD::CMOV: return "ARMISD::CMOV";
  case ARMISD::SUBS: return "ARMISD::SUBS";

  case ARMISD::SSAT: return "ARMISD::SSAT";
  case ARMISD::USAT: return "ARMISD::USAT";

  case ARMISD::ASRL: return "ARMISD::ASRL";
  case ARMISD::LSRL: return "ARMISD::LSRL";
  case ARMISD::LSLL: return "ARMISD::LSLL";

  case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
  case ARMISD::RRX: return "ARMISD::RRX";

  case ARMISD::ADDC: return "ARMISD::ADDC";
  case ARMISD::ADDE: return "ARMISD::ADDE";
  case ARMISD::SUBC: return "ARMISD::SUBC";
  case ARMISD::SUBE: return "ARMISD::SUBE";
  case ARMISD::LSLS: return "ARMISD::LSLS";

  // Register-to-register moves between core and FP/vector register files.
  case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR: return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD: return "ARMISD::PRELOAD";

  case ARMISD::LDRD: return "ARMISD::LDRD";
  case ARMISD::STRD: return "ARMISD::STRD";

  case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";

  case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
  case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
  case ARMISD::VCMP: return "ARMISD::VCMP";
  case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
  case ARMISD::VTST: return "ARMISD::VTST";

  // Vector shift nodes (register and immediate forms).
  case ARMISD::VSHLs: return "ARMISD::VSHLs";
  case ARMISD::VSHLu: return "ARMISD::VSHLu";
  case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM";
  case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM";
  case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM";
  case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM";
  case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM";
  case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM";
  case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM";
  case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM";
  case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM";
  case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM";
  case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM";
  case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM";
  case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM";
  case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM";
  case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM";
  case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM";
  case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM";
  case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP: return "ARMISD::VDUP";
  case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
  case ARMISD::VEXT: return "ARMISD::VEXT";
  case ARMISD::VREV64: return "ARMISD::VREV64";
  case ARMISD::VREV32: return "ARMISD::VREV32";
  case ARMISD::VREV16: return "ARMISD::VREV16";
  case ARMISD::VZIP: return "ARMISD::VZIP";
  case ARMISD::VUZP: return "ARMISD::VUZP";
  case ARMISD::VTRN: return "ARMISD::VTRN";
  case ARMISD::VTBL1: return "ARMISD::VTBL1";
  case ARMISD::VTBL2: return "ARMISD::VTBL2";
  case ARMISD::VMOVN: return "ARMISD::VMOVN";
  case ARMISD::VMULLs: return "ARMISD::VMULLs";
  case ARMISD::VMULLu: return "ARMISD::VMULLu";
  // Vector reductions and multiply-accumulate reductions.
  case ARMISD::VADDVs: return "ARMISD::VADDVs";
  case ARMISD::VADDVu: return "ARMISD::VADDVu";
  case ARMISD::VADDLVs: return "ARMISD::VADDLVs";
  case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
  case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
  case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu";
  case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
  case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
  case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
  case ARMISD::VMLALVu: return "ARMISD::VMLALVu";
  case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs";
  case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu";
  case ARMISD::UMAAL: return "ARMISD::UMAAL";
  case ARMISD::UMLAL: return "ARMISD::UMLAL";
  case ARMISD::SMLAL: return "ARMISD::SMLAL";
  case ARMISD::SMLALBB: return "ARMISD::SMLALBB";
  case ARMISD::SMLALBT: return "ARMISD::SMLALBT";
  case ARMISD::SMLALTB: return "ARMISD::SMLALTB";
  case ARMISD::SMLALTT: return "ARMISD::SMLALTT";
  case ARMISD::SMULWB: return "ARMISD::SMULWB";
  case ARMISD::SMULWT: return "ARMISD::SMULWT";
  case ARMISD::SMLALD: return "ARMISD::SMLALD";
  case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
  case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
  case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
  case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
  case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
  case ARMISD::QADD16b: return "ARMISD::QADD16b";
  case ARMISD::QSUB16b: return "ARMISD::QSUB16b";
  case ARMISD::QADD8b: return "ARMISD::QADD8b";
  case ARMISD::QSUB8b: return "ARMISD::QSUB8b";
  case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI: return "ARMISD::BFI";
  case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
  case ARMISD::VBSL: return "ARMISD::VBSL";
  case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
  // Structured vector loads/stores, with and without address update.
  case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
  // Low-overhead-loop and conditional-select nodes.
  case ARMISD::WLS: return "ARMISD::WLS";
  case ARMISD::LE: return "ARMISD::LE";
  case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC";
  case ARMISD::CSINV: return "ARMISD::CSINV";
  case ARMISD::CSNEG: return "ARMISD::CSNEG";
  case ARMISD::CSINC: return "ARMISD::CSINC";
  }
  return nullptr;
}

/// Return the type to use for the result of a setcc: a pointer-sized integer
/// for scalars, an i1 predicate vector for MVE-sized integer vectors, and an
/// integer vector of matching element width otherwise.
EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);

  // MVE has a predicate register.
  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
    return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
1749 const TargetRegisterClass * 1750 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1751 (void)isDivergent; 1752 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1753 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1754 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1755 // MVE Q registers. 1756 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1757 if (VT == MVT::v4i64) 1758 return &ARM::QQPRRegClass; 1759 if (VT == MVT::v8i64) 1760 return &ARM::QQQQPRRegClass; 1761 } 1762 return TargetLowering::getRegClassFor(VT); 1763 } 1764 1765 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1766 // source/dest is aligned and the copy size is large enough. We therefore want 1767 // to align such objects passed to memory intrinsics. 1768 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1769 unsigned &PrefAlign) const { 1770 if (!isa<MemIntrinsic>(CI)) 1771 return false; 1772 MinSize = 8; 1773 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1774 // cycle faster than 4-byte aligned LDM. 1775 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1776 return true; 1777 } 1778 1779 // Create a fast isel object. 
1780 FastISel * 1781 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1782 const TargetLibraryInfo *libInfo) const { 1783 return ARM::createFastISel(funcInfo, libInfo); 1784 } 1785 1786 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1787 unsigned NumVals = N->getNumValues(); 1788 if (!NumVals) 1789 return Sched::RegPressure; 1790 1791 for (unsigned i = 0; i != NumVals; ++i) { 1792 EVT VT = N->getValueType(i); 1793 if (VT == MVT::Glue || VT == MVT::Other) 1794 continue; 1795 if (VT.isFloatingPoint() || VT.isVector()) 1796 return Sched::ILP; 1797 } 1798 1799 if (!N->isMachineOpcode()) 1800 return Sched::RegPressure; 1801 1802 // Load are scheduled for latency even if there instruction itinerary 1803 // is not available. 1804 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1805 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1806 1807 if (MCID.getNumDefs() == 0) 1808 return Sched::RegPressure; 1809 if (!Itins->isEmpty() && 1810 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1811 return Sched::ILP; 1812 1813 return Sched::RegPressure; 1814 } 1815 1816 //===----------------------------------------------------------------------===// 1817 // Lowering Code 1818 //===----------------------------------------------------------------------===// 1819 1820 static bool isSRL16(const SDValue &Op) { 1821 if (Op.getOpcode() != ISD::SRL) 1822 return false; 1823 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1824 return Const->getZExtValue() == 16; 1825 return false; 1826 } 1827 1828 static bool isSRA16(const SDValue &Op) { 1829 if (Op.getOpcode() != ISD::SRA) 1830 return false; 1831 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1832 return Const->getZExtValue() == 16; 1833 return false; 1834 } 1835 1836 static bool isSHL16(const SDValue &Op) { 1837 if (Op.getOpcode() != ISD::SHL) 1838 return false; 1839 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1840 return 
Const->getZExtValue() == 16; 1841 return false; 1842 } 1843 1844 // Check for a signed 16-bit value. We special case SRA because it makes it 1845 // more simple when also looking for SRAs that aren't sign extending a 1846 // smaller value. Without the check, we'd need to take extra care with 1847 // checking order for some operations. 1848 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1849 if (isSRA16(Op)) 1850 return isSHL16(Op.getOperand(0)); 1851 return DAG.ComputeNumSignBits(Op) == 17; 1852 } 1853 1854 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1855 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1856 switch (CC) { 1857 default: llvm_unreachable("Unknown condition code!"); 1858 case ISD::SETNE: return ARMCC::NE; 1859 case ISD::SETEQ: return ARMCC::EQ; 1860 case ISD::SETGT: return ARMCC::GT; 1861 case ISD::SETGE: return ARMCC::GE; 1862 case ISD::SETLT: return ARMCC::LT; 1863 case ISD::SETLE: return ARMCC::LE; 1864 case ISD::SETUGT: return ARMCC::HI; 1865 case ISD::SETUGE: return ARMCC::HS; 1866 case ISD::SETULT: return ARMCC::LO; 1867 case ISD::SETULE: return ARMCC::LS; 1868 } 1869 } 1870 1871 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
/// CondCode2 carries a second predicate for FP conditions that cannot be
/// expressed with a single ARM condition code (SETONE, SETUEQ); callers are
/// expected to treat the result as "CondCode || CondCode2". It remains
/// ARMCC::AL when a single predicate suffices.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  // Assume a single predicate until proven otherwise.
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  // Ordered-and-not-equal needs two checks: MI || GT.
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  // Unordered-or-equal needs two checks: EQ || VS.
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
1907 CallingConv::ID 1908 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1909 bool isVarArg) const { 1910 switch (CC) { 1911 default: 1912 report_fatal_error("Unsupported calling convention"); 1913 case CallingConv::ARM_AAPCS: 1914 case CallingConv::ARM_APCS: 1915 case CallingConv::GHC: 1916 case CallingConv::CFGuard_Check: 1917 return CC; 1918 case CallingConv::PreserveMost: 1919 return CallingConv::PreserveMost; 1920 case CallingConv::ARM_AAPCS_VFP: 1921 case CallingConv::Swift: 1922 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1923 case CallingConv::C: 1924 if (!Subtarget->isAAPCS_ABI()) 1925 return CallingConv::ARM_APCS; 1926 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 1927 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1928 !isVarArg) 1929 return CallingConv::ARM_AAPCS_VFP; 1930 else 1931 return CallingConv::ARM_AAPCS; 1932 case CallingConv::Fast: 1933 case CallingConv::CXX_FAST_TLS: 1934 if (!Subtarget->isAAPCS_ABI()) { 1935 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 1936 return CallingConv::Fast; 1937 return CallingConv::ARM_APCS; 1938 } else if (Subtarget->hasVFP2Base() && 1939 !Subtarget->isThumb1Only() && !isVarArg) 1940 return CallingConv::ARM_AAPCS_VFP; 1941 else 1942 return CallingConv::ARM_AAPCS; 1943 } 1944 } 1945 1946 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1947 bool isVarArg) const { 1948 return CCAssignFnForNode(CC, false, isVarArg); 1949 } 1950 1951 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1952 bool isVarArg) const { 1953 return CCAssignFnForNode(CC, true, isVarArg); 1954 } 1955 1956 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1957 /// CallingConvention. 
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  // Normalize the convention first (C/Fast/Swift/etc. map onto ARM-specific
  // ones), then pick the generated assignment function for arguments or
  // return values.
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::CFGuard_Check:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    // Deliberately a copy: VA is re-pointed at subsequent locations (via
    // RVLocs[++i] below) when one value is split across several registers.
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64.
      // The f64 arrives as two consecutive i32 locations; rebuild it with
      // VMOVDRR, swapping halves on big-endian targets.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // A v2f64 spans four i32 locations: the f64 just built becomes
        // element 0; rebuild the second f64 the same way for element 1.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Common case: the value lives in a single register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // The value was passed bit-cast (e.g. FP in an integer reg); cast back.
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// LowerMemOpCallTo - Store the argument to the stack.
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  // Address of the outgoing slot = SP + assigned stack offset.
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}

/// Splits an f64 argument into two i32 halves (via VMOVRRD) and passes them
/// according to VA and NextVA: both in registers, or the second half on the
/// stack when NextVA is a memory location.
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         ISD::ArgFlagsTy Flags) const {
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  // On big-endian targets the two halves go out in swapped order.
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    // Materialize SP lazily; the caller shares it through StackPtr.
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                           dl, DAG, NextVA,
                                           Flags));
  }
}

/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain.
Also add input and output parameter 2106 /// nodes. 2107 SDValue 2108 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2109 SmallVectorImpl<SDValue> &InVals) const { 2110 SelectionDAG &DAG = CLI.DAG; 2111 SDLoc &dl = CLI.DL; 2112 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2113 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2114 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2115 SDValue Chain = CLI.Chain; 2116 SDValue Callee = CLI.Callee; 2117 bool &isTailCall = CLI.IsTailCall; 2118 CallingConv::ID CallConv = CLI.CallConv; 2119 bool doesNotRet = CLI.DoesNotReturn; 2120 bool isVarArg = CLI.IsVarArg; 2121 2122 MachineFunction &MF = DAG.getMachineFunction(); 2123 MachineFunction::CallSiteInfo CSInfo; 2124 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2125 bool isThisReturn = false; 2126 bool PreferIndirect = false; 2127 2128 // Disable tail calls if they're not supported. 2129 if (!Subtarget->supportsTailCall()) 2130 isTailCall = false; 2131 2132 if (isa<GlobalAddressSDNode>(Callee)) { 2133 // If we're optimizing for minimum size and the function is called three or 2134 // more times in this block, we can improve codesize by calling indirectly 2135 // as BLXr has a 16-bit encoding. 2136 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2137 if (CLI.CS) { 2138 auto *BB = CLI.CS.getParent(); 2139 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2140 count_if(GV->users(), [&BB](const User *U) { 2141 return isa<Instruction>(U) && 2142 cast<Instruction>(U)->getParent() == BB; 2143 }) > 2; 2144 } 2145 } 2146 if (isTailCall) { 2147 // Check if it's really possible to do a tail call. 
2148 isTailCall = IsEligibleForTailCallOptimization( 2149 Callee, CallConv, isVarArg, isStructRet, 2150 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2151 PreferIndirect); 2152 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 2153 report_fatal_error("failed to perform tail call elimination on a call " 2154 "site marked musttail"); 2155 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2156 // detected sibcalls. 2157 if (isTailCall) 2158 ++NumTailCalls; 2159 } 2160 2161 // Analyze operands of the call, assigning locations to each operand. 2162 SmallVector<CCValAssign, 16> ArgLocs; 2163 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2164 *DAG.getContext()); 2165 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2166 2167 // Get a count of how many bytes are to be pushed on the stack. 2168 unsigned NumBytes = CCInfo.getNextStackOffset(); 2169 2170 if (isTailCall) { 2171 // For tail calls, memory operands are available in our caller's stack. 2172 NumBytes = 0; 2173 } else { 2174 // Adjust the stack pointer for the new arguments... 2175 // These operations are automatically eliminated by the prolog/epilog pass 2176 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2177 } 2178 2179 SDValue StackPtr = 2180 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2181 2182 RegsToPassVector RegsToPass; 2183 SmallVector<SDValue, 8> MemOpChains; 2184 2185 // Walk the register/memloc assignments, inserting copies/loads. In the case 2186 // of tail call optimization, arguments are handled later. 2187 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2188 i != e; 2189 ++i, ++realArgIdx) { 2190 CCValAssign &VA = ArgLocs[i]; 2191 SDValue Arg = OutVals[realArgIdx]; 2192 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2193 bool isByVal = Flags.isByVal(); 2194 2195 // Promote the value if needed. 
2196 switch (VA.getLocInfo()) { 2197 default: llvm_unreachable("Unknown loc info!"); 2198 case CCValAssign::Full: break; 2199 case CCValAssign::SExt: 2200 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2201 break; 2202 case CCValAssign::ZExt: 2203 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2204 break; 2205 case CCValAssign::AExt: 2206 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2207 break; 2208 case CCValAssign::BCvt: 2209 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2210 break; 2211 } 2212 2213 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2214 if (VA.needsCustom()) { 2215 if (VA.getLocVT() == MVT::v2f64) { 2216 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2217 DAG.getConstant(0, dl, MVT::i32)); 2218 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2219 DAG.getConstant(1, dl, MVT::i32)); 2220 2221 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 2222 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2223 2224 VA = ArgLocs[++i]; // skip ahead to next loc 2225 if (VA.isRegLoc()) { 2226 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 2227 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 2228 } else { 2229 assert(VA.isMemLoc()); 2230 2231 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 2232 dl, DAG, VA, Flags)); 2233 } 2234 } else { 2235 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2236 StackPtr, MemOpChains, Flags); 2237 } 2238 } else if (VA.isRegLoc()) { 2239 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2240 Outs[0].VT == MVT::i32) { 2241 assert(VA.getLocVT() == MVT::i32 && 2242 "unexpected calling convention register assignment"); 2243 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2244 "unexpected use of 'returned'"); 2245 isThisReturn = true; 2246 } 2247 const TargetOptions &Options = DAG.getTarget().Options; 2248 if (Options.EmitCallSiteInfo) 2249 
CSInfo.emplace_back(VA.getLocReg(), i); 2250 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2251 } else if (isByVal) { 2252 assert(VA.isMemLoc()); 2253 unsigned offset = 0; 2254 2255 // True if this byval aggregate will be split between registers 2256 // and memory. 2257 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2258 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2259 2260 if (CurByValIdx < ByValArgsCount) { 2261 2262 unsigned RegBegin, RegEnd; 2263 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2264 2265 EVT PtrVT = 2266 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2267 unsigned int i, j; 2268 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2269 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2270 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2271 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 2272 MachinePointerInfo(), 2273 DAG.InferPtrAlignment(AddArg)); 2274 MemOpChains.push_back(Load.getValue(1)); 2275 RegsToPass.push_back(std::make_pair(j, Load)); 2276 } 2277 2278 // If parameter size outsides register area, "offset" value 2279 // helps us to calculate stack slot for remained part properly. 
2280 offset = RegEnd - RegBegin; 2281 2282 CCInfo.nextInRegsParam(); 2283 } 2284 2285 if (Flags.getByValSize() > 4*offset) { 2286 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2287 unsigned LocMemOffset = VA.getLocMemOffset(); 2288 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2289 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2290 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2291 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2292 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2293 MVT::i32); 2294 SDValue AlignNode = 2295 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); 2296 2297 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2298 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2299 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2300 Ops)); 2301 } 2302 } else if (!isTailCall) { 2303 assert(VA.isMemLoc()); 2304 2305 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2306 dl, DAG, VA, Flags)); 2307 } 2308 } 2309 2310 if (!MemOpChains.empty()) 2311 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2312 2313 // Build a sequence of copy-to-reg nodes chained together with token chain 2314 // and flag operands which copy the outgoing args into the appropriate regs. 2315 SDValue InFlag; 2316 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2317 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2318 RegsToPass[i].second, InFlag); 2319 InFlag = Chain.getValue(1); 2320 } 2321 2322 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2323 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2324 // node so that legalize doesn't hack it. 
2325 bool isDirect = false; 2326 2327 const TargetMachine &TM = getTargetMachine(); 2328 const Module *Mod = MF.getFunction().getParent(); 2329 const GlobalValue *GV = nullptr; 2330 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2331 GV = G->getGlobal(); 2332 bool isStub = 2333 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2334 2335 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2336 bool isLocalARMFunc = false; 2337 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2338 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2339 2340 if (Subtarget->genLongCalls()) { 2341 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2342 "long-calls codegen is not position independent!"); 2343 // Handle a global address or an external symbol. If it's not one of 2344 // those, the target's already in a register, so we don't need to do 2345 // anything extra. 2346 if (isa<GlobalAddressSDNode>(Callee)) { 2347 // Create a constant pool entry for the callee address 2348 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2349 ARMConstantPoolValue *CPV = 2350 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2351 2352 // Get the address of the callee into a register 2353 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2354 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2355 Callee = DAG.getLoad( 2356 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2357 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2358 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2359 const char *Sym = S->getSymbol(); 2360 2361 // Create a constant pool entry for the callee address 2362 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2363 ARMConstantPoolValue *CPV = 2364 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2365 ARMPCLabelIndex, 0); 2366 // Get the address of the callee into a register 2367 SDValue CPAddr = 
DAG.getTargetConstantPool(CPV, PtrVt, 4); 2368 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2369 Callee = DAG.getLoad( 2370 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2371 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2372 } 2373 } else if (isa<GlobalAddressSDNode>(Callee)) { 2374 if (!PreferIndirect) { 2375 isDirect = true; 2376 bool isDef = GV->isStrongDefinitionForLinker(); 2377 2378 // ARM call to a local ARM function is predicable. 2379 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2380 // tBX takes a register source operand. 2381 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2382 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2383 Callee = DAG.getNode( 2384 ARMISD::WrapperPIC, dl, PtrVt, 2385 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2386 Callee = DAG.getLoad( 2387 PtrVt, dl, DAG.getEntryNode(), Callee, 2388 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2389 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2390 MachineMemOperand::MOInvariant); 2391 } else if (Subtarget->isTargetCOFF()) { 2392 assert(Subtarget->isTargetWindows() && 2393 "Windows is the only supported COFF target"); 2394 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2395 if (GV->hasDLLImportStorageClass()) 2396 TargetFlags = ARMII::MO_DLLIMPORT; 2397 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2398 TargetFlags = ARMII::MO_COFFSTUB; 2399 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2400 TargetFlags); 2401 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2402 Callee = 2403 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2404 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2405 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2406 } else { 2407 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2408 } 2409 } 2410 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2411 isDirect 
= true; 2412 // tBX takes a register source operand. 2413 const char *Sym = S->getSymbol(); 2414 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2415 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2416 ARMConstantPoolValue *CPV = 2417 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2418 ARMPCLabelIndex, 4); 2419 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2420 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2421 Callee = DAG.getLoad( 2422 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2423 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2424 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2425 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2426 } else { 2427 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2428 } 2429 } 2430 2431 // FIXME: handle tail calls differently. 2432 unsigned CallOpc; 2433 if (Subtarget->isThumb()) { 2434 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2435 CallOpc = ARMISD::CALL_NOLINK; 2436 else 2437 CallOpc = ARMISD::CALL; 2438 } else { 2439 if (!isDirect && !Subtarget->hasV5TOps()) 2440 CallOpc = ARMISD::CALL_NOLINK; 2441 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2442 // Emit regular call when code size is the priority 2443 !Subtarget->hasMinSize()) 2444 // "mov lr, pc; b _foo" to avoid confusing the RSP 2445 CallOpc = ARMISD::CALL_NOLINK; 2446 else 2447 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2448 } 2449 2450 std::vector<SDValue> Ops; 2451 Ops.push_back(Chain); 2452 Ops.push_back(Callee); 2453 2454 // Add argument registers to the end of the list so that they are known live 2455 // into the call. 2456 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2457 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2458 RegsToPass[i].second.getValueType())); 2459 2460 // Add a register mask operand representing the call-preserved registers. 
2461 if (!isTailCall) { 2462 const uint32_t *Mask; 2463 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2464 if (isThisReturn) { 2465 // For 'this' returns, use the R0-preserving mask if applicable 2466 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2467 if (!Mask) { 2468 // Set isThisReturn to false if the calling convention is not one that 2469 // allows 'returned' to be modeled in this way, so LowerCallResult does 2470 // not try to pass 'this' straight through 2471 isThisReturn = false; 2472 Mask = ARI->getCallPreservedMask(MF, CallConv); 2473 } 2474 } else 2475 Mask = ARI->getCallPreservedMask(MF, CallConv); 2476 2477 assert(Mask && "Missing call preserved mask for calling convention"); 2478 Ops.push_back(DAG.getRegisterMask(Mask)); 2479 } 2480 2481 if (InFlag.getNode()) 2482 Ops.push_back(InFlag); 2483 2484 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2485 if (isTailCall) { 2486 MF.getFrameInfo().setHasTailCall(); 2487 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2488 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2489 return Ret; 2490 } 2491 2492 // Returns a chain and a flag for retval copy to use. 2493 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2494 InFlag = Chain.getValue(1); 2495 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2496 2497 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2498 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2499 if (!Ins.empty()) 2500 InFlag = Chain.getValue(1); 2501 2502 // Handle result values, copying them out of physregs into vregs that we 2503 // return. 2504 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 2505 InVals, isThisReturn, 2506 isThisReturn ? OutVals[0] : SDValue()); 2507 } 2508 2509 /// HandleByVal - Every parameter *after* a byval parameter is passed 2510 /// on the stack. 
Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to insure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  // Skip ("waste") argument registers until the first allocated register
  // satisfies the byval alignment, measured in 4-byte GPR units.
  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  // Number of bytes coverable by the remaining argument GPRs [Reg, r4).
  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and parameter size greater than size of
  // all remained GPR regs. In that case we can't split parameter, we must
  // send it to stack. We also must set NCRN to R4, so waste all
  // remained registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // First register for byval parameter is the first register that wasn't
  // allocated before this method call, so it would be "reg".
  // If parameter is small enough to be saved in range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs,
  // else parameter would be splitted between registers and stack,
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, first register is allocated in the beginning of function already,
  // allocate remained amount of registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  // FI is set below to the fixed frame index backing the argument, if one
  // can be identified; the sentinel means "not found".
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
    const bool isIndirect) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForReturn(CalleeCC, isVarArg),
                                  CCAssignFnForReturn(CallerCC, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations. The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}

/// CanLowerReturn - Checks whether the return values described by Outs can
/// all be lowered for this calling convention, by delegating to the
/// calling-convention return checker.
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}

/// LowerInterruptReturn - Build an ARMISD::INTRET_FLAG return, inserting the
/// LR adjustment required by the function's "interrupt" attribute kind as
/// operand #1 of the return node.
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //    IRQ/FIQ: +4 "subs pc, lr, #4"
  //    SWI:     0  "subs pc, lr, #0"
  //    ABORT:   +4 "subs pc, lr, #4"
  //    UNDEF:   +4/+2 "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
}

SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // Skip the bitcast when we already unwrapped the f16 producer above.
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
  }
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}

/// isUsedByReturnOnly - Return true if the single-result node N is consumed
/// (possibly via CopyToReg, an ARMISD::VMOVRRD GPR-pair split, or a BITCAST)
/// only by ARMISD::RET_FLAG / ARMISD::INTRET_FLAG return nodes. On success,
/// Chain is updated to the chain that a tail call should use.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

/// mayBeEmittedAsTailCall - A tail call may only be emitted when the
/// subtarget supports tail calls and the call instruction is marked "tail".
bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  if (!CI->isTailCall())
    return false;

  return true;
}

// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc.
can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    // Promote the pool entry to an internal global variable with a unique,
    // per-function name, then lower it like any other global address.
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                    );
    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

unsigned ARMTargetLowering::getJumpTableEncoding() const {
  // ARM always emits jump table entries inline in the function body.
  return MachineJumpTableInfo::EK_Inline;
}

/// LowerBlockAddress - Lower a BlockAddress node to a constant-pool load;
/// for position-independent (or ROPI) code the loaded value is additionally
/// combined with a PC label via ARMISD::PIC_ADD.
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  } else {
    // PC-relative offset differs between ARM (+8) and Thumb (+4) modes.
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  // The descriptor call below adjusts the stack.
  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call.
  // This is just a degenerate version of a normal ARM call node: r0 takes the
  // address of the descriptor, and the call returns the address of the
  // variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}

/// Lower a TLS address for Windows on ARM: walk from the TEB to the TLS
/// array, index by _tls_index, then add the variable's SECREL offset.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block) via
  // mrc p15, #0, <Rd>, c13, c0, #2 (reads the software thread ID register).
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // (i.e. shifted left by 2) into the TLS array.
  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  // Scale the index by 4 and load the thread's TLS data area pointer.
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                                    DAG.getTargetConstantPool(CPV, PtrVT, 4)),
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // The PC offset of the PIC label differs between ARM (8) and Thumb (4).
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    // Load the address of the GOTTPOFF slot from the constant pool ...
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // ... then load the thread-pointer-relative offset itself.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  // Sum the thread pointer and the variable's offset.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

/// Top-level TLS dispatch: emulated TLS, the Darwin descriptor scheme, the
/// Windows TEB scheme, or one of the ELF access models.
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());

  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      // Local dynamic falls back to general dynamic (see TODO above).
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}

/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    // Look through ConstantExprs to the instructions that ultimately use them.
    if (isa<ConstantExpr>(U)) {
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    // A non-instruction user, or an instruction in another function, escapes.
    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

/// Try to promote a small constant global into an inline constant-pool
/// entry; returns the wrapped pool address, or an empty SDValue when
/// promotion is not possible or not profitable.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant
  // pool to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  // Only constant, local, unnamed_addr globals with an initializer qualify.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge.
  // If we haven't promoted this global yet (it may have multiple uses), and
  // promoting it would increase the constant pool size (Sz > 4), ensure we
  // have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr =
    DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    // Record the promotion and account for the constant-pool growth exactly
    // once per global.
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}

/// A global is treated as read-only if it is a constant GlobalVariable or a
/// Function; aliases are looked through to their base object first.
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getBaseObject()))
      return false;
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  return isa<Function>(GV);
}

/// Dispatch global-address lowering on the target's object file format.
SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  switch (Subtarget->getTargetTriple().getObjectFormat()) {
  default: llvm_unreachable("unknown object format");
  case Triple::COFF:
    return LowerGlobalAddressWindows(Op, DAG);
  case Triple::ELF:
    return LowerGlobalAddressELF(Op, DAG);
  case Triple::MachO:
    return LowerGlobalAddressDarwin(Op, DAG);
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    // Non-DSO-local globals go through a GOT_PREL entry.
    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                           UseGOT_PREL ? ARMII::MO_GOT : 0);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (UseGOT_PREL)
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative.
    SDValue RelAddr;
    if (Subtarget->useMovt()) {
      // Materialize the SB-relative offset with a movw/movt pair.
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
    // Add the offset to the static base (held in R9).
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    // No movw/movt support: load the address from a literal-pool entry.
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  unsigned Wrapper =
      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);

  // Indirect symbols need an extra load through their non-lazy pointer.
  if (Subtarget->isGVIndirectSymbol(GV))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  // dllimport symbols and non-DSO-local symbols are reached through an
  // indirection (import table entry / COFF stub).
  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  if (GV->hasDLLImportStorageClass())
    TargetFlags = ARMII::MO_DLLIMPORT;
  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    TargetFlags = ARMII::MO_COFFSTUB;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
                                                  TargetFlags));
  // Indirected symbols hold the real address in memory; load through it.
  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = DAG.getConstant(0, dl, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                     Op.getOperand(1), Val);
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
}

SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc dl(Op);
  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
                     Op.getOperand(0));
}

SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
    SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  // The intrinsic ID is operand 0, or operand 1 when operand 0 is the chain.
  unsigned IntNo =
      cast<ConstantSDNode>(
          Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
          ->getZExtValue();
  switch (IntNo) {
  default:
    return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::arm_gnu_eabi_mcount: {
    MachineFunction &MF = DAG.getMachineFunction();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDLoc dl(Op);
    SDValue Chain = Op.getOperand(0);
    // call "\01__gnu_mcount_nc"
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    const uint32_t *Mask =
        ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
    assert(Mask && "Missing call preserved mask for calling convention");
    // Mark LR an implicit live-in.
    unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
    SDValue ReturnAddress =
        DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
    constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
    SDValue Callee =
        DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
    SDValue RegisterMask = DAG.getRegisterMask(Mask);
    // Thumb calls carry an explicit predicate operand pair.
    if (Subtarget->isThumb())
      return SDValue(
          DAG.getMachineNode(
              ARM::tBL_PUSHLR, dl, ResultTys,
              {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
               DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
          0);
    return SDValue(
        DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
                           {ReturnAddress, Callee, RegisterMask, Chain}),
        0);
  }
  }
}

SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::arm_cls: {
    // cls(x) expands to clz((x ^ (x >> 31)) << 1 | 1): the xor folds the
    // sign bit away so leading-sign-bit counting reduces to clz.
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();
    SDValue SRA =
        DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
    SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
    SDValue SHL =
        DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
    SDValue OR =
        DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
    SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
    return Result;
  }
  case Intrinsic::arm_cls64: {
    // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
    //          else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();

    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
                             DAG.getConstant(1, dl, VTy));
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
                             DAG.getConstant(0, dl, VTy));
    SDValue Constant0 = DAG.getConstant(0, dl, VTy);
    SDValue Constant1 = DAG.getConstant(1, dl, VTy);
    SDValue Constant31 = DAG.getConstant(31, dl, VTy);
    SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
    SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
    SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
    SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
    SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
    SDValue CheckLo =
        DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
    SDValue HiIsZero =
        DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
    SDValue AdjustedLo =
        DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
    SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
    SDValue Result =
        DAG.getSelect(dl, VTy, CheckLo,
                      DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
    return Result;
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue CPAddr;
    bool IsPositionIndependent = isPositionIndependent();
    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

    if (IsPositionIndependent) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
      ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
      ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
      ? ISD::FMINIMUM : ISD::FMAXIMUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_pred_i2v:
  case Intrinsic::arm_mve_pred_v2i:
    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_vreinterpretq:
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_lsll:
    return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_asrl:
    return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}

static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  // Operand 2 is the sync-scope ID; single-thread fences need no barrier.
  ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  if (SSID == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
3860 if (!(Subtarget->isThumb2() || 3861 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3862 // Just preserve the chain. 3863 return Op.getOperand(0); 3864 3865 SDLoc dl(Op); 3866 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3867 if (!isRead && 3868 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3869 // ARMv7 with MP extension has PLDW. 3870 return Op.getOperand(0); 3871 3872 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3873 if (Subtarget->isThumb()) { 3874 // Invert the bits. 3875 isRead = ~isRead & 1; 3876 isData = ~isData & 1; 3877 } 3878 3879 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3880 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3881 DAG.getConstant(isData, dl, MVT::i32)); 3882 } 3883 3884 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3885 MachineFunction &MF = DAG.getMachineFunction(); 3886 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3887 3888 // vastart just stores the address of the VarArgsFrameIndex slot into the 3889 // memory location argument. 
3890 SDLoc dl(Op); 3891 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3892 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3893 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3894 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3895 MachinePointerInfo(SV)); 3896 } 3897 3898 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3899 CCValAssign &NextVA, 3900 SDValue &Root, 3901 SelectionDAG &DAG, 3902 const SDLoc &dl) const { 3903 MachineFunction &MF = DAG.getMachineFunction(); 3904 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3905 3906 const TargetRegisterClass *RC; 3907 if (AFI->isThumb1OnlyFunction()) 3908 RC = &ARM::tGPRRegClass; 3909 else 3910 RC = &ARM::GPRRegClass; 3911 3912 // Transform the arguments stored in physical registers into virtual ones. 3913 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3914 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3915 3916 SDValue ArgValue2; 3917 if (NextVA.isMemLoc()) { 3918 MachineFrameInfo &MFI = MF.getFrameInfo(); 3919 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3920 3921 // Create load node to retrieve arguments from the stack. 3922 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3923 ArgValue2 = DAG.getLoad( 3924 MVT::i32, dl, Root, FIN, 3925 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3926 } else { 3927 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3928 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3929 } 3930 if (!Subtarget->isLittle()) 3931 std::swap (ArgValue, ArgValue2); 3932 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3933 } 3934 3935 // The remaining GPRs hold either the beginning of variable-argument 3936 // data, or the beginning of an aggregate passed by value (usually 3937 // byval). 
Either way, we allocate stack slots adjacent to the data 3938 // provided by our caller, and store the unallocated registers there. 3939 // If this is a variadic function, the va_list pointer will begin with 3940 // these values; otherwise, this reassembles a (byval) structure that 3941 // was split between registers and memory. 3942 // Return: The frame index registers were stored into. 3943 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3944 const SDLoc &dl, SDValue &Chain, 3945 const Value *OrigArg, 3946 unsigned InRegsParamRecordIdx, 3947 int ArgOffset, unsigned ArgSize) const { 3948 // Currently, two use-cases possible: 3949 // Case #1. Non-var-args function, and we meet first byval parameter. 3950 // Setup first unallocated register as first byval register; 3951 // eat all remained registers 3952 // (these two actions are performed by HandleByVal method). 3953 // Then, here, we initialize stack frame with 3954 // "store-reg" instructions. 3955 // Case #2. Var-args function, that doesn't contain byval parameters. 3956 // The same: eat all remained unallocated registers, 3957 // initialize stack frame. 3958 3959 MachineFunction &MF = DAG.getMachineFunction(); 3960 MachineFrameInfo &MFI = MF.getFrameInfo(); 3961 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3962 unsigned RBegin, REnd; 3963 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3964 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3965 } else { 3966 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3967 RBegin = RBeginIdx == 4 ? 
(unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3968 REnd = ARM::R4; 3969 } 3970 3971 if (REnd != RBegin) 3972 ArgOffset = -4 * (ARM::R4 - RBegin); 3973 3974 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3975 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3976 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3977 3978 SmallVector<SDValue, 4> MemOps; 3979 const TargetRegisterClass *RC = 3980 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3981 3982 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3983 unsigned VReg = MF.addLiveIn(Reg, RC); 3984 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3985 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3986 MachinePointerInfo(OrigArg, 4 * i)); 3987 MemOps.push_back(Store); 3988 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3989 } 3990 3991 if (!MemOps.empty()) 3992 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3993 return FrameIndex; 3994 } 3995 3996 // Setup stack frame, the va_list pointer will start from. 3997 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3998 const SDLoc &dl, SDValue &Chain, 3999 unsigned ArgOffset, 4000 unsigned TotalArgRegsSaveSize, 4001 bool ForceMutable) const { 4002 MachineFunction &MF = DAG.getMachineFunction(); 4003 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4004 4005 // Try to store any remaining integer argument regs 4006 // to their spots on the stack so that they may be loaded by dereferencing 4007 // the result of va_next. 4008 // If there is no regs to be stored, just point address after last 4009 // argument passed via stack. 
4010 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 4011 CCInfo.getInRegsParamsCount(), 4012 CCInfo.getNextStackOffset(), 4013 std::max(4U, TotalArgRegsSaveSize)); 4014 AFI->setVarArgsFrameIndex(FrameIndex); 4015 } 4016 4017 SDValue ARMTargetLowering::LowerFormalArguments( 4018 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 4019 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4020 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4021 MachineFunction &MF = DAG.getMachineFunction(); 4022 MachineFrameInfo &MFI = MF.getFrameInfo(); 4023 4024 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4025 4026 // Assign locations to all of the incoming arguments. 4027 SmallVector<CCValAssign, 16> ArgLocs; 4028 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 4029 *DAG.getContext()); 4030 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4031 4032 SmallVector<SDValue, 16> ArgValues; 4033 SDValue ArgValue; 4034 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4035 unsigned CurArgIdx = 0; 4036 4037 // Initially ArgRegsSaveSize is zero. 4038 // Then we increase this value each time we meet byval parameter. 4039 // We also increase this value in case of varargs function. 4040 AFI->setArgRegsSaveSize(0); 4041 4042 // Calculate the amount of stack space that we need to allocate to store 4043 // byval and variadic arguments that are passed in registers. 4044 // We need to know this before we allocate the first byval or variadic 4045 // argument, as they will be allocated a stack slot below the CFA (Canonical 4046 // Frame Address, the stack pointer at entry to the function). 
4047 unsigned ArgRegBegin = ARM::R4; 4048 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4049 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4050 break; 4051 4052 CCValAssign &VA = ArgLocs[i]; 4053 unsigned Index = VA.getValNo(); 4054 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4055 if (!Flags.isByVal()) 4056 continue; 4057 4058 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4059 unsigned RBegin, REnd; 4060 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4061 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4062 4063 CCInfo.nextInRegsParam(); 4064 } 4065 CCInfo.rewindByValRegsInfo(); 4066 4067 int lastInsIndex = -1; 4068 if (isVarArg && MFI.hasVAStart()) { 4069 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4070 if (RegIdx != array_lengthof(GPRArgRegs)) 4071 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4072 } 4073 4074 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4075 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4076 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4077 4078 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4079 CCValAssign &VA = ArgLocs[i]; 4080 if (Ins[VA.getValNo()].isOrigArg()) { 4081 std::advance(CurOrigArg, 4082 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4083 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4084 } 4085 // Arguments stored in registers. 4086 if (VA.isRegLoc()) { 4087 EVT RegVT = VA.getLocVT(); 4088 4089 if (VA.needsCustom()) { 4090 // f64 and vector types are split up into multiple registers or 4091 // combinations of registers and stack slots. 
4092 if (VA.getLocVT() == MVT::v2f64) { 4093 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 4094 Chain, DAG, dl); 4095 VA = ArgLocs[++i]; // skip ahead to next loc 4096 SDValue ArgValue2; 4097 if (VA.isMemLoc()) { 4098 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4099 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4100 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 4101 MachinePointerInfo::getFixedStack( 4102 DAG.getMachineFunction(), FI)); 4103 } else { 4104 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 4105 Chain, DAG, dl); 4106 } 4107 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4108 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4109 ArgValue, ArgValue1, 4110 DAG.getIntPtrConstant(0, dl)); 4111 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 4112 ArgValue, ArgValue2, 4113 DAG.getIntPtrConstant(1, dl)); 4114 } else 4115 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4116 } else { 4117 const TargetRegisterClass *RC; 4118 4119 4120 if (RegVT == MVT::f16) 4121 RC = &ARM::HPRRegClass; 4122 else if (RegVT == MVT::f32) 4123 RC = &ARM::SPRRegClass; 4124 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 4125 RC = &ARM::DPRRegClass; 4126 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 4127 RC = &ARM::QPRRegClass; 4128 else if (RegVT == MVT::i32) 4129 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4130 : &ARM::GPRRegClass; 4131 else 4132 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4133 4134 // Transform the arguments in physical registers into virtual ones. 4135 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4136 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4137 4138 // If this value is passed in r0 and has the returned attribute (e.g. 4139 // C++ 'structors), record this fact for later use. 
4140 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4141 AFI->setPreservesR0(); 4142 } 4143 } 4144 4145 // If this is an 8 or 16-bit value, it is really passed promoted 4146 // to 32 bits. Insert an assert[sz]ext to capture this, then 4147 // truncate to the right size. 4148 switch (VA.getLocInfo()) { 4149 default: llvm_unreachable("Unknown loc info!"); 4150 case CCValAssign::Full: break; 4151 case CCValAssign::BCvt: 4152 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4153 break; 4154 case CCValAssign::SExt: 4155 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4156 DAG.getValueType(VA.getValVT())); 4157 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4158 break; 4159 case CCValAssign::ZExt: 4160 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4161 DAG.getValueType(VA.getValVT())); 4162 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4163 break; 4164 } 4165 4166 InVals.push_back(ArgValue); 4167 } else { // VA.isRegLoc() 4168 // sanity check 4169 assert(VA.isMemLoc()); 4170 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4171 4172 int index = VA.getValNo(); 4173 4174 // Some Ins[] entries become multiple ArgLoc[] entries. 4175 // Process them only once. 4176 if (index != lastInsIndex) 4177 { 4178 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4179 // FIXME: For now, all byval parameter objects are marked mutable. 4180 // This can be changed with more analysis. 4181 // In case of tail call optimization mark all arguments mutable. 4182 // Since they could be overwritten by lowering of arguments in case of 4183 // a tail call. 
4184 if (Flags.isByVal()) { 4185 assert(Ins[index].isOrigArg() && 4186 "Byval arguments cannot be implicit"); 4187 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4188 4189 int FrameIndex = StoreByValRegs( 4190 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4191 VA.getLocMemOffset(), Flags.getByValSize()); 4192 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4193 CCInfo.nextInRegsParam(); 4194 } else { 4195 unsigned FIOffset = VA.getLocMemOffset(); 4196 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4197 FIOffset, true); 4198 4199 // Create load nodes to retrieve arguments from the stack. 4200 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4201 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4202 MachinePointerInfo::getFixedStack( 4203 DAG.getMachineFunction(), FI))); 4204 } 4205 lastInsIndex = index; 4206 } 4207 } 4208 } 4209 4210 // varargs 4211 if (isVarArg && MFI.hasVAStart()) 4212 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 4213 CCInfo.getNextStackOffset(), 4214 TotalArgRegsSaveSize); 4215 4216 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 4217 4218 return Chain; 4219 } 4220 4221 /// isFloatingPointZero - Return true if this is +0.0. 4222 static bool isFloatingPointZero(SDValue Op) { 4223 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4224 return CFP->getValueAPF().isPosZero(); 4225 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4226 // Maybe this has already been legalized into the constant pool? 
4227 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4228 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4229 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4230 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4231 return CFP->getValueAPF().isPosZero(); 4232 } 4233 } else if (Op->getOpcode() == ISD::BITCAST && 4234 Op->getValueType(0) == MVT::f64) { 4235 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4236 // created by LowerConstantFP(). 4237 SDValue BitcastOp = Op->getOperand(0); 4238 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4239 isNullConstant(BitcastOp->getOperand(0))) 4240 return true; 4241 } 4242 return false; 4243 } 4244 4245 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4246 /// the given operands. 4247 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4248 SDValue &ARMcc, SelectionDAG &DAG, 4249 const SDLoc &dl) const { 4250 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4251 unsigned C = RHSC->getZExtValue(); 4252 if (!isLegalICmpImmediate((int32_t)C)) { 4253 // Constant does not fit, try adjusting it by one. 4254 switch (CC) { 4255 default: break; 4256 case ISD::SETLT: 4257 case ISD::SETGE: 4258 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4259 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4260 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4261 } 4262 break; 4263 case ISD::SETULT: 4264 case ISD::SETUGE: 4265 if (C != 0 && isLegalICmpImmediate(C-1)) { 4266 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 4267 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4268 } 4269 break; 4270 case ISD::SETLE: 4271 case ISD::SETGT: 4272 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4273 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 4274 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4275 } 4276 break; 4277 case ISD::SETULE: 4278 case ISD::SETUGT: 4279 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4280 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4281 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4282 } 4283 break; 4284 } 4285 } 4286 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4287 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4288 // In ARM and Thumb-2, the compare instructions can shift their second 4289 // operand. 4290 CC = ISD::getSetCCSwappedOperands(CC); 4291 std::swap(LHS, RHS); 4292 } 4293 4294 // Thumb1 has very limited immediate modes, so turning an "and" into a 4295 // shift can save multiple instructions. 4296 // 4297 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4298 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4299 // own. If it's the operand to an unsigned comparison with an immediate, 4300 // we can eliminate one of the shifts: we transform 4301 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4302 // 4303 // We avoid transforming cases which aren't profitable due to encoding 4304 // details: 4305 // 4306 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4307 // would not; in that case, we're essentially trading one immediate load for 4308 // another. 4309 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4310 // 3. C2 is zero; we have other code for this special case. 4311 // 4312 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4313 // instruction, since the AND is always one instruction anyway, but we could 4314 // use narrow instructions in some cases. 
4315 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4316 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4317 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4318 !isSignedIntSetCC(CC)) { 4319 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4320 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4321 uint64_t RHSV = RHSC->getZExtValue(); 4322 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4323 unsigned ShiftBits = countLeadingZeros(Mask); 4324 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4325 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4326 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4327 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4328 } 4329 } 4330 } 4331 4332 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4333 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4334 // way a cmp would. 4335 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4336 // some tweaks to the heuristics for the previous and->shift transform. 4337 // FIXME: Optimize cases where the LHS isn't a shift. 
4338 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4339 isa<ConstantSDNode>(RHS) && 4340 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4341 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4342 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4343 unsigned ShiftAmt = 4344 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4345 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4346 DAG.getVTList(MVT::i32, MVT::i32), 4347 LHS.getOperand(0), 4348 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4349 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4350 Shift.getValue(1), SDValue()); 4351 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4352 return Chain.getValue(1); 4353 } 4354 4355 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4356 4357 // If the RHS is a constant zero then the V (overflow) flag will never be 4358 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4359 // simpler for other passes (like the peephole optimiser) to deal with. 4360 if (isNullConstant(RHS)) { 4361 switch (CondCode) { 4362 default: break; 4363 case ARMCC::GE: 4364 CondCode = ARMCC::PL; 4365 break; 4366 case ARMCC::LT: 4367 CondCode = ARMCC::MI; 4368 break; 4369 } 4370 } 4371 4372 ARMISD::NodeType CompareType; 4373 switch (CondCode) { 4374 default: 4375 CompareType = ARMISD::CMP; 4376 break; 4377 case ARMCC::EQ: 4378 case ARMCC::NE: 4379 // Uses only Z Flag 4380 CompareType = ARMISD::CMPZ; 4381 break; 4382 } 4383 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4384 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4385 } 4386 4387 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
4388 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4389 SelectionDAG &DAG, const SDLoc &dl, 4390 bool Signaling) const { 4391 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4392 SDValue Cmp; 4393 if (!isFloatingPointZero(RHS)) 4394 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4395 dl, MVT::Glue, LHS, RHS); 4396 else 4397 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4398 dl, MVT::Glue, LHS); 4399 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4400 } 4401 4402 /// duplicateCmp - Glue values can have only one use, so this function 4403 /// duplicates a comparison node. 4404 SDValue 4405 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4406 unsigned Opc = Cmp.getOpcode(); 4407 SDLoc DL(Cmp); 4408 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4409 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4410 4411 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4412 Cmp = Cmp.getOperand(0); 4413 Opc = Cmp.getOpcode(); 4414 if (Opc == ARMISD::CMPFP) 4415 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4416 else { 4417 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4418 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4419 } 4420 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4421 } 4422 4423 // This function returns three things: the arithmetic computation itself 4424 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4425 // comparison and the condition code define the case in which the arithmetic 4426 // computation *does not* overflow. 
4427 std::pair<SDValue, SDValue> 4428 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 4429 SDValue &ARMcc) const { 4430 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 4431 4432 SDValue Value, OverflowCmp; 4433 SDValue LHS = Op.getOperand(0); 4434 SDValue RHS = Op.getOperand(1); 4435 SDLoc dl(Op); 4436 4437 // FIXME: We are currently always generating CMPs because we don't support 4438 // generating CMN through the backend. This is not as good as the natural 4439 // CMP case because it causes a register dependency and cannot be folded 4440 // later. 4441 4442 switch (Op.getOpcode()) { 4443 default: 4444 llvm_unreachable("Unknown overflow instruction!"); 4445 case ISD::SADDO: 4446 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4447 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 4448 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4449 break; 4450 case ISD::UADDO: 4451 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4452 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 4453 // We do not use it in the USUBO case as Value may not be used. 4454 Value = DAG.getNode(ARMISD::ADDC, dl, 4455 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 4456 .getValue(0); 4457 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 4458 break; 4459 case ISD::SSUBO: 4460 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 4461 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4462 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4463 break; 4464 case ISD::USUBO: 4465 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4466 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4467 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4468 break; 4469 case ISD::UMULO: 4470 // We generate a UMUL_LOHI and then check if the high word is 0. 
4471 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4472 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4473 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4474 LHS, RHS); 4475 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4476 DAG.getConstant(0, dl, MVT::i32)); 4477 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4478 break; 4479 case ISD::SMULO: 4480 // We generate a SMUL_LOHI and then check if all the bits of the high word 4481 // are the same as the sign bit of the low word. 4482 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4483 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4484 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4485 LHS, RHS); 4486 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4487 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4488 Value.getValue(0), 4489 DAG.getConstant(31, dl, MVT::i32))); 4490 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4491 break; 4492 } // switch (...) 4493 4494 return std::make_pair(Value, OverflowCmp); 4495 } 4496 4497 SDValue 4498 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4499 // Let legalize expand this if it isn't a legal type yet. 4500 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4501 return SDValue(); 4502 4503 SDValue Value, OverflowCmp; 4504 SDValue ARMcc; 4505 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4506 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4507 SDLoc dl(Op); 4508 // We use 0 and 1 as false and true values. 
4509 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4510 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4511 EVT VT = Op.getValueType(); 4512 4513 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4514 ARMcc, CCR, OverflowCmp); 4515 4516 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4517 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4518 } 4519 4520 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4521 SelectionDAG &DAG) { 4522 SDLoc DL(BoolCarry); 4523 EVT CarryVT = BoolCarry.getValueType(); 4524 4525 // This converts the boolean value carry into the carry flag by doing 4526 // ARMISD::SUBC Carry, 1 4527 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4528 DAG.getVTList(CarryVT, MVT::i32), 4529 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4530 return Carry.getValue(1); 4531 } 4532 4533 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4534 SelectionDAG &DAG) { 4535 SDLoc DL(Flags); 4536 4537 // Now convert the carry flag into a boolean carry. We do this 4538 // using ARMISD:ADDE 0, 0, Carry 4539 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4540 DAG.getConstant(0, DL, MVT::i32), 4541 DAG.getConstant(0, DL, MVT::i32), Flags); 4542 } 4543 4544 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4545 SelectionDAG &DAG) const { 4546 // Let legalize expand this if it isn't a legal type yet. 4547 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4548 return SDValue(); 4549 4550 SDValue LHS = Op.getOperand(0); 4551 SDValue RHS = Op.getOperand(1); 4552 SDLoc dl(Op); 4553 4554 EVT VT = Op.getValueType(); 4555 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4556 SDValue Value; 4557 SDValue Overflow; 4558 switch (Op.getOpcode()) { 4559 default: 4560 llvm_unreachable("Unknown overflow instruction!"); 4561 case ISD::UADDO: 4562 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4563 // Convert the carry flag into a boolean value. 
4564 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4565 break; 4566 case ISD::USUBO: { 4567 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4568 // Convert the carry flag into a boolean value. 4569 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4570 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4571 // value. So compute 1 - C. 4572 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4573 DAG.getConstant(1, dl, MVT::i32), Overflow); 4574 break; 4575 } 4576 } 4577 4578 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4579 } 4580 4581 static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG, 4582 const ARMSubtarget *Subtarget) { 4583 EVT VT = Op.getValueType(); 4584 if (!Subtarget->hasDSP()) 4585 return SDValue(); 4586 if (!VT.isSimple()) 4587 return SDValue(); 4588 4589 unsigned NewOpcode; 4590 bool IsAdd = Op->getOpcode() == ISD::SADDSAT; 4591 switch (VT.getSimpleVT().SimpleTy) { 4592 default: 4593 return SDValue(); 4594 case MVT::i8: 4595 NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b; 4596 break; 4597 case MVT::i16: 4598 NewOpcode = IsAdd ? 
ARMISD::QADD16b : ARMISD::QSUB16b; 4599 break; 4600 } 4601 4602 SDLoc dl(Op); 4603 SDValue Add = 4604 DAG.getNode(NewOpcode, dl, MVT::i32, 4605 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32), 4606 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32)); 4607 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add); 4608 } 4609 4610 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4611 SDValue Cond = Op.getOperand(0); 4612 SDValue SelectTrue = Op.getOperand(1); 4613 SDValue SelectFalse = Op.getOperand(2); 4614 SDLoc dl(Op); 4615 unsigned Opc = Cond.getOpcode(); 4616 4617 if (Cond.getResNo() == 1 && 4618 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4619 Opc == ISD::USUBO)) { 4620 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4621 return SDValue(); 4622 4623 SDValue Value, OverflowCmp; 4624 SDValue ARMcc; 4625 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4626 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4627 EVT VT = Op.getValueType(); 4628 4629 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4630 OverflowCmp, DAG); 4631 } 4632 4633 // Convert: 4634 // 4635 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4636 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4637 // 4638 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4639 const ConstantSDNode *CMOVTrue = 4640 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4641 const ConstantSDNode *CMOVFalse = 4642 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4643 4644 if (CMOVTrue && CMOVFalse) { 4645 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4646 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4647 4648 SDValue True; 4649 SDValue False; 4650 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4651 True = SelectTrue; 4652 False = SelectFalse; 4653 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4654 True = SelectFalse; 4655 False = SelectTrue; 4656 } 4657 4658 if (True.getNode() && False.getNode()) 
{ 4659 EVT VT = Op.getValueType(); 4660 SDValue ARMcc = Cond.getOperand(2); 4661 SDValue CCR = Cond.getOperand(3); 4662 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4663 assert(True.getValueType() == VT); 4664 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4665 } 4666 } 4667 } 4668 4669 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4670 // undefined bits before doing a full-word comparison with zero. 4671 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4672 DAG.getConstant(1, dl, Cond.getValueType())); 4673 4674 return DAG.getSelectCC(dl, Cond, 4675 DAG.getConstant(0, dl, Cond.getValueType()), 4676 SelectTrue, SelectFalse, ISD::SETNE); 4677 } 4678 4679 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4680 bool &swpCmpOps, bool &swpVselOps) { 4681 // Start by selecting the GE condition code for opcodes that return true for 4682 // 'equality' 4683 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4684 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4685 CondCode = ARMCC::GE; 4686 4687 // and GT for opcodes that return false for 'equality'. 4688 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4689 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4690 CondCode = ARMCC::GT; 4691 4692 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4693 // to swap the compare operands. 4694 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4695 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4696 swpCmpOps = true; 4697 4698 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4699 // If we have an unordered opcode, we need to swap the operands to the VSEL 4700 // instruction (effectively negating the condition). 4701 // 4702 // This also has the effect of swapping which one of 'less' or 'greater' 4703 // returns true, so we also swap the compare operands. 
It also switches 4704 // whether we return true for 'equality', so we compensate by picking the 4705 // opposite condition code to our original choice. 4706 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4707 CC == ISD::SETUGT) { 4708 swpCmpOps = !swpCmpOps; 4709 swpVselOps = !swpVselOps; 4710 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4711 } 4712 4713 // 'ordered' is 'anything but unordered', so use the VS condition code and 4714 // swap the VSEL operands. 4715 if (CC == ISD::SETO) { 4716 CondCode = ARMCC::VS; 4717 swpVselOps = true; 4718 } 4719 4720 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4721 // code and swap the VSEL operands. Also do this if we don't care about the 4722 // unordered case. 4723 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4724 CondCode = ARMCC::EQ; 4725 swpVselOps = true; 4726 } 4727 } 4728 4729 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4730 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4731 SDValue Cmp, SelectionDAG &DAG) const { 4732 if (!Subtarget->hasFP64() && VT == MVT::f64) { 4733 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4734 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4735 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4736 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4737 4738 SDValue TrueLow = TrueVal.getValue(0); 4739 SDValue TrueHigh = TrueVal.getValue(1); 4740 SDValue FalseLow = FalseVal.getValue(0); 4741 SDValue FalseHigh = FalseVal.getValue(1); 4742 4743 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4744 ARMcc, CCR, Cmp); 4745 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4746 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4747 4748 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4749 } else { 4750 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4751 Cmp); 4752 } 4753 } 4754 4755 static bool isGTorGE(ISD::CondCode CC) { 4756 return CC == 
ISD::SETGT || CC == ISD::SETGE; 4757 } 4758 4759 static bool isLTorLE(ISD::CondCode CC) { 4760 return CC == ISD::SETLT || CC == ISD::SETLE; 4761 } 4762 4763 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4764 // All of these conditions (and their <= and >= counterparts) will do: 4765 // x < k ? k : x 4766 // x > k ? x : k 4767 // k < x ? x : k 4768 // k > x ? k : x 4769 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4770 const SDValue TrueVal, const SDValue FalseVal, 4771 const ISD::CondCode CC, const SDValue K) { 4772 return (isGTorGE(CC) && 4773 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4774 (isLTorLE(CC) && 4775 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4776 } 4777 4778 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 4779 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4780 const SDValue TrueVal, const SDValue FalseVal, 4781 const ISD::CondCode CC, const SDValue K) { 4782 return (isGTorGE(CC) && 4783 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4784 (isLTorLE(CC) && 4785 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4786 } 4787 4788 // Check if two chained conditionals could be converted into SSAT or USAT. 4789 // 4790 // SSAT can replace a set of two conditional selectors that bound a number to an 4791 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4792 // 4793 // x < -k ? -k : (x > k ? k : x) 4794 // x < -k ? -k : (x < k ? x : k) 4795 // x > -k ? (x > k ? k : x) : -k 4796 // x < k ? (x < -k ? -k : x) : k 4797 // etc. 4798 // 4799 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is 4800 // a power of 2. 4801 // 4802 // It returns true if the conversion can be done, false otherwise. 
4803 // Additionally, the variable is returned in parameter V, the constant in K and 4804 // usat is set to true if the conditional represents an unsigned saturation 4805 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4806 uint64_t &K, bool &usat) { 4807 SDValue LHS1 = Op.getOperand(0); 4808 SDValue RHS1 = Op.getOperand(1); 4809 SDValue TrueVal1 = Op.getOperand(2); 4810 SDValue FalseVal1 = Op.getOperand(3); 4811 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4812 4813 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4814 if (Op2.getOpcode() != ISD::SELECT_CC) 4815 return false; 4816 4817 SDValue LHS2 = Op2.getOperand(0); 4818 SDValue RHS2 = Op2.getOperand(1); 4819 SDValue TrueVal2 = Op2.getOperand(2); 4820 SDValue FalseVal2 = Op2.getOperand(3); 4821 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4822 4823 // Find out which are the constants and which are the variables 4824 // in each conditional 4825 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4826 ? &RHS1 4827 : nullptr; 4828 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4829 ? &RHS2 4830 : nullptr; 4831 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4832 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4833 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4834 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4835 4836 // We must detect cases where the original operations worked with 16- or 4837 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 4838 // must work with sign-extended values but the select operations return 4839 // the original non-extended value. 
4840 SDValue V2TmpReg = V2Tmp; 4841 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4842 V2TmpReg = V2Tmp->getOperand(0); 4843 4844 // Check that the registers and the constants have the correct values 4845 // in both conditionals 4846 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4847 V2TmpReg != V2) 4848 return false; 4849 4850 // Figure out which conditional is saturating the lower/upper bound. 4851 const SDValue *LowerCheckOp = 4852 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4853 ? &Op 4854 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4855 ? &Op2 4856 : nullptr; 4857 const SDValue *UpperCheckOp = 4858 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4859 ? &Op 4860 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4861 ? &Op2 4862 : nullptr; 4863 4864 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4865 return false; 4866 4867 // Check that the constant in the lower-bound check is 4868 // the opposite of the constant in the upper-bound check 4869 // in 1's complement. 4870 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4871 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4872 int64_t PosVal = std::max(Val1, Val2); 4873 int64_t NegVal = std::min(Val1, Val2); 4874 4875 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4876 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4877 isPowerOf2_64(PosVal + 1)) { 4878 4879 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4880 if (Val1 == ~Val2) 4881 usat = false; 4882 else if (NegVal == 0) 4883 usat = true; 4884 else 4885 return false; 4886 4887 V = V2; 4888 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4889 4890 return true; 4891 } 4892 4893 return false; 4894 } 4895 4896 // Check if a condition of the type x < k ? k : x can be converted into a 4897 // bit operation instead of conditional moves. 
4898 // Currently this is allowed given: 4899 // - The conditions and values match up 4900 // - k is 0 or -1 (all ones) 4901 // This function will not check the last condition, thats up to the caller 4902 // It returns true if the transformation can be made, and in such case 4903 // returns x in V, and k in SatK. 4904 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 4905 SDValue &SatK) 4906 { 4907 SDValue LHS = Op.getOperand(0); 4908 SDValue RHS = Op.getOperand(1); 4909 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4910 SDValue TrueVal = Op.getOperand(2); 4911 SDValue FalseVal = Op.getOperand(3); 4912 4913 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 4914 ? &RHS 4915 : nullptr; 4916 4917 // No constant operation in comparison, early out 4918 if (!K) 4919 return false; 4920 4921 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 4922 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 4923 SDValue VTmp = (K && *K == LHS) ? 
RHS : LHS; 4924 4925 // If the constant on left and right side, or variable on left and right, 4926 // does not match, early out 4927 if (*K != KTmp || V != VTmp) 4928 return false; 4929 4930 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 4931 SatK = *K; 4932 return true; 4933 } 4934 4935 return false; 4936 } 4937 4938 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { 4939 if (VT == MVT::f32) 4940 return !Subtarget->hasVFP2Base(); 4941 if (VT == MVT::f64) 4942 return !Subtarget->hasFP64(); 4943 if (VT == MVT::f16) 4944 return !Subtarget->hasFullFP16(); 4945 return false; 4946 } 4947 4948 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4949 EVT VT = Op.getValueType(); 4950 SDLoc dl(Op); 4951 4952 // Try to convert two saturating conditional selects into a single SSAT 4953 SDValue SatValue; 4954 uint64_t SatConstant; 4955 bool SatUSat; 4956 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4957 isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { 4958 if (SatUSat) 4959 return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, 4960 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4961 else 4962 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4963 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4964 } 4965 4966 // Try to convert expressions of the form x < k ? k : x (and similar forms) 4967 // into more efficient bit operations, which is possible when k is 0 or -1 4968 // On ARM and Thumb-2 which have flexible operand 2 this will result in 4969 // single instructions. On Thumb the shift and the bit operation will be two 4970 // instructions. 
4971 // Only allow this transformation on full-width (32-bit) operations 4972 SDValue LowerSatConstant; 4973 if (VT == MVT::i32 && 4974 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 4975 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 4976 DAG.getConstant(31, dl, VT)); 4977 if (isNullConstant(LowerSatConstant)) { 4978 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 4979 DAG.getAllOnesConstant(dl, VT)); 4980 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 4981 } else if (isAllOnesConstant(LowerSatConstant)) 4982 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 4983 } 4984 4985 SDValue LHS = Op.getOperand(0); 4986 SDValue RHS = Op.getOperand(1); 4987 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4988 SDValue TrueVal = Op.getOperand(2); 4989 SDValue FalseVal = Op.getOperand(3); 4990 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal); 4991 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal); 4992 4993 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal && 4994 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) { 4995 unsigned TVal = CTVal->getZExtValue(); 4996 unsigned FVal = CFVal->getZExtValue(); 4997 unsigned Opcode = 0; 4998 4999 if (TVal == ~FVal) { 5000 Opcode = ARMISD::CSINV; 5001 } else if (TVal == ~FVal + 1) { 5002 Opcode = ARMISD::CSNEG; 5003 } else if (TVal + 1 == FVal) { 5004 Opcode = ARMISD::CSINC; 5005 } else if (TVal == FVal + 1) { 5006 Opcode = ARMISD::CSINC; 5007 std::swap(TrueVal, FalseVal); 5008 std::swap(TVal, FVal); 5009 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5010 } 5011 5012 if (Opcode) { 5013 // If one of the constants is cheaper than another, materialise the 5014 // cheaper one and let the csel generate the other. 
5015 if (Opcode != ARMISD::CSINC && 5016 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) { 5017 std::swap(TrueVal, FalseVal); 5018 std::swap(TVal, FVal); 5019 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5020 } 5021 5022 // Attempt to use ZR checking TVal is 0, possibly inverting the condition 5023 // to get there. CSINC not is invertable like the other two (~(~a) == a, 5024 // -(-a) == a, but (a+1)+1 != a). 5025 if (FVal == 0 && Opcode != ARMISD::CSINC) { 5026 std::swap(TrueVal, FalseVal); 5027 std::swap(TVal, FVal); 5028 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5029 } 5030 if (TVal == 0) 5031 TrueVal = DAG.getRegister(ARM::ZR, MVT::i32); 5032 5033 // Drops F's value because we can get it by inverting/negating TVal. 5034 FalseVal = TrueVal; 5035 5036 SDValue ARMcc; 5037 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5038 EVT VT = TrueVal.getValueType(); 5039 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp); 5040 } 5041 } 5042 5043 if (isUnsupportedFloatingType(LHS.getValueType())) { 5044 DAG.getTargetLoweringInfo().softenSetCCOperands( 5045 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5046 5047 // If softenSetCCOperands only returned one value, we should compare it to 5048 // zero. 5049 if (!RHS.getNode()) { 5050 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5051 CC = ISD::SETNE; 5052 } 5053 } 5054 5055 if (LHS.getValueType() == MVT::i32) { 5056 // Try to generate VSEL on ARMv8. 5057 // The VSEL instruction can't use all the usual ARM condition 5058 // codes: it only has two bits to select the condition code, so it's 5059 // constrained to use only GE, GT, VS and EQ. 
5060 // 5061 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 5062 // swap the operands of the previous compare instruction (effectively 5063 // inverting the compare condition, swapping 'less' and 'greater') and 5064 // sometimes need to swap the operands to the VSEL (which inverts the 5065 // condition in the sense of firing whenever the previous condition didn't) 5066 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 || 5067 TrueVal.getValueType() == MVT::f32 || 5068 TrueVal.getValueType() == MVT::f64)) { 5069 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5070 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 5071 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 5072 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 5073 std::swap(TrueVal, FalseVal); 5074 } 5075 } 5076 5077 SDValue ARMcc; 5078 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5079 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5080 // Choose GE over PL, which vsel does now support 5081 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) 5082 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32); 5083 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5084 } 5085 5086 ARMCC::CondCodes CondCode, CondCode2; 5087 FPCCToARMCC(CC, CondCode, CondCode2); 5088 5089 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we 5090 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we 5091 // must use VSEL (limited condition codes), due to not having conditional f16 5092 // moves. 
5093 if (Subtarget->hasFPARMv8Base() && 5094 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 5095 (TrueVal.getValueType() == MVT::f16 || 5096 TrueVal.getValueType() == MVT::f32 || 5097 TrueVal.getValueType() == MVT::f64)) { 5098 bool swpCmpOps = false; 5099 bool swpVselOps = false; 5100 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 5101 5102 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 5103 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 5104 if (swpCmpOps) 5105 std::swap(LHS, RHS); 5106 if (swpVselOps) 5107 std::swap(TrueVal, FalseVal); 5108 } 5109 } 5110 5111 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5112 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5113 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5114 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 5115 if (CondCode2 != ARMCC::AL) { 5116 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 5117 // FIXME: Needs another CMP because flag can have but one use. 5118 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 5119 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 5120 } 5121 return Result; 5122 } 5123 5124 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 5125 /// to morph to an integer compare sequence. 5126 static bool canChangeToInt(SDValue Op, bool &SeenZero, 5127 const ARMSubtarget *Subtarget) { 5128 SDNode *N = Op.getNode(); 5129 if (!N->hasOneUse()) 5130 // Otherwise it requires moving the value from fp to integer registers. 5131 return false; 5132 if (!N->getNumValues()) 5133 return false; 5134 EVT VT = Op.getValueType(); 5135 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 5136 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 5137 // vmrs are very slow, e.g. cortex-a8. 
5138 return false; 5139 5140 if (isFloatingPointZero(Op)) { 5141 SeenZero = true; 5142 return true; 5143 } 5144 return ISD::isNormalLoad(N); 5145 } 5146 5147 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 5148 if (isFloatingPointZero(Op)) 5149 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 5150 5151 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 5152 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 5153 Ld->getPointerInfo(), Ld->getAlignment(), 5154 Ld->getMemOperand()->getFlags()); 5155 5156 llvm_unreachable("Unknown VFP cmp argument!"); 5157 } 5158 5159 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 5160 SDValue &RetVal1, SDValue &RetVal2) { 5161 SDLoc dl(Op); 5162 5163 if (isFloatingPointZero(Op)) { 5164 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 5165 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 5166 return; 5167 } 5168 5169 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 5170 SDValue Ptr = Ld->getBasePtr(); 5171 RetVal1 = 5172 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 5173 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 5174 5175 EVT PtrType = Ptr.getValueType(); 5176 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 5177 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 5178 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 5179 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 5180 Ld->getPointerInfo().getWithOffset(4), NewAlign, 5181 Ld->getMemOperand()->getFlags()); 5182 return; 5183 } 5184 5185 llvm_unreachable("Unknown VFP cmp argument!"); 5186 } 5187 5188 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 5189 /// f32 and even f64 comparisons to integer ones. 
5190 SDValue 5191 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 5192 SDValue Chain = Op.getOperand(0); 5193 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5194 SDValue LHS = Op.getOperand(2); 5195 SDValue RHS = Op.getOperand(3); 5196 SDValue Dest = Op.getOperand(4); 5197 SDLoc dl(Op); 5198 5199 bool LHSSeenZero = false; 5200 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 5201 bool RHSSeenZero = false; 5202 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 5203 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 5204 // If unsafe fp math optimization is enabled and there are no other uses of 5205 // the CMP operands, and the condition code is EQ or NE, we can optimize it 5206 // to an integer comparison. 5207 if (CC == ISD::SETOEQ) 5208 CC = ISD::SETEQ; 5209 else if (CC == ISD::SETUNE) 5210 CC = ISD::SETNE; 5211 5212 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5213 SDValue ARMcc; 5214 if (LHS.getValueType() == MVT::f32) { 5215 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5216 bitcastf32Toi32(LHS, DAG), Mask); 5217 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 5218 bitcastf32Toi32(RHS, DAG), Mask); 5219 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5220 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5221 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5222 Chain, Dest, ARMcc, CCR, Cmp); 5223 } 5224 5225 SDValue LHS1, LHS2; 5226 SDValue RHS1, RHS2; 5227 expandf64Toi32(LHS, DAG, LHS1, LHS2); 5228 expandf64Toi32(RHS, DAG, RHS1, RHS2); 5229 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 5230 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 5231 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 5232 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5233 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5234 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 5235 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 5236 } 5237 5238 return SDValue(); 
5239 } 5240 5241 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 5242 SDValue Chain = Op.getOperand(0); 5243 SDValue Cond = Op.getOperand(1); 5244 SDValue Dest = Op.getOperand(2); 5245 SDLoc dl(Op); 5246 5247 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5248 // instruction. 5249 unsigned Opc = Cond.getOpcode(); 5250 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5251 !Subtarget->isThumb1Only(); 5252 if (Cond.getResNo() == 1 && 5253 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5254 Opc == ISD::USUBO || OptimizeMul)) { 5255 // Only lower legal XALUO ops. 5256 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 5257 return SDValue(); 5258 5259 // The actual operation with overflow check. 5260 SDValue Value, OverflowCmp; 5261 SDValue ARMcc; 5262 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 5263 5264 // Reverse the condition code. 5265 ARMCC::CondCodes CondCode = 5266 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5267 CondCode = ARMCC::getOppositeCondition(CondCode); 5268 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5269 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5270 5271 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5272 OverflowCmp); 5273 } 5274 5275 return SDValue(); 5276 } 5277 5278 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 5279 SDValue Chain = Op.getOperand(0); 5280 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 5281 SDValue LHS = Op.getOperand(2); 5282 SDValue RHS = Op.getOperand(3); 5283 SDValue Dest = Op.getOperand(4); 5284 SDLoc dl(Op); 5285 5286 if (isUnsupportedFloatingType(LHS.getValueType())) { 5287 DAG.getTargetLoweringInfo().softenSetCCOperands( 5288 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS); 5289 5290 // If softenSetCCOperands only returned one value, we should compare it to 5291 // zero. 
5292 if (!RHS.getNode()) { 5293 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 5294 CC = ISD::SETNE; 5295 } 5296 } 5297 5298 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 5299 // instruction. 5300 unsigned Opc = LHS.getOpcode(); 5301 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 5302 !Subtarget->isThumb1Only(); 5303 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 5304 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 5305 Opc == ISD::USUBO || OptimizeMul) && 5306 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5307 // Only lower legal XALUO ops. 5308 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 5309 return SDValue(); 5310 5311 // The actual operation with overflow check. 5312 SDValue Value, OverflowCmp; 5313 SDValue ARMcc; 5314 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 5315 5316 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 5317 // Reverse the condition code. 
5318 ARMCC::CondCodes CondCode = 5319 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 5320 CondCode = ARMCC::getOppositeCondition(CondCode); 5321 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 5322 } 5323 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5324 5325 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 5326 OverflowCmp); 5327 } 5328 5329 if (LHS.getValueType() == MVT::i32) { 5330 SDValue ARMcc; 5331 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 5332 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5333 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 5334 Chain, Dest, ARMcc, CCR, Cmp); 5335 } 5336 5337 if (getTargetMachine().Options.UnsafeFPMath && 5338 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 5339 CC == ISD::SETNE || CC == ISD::SETUNE)) { 5340 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 5341 return Result; 5342 } 5343 5344 ARMCC::CondCodes CondCode, CondCode2; 5345 FPCCToARMCC(CC, CondCode, CondCode2); 5346 5347 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 5348 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 5349 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5350 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 5351 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 5352 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5353 if (CondCode2 != ARMCC::AL) { 5354 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 5355 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 5356 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 5357 } 5358 return Res; 5359 } 5360 5361 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 5362 SDValue Chain = Op.getOperand(0); 5363 SDValue Table = Op.getOperand(1); 5364 SDValue Index = Op.getOperand(2); 5365 SDLoc dl(Op); 5366 5367 EVT PTy = getPointerTy(DAG.getDataLayout()); 5368 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 5369 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), 
PTy); 5370 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 5371 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 5372 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 5373 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 5374 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 5375 // which does another jump to the destination. This also makes it easier 5376 // to translate it to TBB / TBH later (Thumb2 only). 5377 // FIXME: This might not work if the function is extremely large. 5378 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 5379 Addr, Op.getOperand(2), JTI); 5380 } 5381 if (isPositionIndependent() || Subtarget->isROPI()) { 5382 Addr = 5383 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 5384 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5385 Chain = Addr.getValue(1); 5386 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 5387 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5388 } else { 5389 Addr = 5390 DAG.getLoad(PTy, dl, Chain, Addr, 5391 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 5392 Chain = Addr.getValue(1); 5393 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 5394 } 5395 } 5396 5397 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 5398 EVT VT = Op.getValueType(); 5399 SDLoc dl(Op); 5400 5401 if (Op.getValueType().getVectorElementType() == MVT::i32) { 5402 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 5403 return Op; 5404 return DAG.UnrollVectorOp(Op.getNode()); 5405 } 5406 5407 const bool HasFullFP16 = 5408 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5409 5410 EVT NewTy; 5411 const EVT OpTy = Op.getOperand(0).getValueType(); 5412 if (OpTy == MVT::v4f32) 5413 NewTy = MVT::v4i32; 5414 else if (OpTy == MVT::v4f16 && HasFullFP16) 5415 NewTy = MVT::v4i16; 5416 else if (OpTy == MVT::v8f16 
&& HasFullFP16) 5417 NewTy = MVT::v8i16; 5418 else 5419 llvm_unreachable("Invalid type for custom lowering!"); 5420 5421 if (VT != MVT::v4i16 && VT != MVT::v8i16) 5422 return DAG.UnrollVectorOp(Op.getNode()); 5423 5424 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 5425 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 5426 } 5427 5428 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 5429 EVT VT = Op.getValueType(); 5430 if (VT.isVector()) 5431 return LowerVectorFP_TO_INT(Op, DAG); 5432 5433 bool IsStrict = Op->isStrictFPOpcode(); 5434 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 5435 5436 if (isUnsupportedFloatingType(SrcVal.getValueType())) { 5437 RTLIB::Libcall LC; 5438 if (Op.getOpcode() == ISD::FP_TO_SINT || 5439 Op.getOpcode() == ISD::STRICT_FP_TO_SINT) 5440 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), 5441 Op.getValueType()); 5442 else 5443 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), 5444 Op.getValueType()); 5445 SDLoc Loc(Op); 5446 MakeLibCallOptions CallOptions; 5447 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 5448 SDValue Result; 5449 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, 5450 CallOptions, Loc, Chain); 5451 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result; 5452 } 5453 5454 // FIXME: Remove this when we have strict fp instruction selection patterns 5455 if (IsStrict) { 5456 SDLoc Loc(Op); 5457 SDValue Result = 5458 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? 
ISD::FP_TO_SINT 5459 : ISD::FP_TO_UINT, 5460 Loc, Op.getValueType(), SrcVal); 5461 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 5462 } 5463 5464 return Op; 5465 } 5466 5467 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5468 EVT VT = Op.getValueType(); 5469 SDLoc dl(Op); 5470 5471 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 5472 if (VT.getVectorElementType() == MVT::f32) 5473 return Op; 5474 return DAG.UnrollVectorOp(Op.getNode()); 5475 } 5476 5477 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 5478 Op.getOperand(0).getValueType() == MVT::v8i16) && 5479 "Invalid type for custom lowering!"); 5480 5481 const bool HasFullFP16 = 5482 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 5483 5484 EVT DestVecType; 5485 if (VT == MVT::v4f32) 5486 DestVecType = MVT::v4i32; 5487 else if (VT == MVT::v4f16 && HasFullFP16) 5488 DestVecType = MVT::v4i16; 5489 else if (VT == MVT::v8f16 && HasFullFP16) 5490 DestVecType = MVT::v8i16; 5491 else 5492 return DAG.UnrollVectorOp(Op.getNode()); 5493 5494 unsigned CastOpc; 5495 unsigned Opc; 5496 switch (Op.getOpcode()) { 5497 default: llvm_unreachable("Invalid opcode!"); 5498 case ISD::SINT_TO_FP: 5499 CastOpc = ISD::SIGN_EXTEND; 5500 Opc = ISD::SINT_TO_FP; 5501 break; 5502 case ISD::UINT_TO_FP: 5503 CastOpc = ISD::ZERO_EXTEND; 5504 Opc = ISD::UINT_TO_FP; 5505 break; 5506 } 5507 5508 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 5509 return DAG.getNode(Opc, dl, VT, Op); 5510 } 5511 5512 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 5513 EVT VT = Op.getValueType(); 5514 if (VT.isVector()) 5515 return LowerVectorINT_TO_FP(Op, DAG); 5516 if (isUnsupportedFloatingType(VT)) { 5517 RTLIB::Libcall LC; 5518 if (Op.getOpcode() == ISD::SINT_TO_FP) 5519 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 5520 Op.getValueType()); 5521 else 5522 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 
5523 Op.getValueType()); 5524 MakeLibCallOptions CallOptions; 5525 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5526 CallOptions, SDLoc(Op)).first; 5527 } 5528 5529 return Op; 5530 } 5531 5532 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5533 // Implement fcopysign with a fabs and a conditional fneg. 5534 SDValue Tmp0 = Op.getOperand(0); 5535 SDValue Tmp1 = Op.getOperand(1); 5536 SDLoc dl(Op); 5537 EVT VT = Op.getValueType(); 5538 EVT SrcVT = Tmp1.getValueType(); 5539 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5540 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5541 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5542 5543 if (UseNEON) { 5544 // Use VBSL to copy the sign bit. 5545 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5546 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5547 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5548 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 5549 if (VT == MVT::f64) 5550 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5551 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5552 DAG.getConstant(32, dl, MVT::i32)); 5553 else /*if (VT == MVT::f32)*/ 5554 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5555 if (SrcVT == MVT::f32) { 5556 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5557 if (VT == MVT::f64) 5558 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5559 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5560 DAG.getConstant(32, dl, MVT::i32)); 5561 } else if (VT == MVT::f32) 5562 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5563 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5564 DAG.getConstant(32, dl, MVT::i32)); 5565 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5566 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5567 5568 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5569 dl, MVT::i32); 5570 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5571 SDValue 
MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5572 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5573 5574 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5575 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5576 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5577 if (VT == MVT::f32) { 5578 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5579 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5580 DAG.getConstant(0, dl, MVT::i32)); 5581 } else { 5582 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5583 } 5584 5585 return Res; 5586 } 5587 5588 // Bitcast operand 1 to i32. 5589 if (SrcVT == MVT::f64) 5590 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5591 Tmp1).getValue(1); 5592 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5593 5594 // Or in the signbit with integer operations. 5595 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5596 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5597 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5598 if (VT == MVT::f32) { 5599 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5600 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5601 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5602 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5603 } 5604 5605 // f64: Or the high part with signbit and then combine two parts. 
5606 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5607 Tmp0); 5608 SDValue Lo = Tmp0.getValue(0); 5609 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5610 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5611 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5612 } 5613 5614 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5615 MachineFunction &MF = DAG.getMachineFunction(); 5616 MachineFrameInfo &MFI = MF.getFrameInfo(); 5617 MFI.setReturnAddressIsTaken(true); 5618 5619 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5620 return SDValue(); 5621 5622 EVT VT = Op.getValueType(); 5623 SDLoc dl(Op); 5624 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5625 if (Depth) { 5626 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5627 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5628 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5629 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5630 MachinePointerInfo()); 5631 } 5632 5633 // Return LR, which contains the return address. Mark it an implicit live-in. 
5634 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5635 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5636 } 5637 5638 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5639 const ARMBaseRegisterInfo &ARI = 5640 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5641 MachineFunction &MF = DAG.getMachineFunction(); 5642 MachineFrameInfo &MFI = MF.getFrameInfo(); 5643 MFI.setFrameAddressIsTaken(true); 5644 5645 EVT VT = Op.getValueType(); 5646 SDLoc dl(Op); // FIXME probably not meaningful 5647 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5648 Register FrameReg = ARI.getFrameRegister(MF); 5649 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5650 while (Depth--) 5651 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5652 MachinePointerInfo()); 5653 return FrameAddr; 5654 } 5655 5656 // FIXME? Maybe this could be a TableGen attribute on some registers and 5657 // this table could be generated automatically from RegInfo. 5658 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 5659 const MachineFunction &MF) const { 5660 Register Reg = StringSwitch<unsigned>(RegName) 5661 .Case("sp", ARM::SP) 5662 .Default(0); 5663 if (Reg) 5664 return Reg; 5665 report_fatal_error(Twine("Invalid register name \"" 5666 + StringRef(RegName) + "\".")); 5667 } 5668 5669 // Result is 64 bit value so split into two 32 bit values and return as a 5670 // pair of values. 5671 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5672 SelectionDAG &DAG) { 5673 SDLoc DL(N); 5674 5675 // This function is only supposed to be called for i64 type destination. 
5676 assert(N->getValueType(0) == MVT::i64 5677 && "ExpandREAD_REGISTER called for non-i64 type result."); 5678 5679 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5680 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5681 N->getOperand(0), 5682 N->getOperand(1)); 5683 5684 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5685 Read.getValue(1))); 5686 Results.push_back(Read.getOperand(0)); 5687 } 5688 5689 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 5690 /// When \p DstVT, the destination type of \p BC, is on the vector 5691 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5692 /// it might be possible to combine them, such that everything stays on the 5693 /// vector register bank. 5694 /// \p return The node that would replace \p BT, if the combine 5695 /// is possible. 5696 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5697 SelectionDAG &DAG) { 5698 SDValue Op = BC->getOperand(0); 5699 EVT DstVT = BC->getValueType(0); 5700 5701 // The only vector instruction that can produce a scalar (remember, 5702 // since the bitcast was about to be turned into VMOVDRR, the source 5703 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5704 // Moreover, we can do this combine only if there is one use. 5705 // Finally, if the destination type is not a vector, there is not 5706 // much point on forcing everything on the vector bank. 5707 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5708 !Op.hasOneUse()) 5709 return SDValue(); 5710 5711 // If the index is not constant, we will introduce an additional 5712 // multiply that will stick. 5713 // Give up in that case. 5714 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5715 if (!Index) 5716 return SDValue(); 5717 unsigned DstNumElt = DstVT.getVectorNumElements(); 5718 5719 // Compute the new index. 
5720 const APInt &APIntIndex = Index->getAPIntValue(); 5721 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5722 NewIndex *= APIntIndex; 5723 // Check if the new constant index fits into i32. 5724 if (NewIndex.getBitWidth() > 32) 5725 return SDValue(); 5726 5727 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5728 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5729 SDLoc dl(Op); 5730 SDValue ExtractSrc = Op.getOperand(0); 5731 EVT VecVT = EVT::getVectorVT( 5732 *DAG.getContext(), DstVT.getScalarType(), 5733 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5734 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5735 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5736 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5737 } 5738 5739 /// ExpandBITCAST - If the target supports VFP, this function is called to 5740 /// expand a bit convert where either the source or destination type is i64 to 5741 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5742 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5743 /// vectors), since the legalizer won't know what to do with that. 5744 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5745 const ARMSubtarget *Subtarget) { 5746 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5747 SDLoc dl(N); 5748 SDValue Op = N->getOperand(0); 5749 5750 // This function is only supposed to be called for i64 types, either as the 5751 // source or destination of the bit convert. 5752 EVT SrcVT = Op.getValueType(); 5753 EVT DstVT = N->getValueType(0); 5754 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5755 5756 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5757 if (!HasFullFP16) 5758 return SDValue(); 5759 // SoftFP: read half-precision arguments: 5760 // 5761 // t2: i32,ch = ... 
5762 // t7: i16 = truncate t2 <~~~~ Op 5763 // t8: f16 = bitcast t7 <~~~~ N 5764 // 5765 if (Op.getOperand(0).getValueType() == MVT::i32) 5766 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5767 MVT::f16, Op.getOperand(0)); 5768 5769 return SDValue(); 5770 } 5771 5772 // Half-precision return values 5773 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5774 if (!HasFullFP16) 5775 return SDValue(); 5776 // 5777 // t11: f16 = fadd t8, t10 5778 // t12: i16 = bitcast t11 <~~~ SDNode N 5779 // t13: i32 = zero_extend t12 5780 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5781 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5782 // 5783 // transform this into: 5784 // 5785 // t20: i32 = ARMISD::VMOVrh t11 5786 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5787 // 5788 auto ZeroExtend = N->use_begin(); 5789 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5790 ZeroExtend->getValueType(0) != MVT::i32) 5791 return SDValue(); 5792 5793 auto Copy = ZeroExtend->use_begin(); 5794 if (Copy->getOpcode() == ISD::CopyToReg && 5795 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5796 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5797 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5798 return Cvt; 5799 } 5800 return SDValue(); 5801 } 5802 5803 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5804 return SDValue(); 5805 5806 // Turn i64->f64 into VMOVDRR. 5807 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5808 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5809 // if we can combine the bitcast with its source. 
5810 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5811 return Val; 5812 5813 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5814 DAG.getConstant(0, dl, MVT::i32)); 5815 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5816 DAG.getConstant(1, dl, MVT::i32)); 5817 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5818 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5819 } 5820 5821 // Turn f64->i64 into VMOVRRD. 5822 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5823 SDValue Cvt; 5824 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5825 SrcVT.getVectorNumElements() > 1) 5826 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5827 DAG.getVTList(MVT::i32, MVT::i32), 5828 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5829 else 5830 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5831 DAG.getVTList(MVT::i32, MVT::i32), Op); 5832 // Merge the pieces into a single i64 value. 5833 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5834 } 5835 5836 return SDValue(); 5837 } 5838 5839 /// getZeroVector - Returns a vector of specified type with all zero elements. 5840 /// Zero vectors are used to represent vector negation and in those cases 5841 /// will be implemented with the NEON VNEG instruction. However, VNEG does 5842 /// not support i64 elements, so sometimes the zero vectors will need to be 5843 /// explicitly constructed. Regardless, use a canonical VMOV to create the 5844 /// zero vector. 5845 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5846 assert(VT.isVector() && "Expected a vector type"); 5847 // The canonical modified immediate encoding of a zero vector is....0! 5848 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5849 EVT VmovVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; 5850 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 5851 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5852 } 5853 5854 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5855 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5856 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 5857 SelectionDAG &DAG) const { 5858 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5859 EVT VT = Op.getValueType(); 5860 unsigned VTBits = VT.getSizeInBits(); 5861 SDLoc dl(Op); 5862 SDValue ShOpLo = Op.getOperand(0); 5863 SDValue ShOpHi = Op.getOperand(1); 5864 SDValue ShAmt = Op.getOperand(2); 5865 SDValue ARMcc; 5866 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5867 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5868 5869 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5870 5871 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5872 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5873 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5874 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5875 DAG.getConstant(VTBits, dl, MVT::i32)); 5876 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5877 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5878 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5879 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5880 ISD::SETGE, ARMcc, DAG, dl); 5881 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 5882 ARMcc, CCR, CmpLo); 5883 5884 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5885 SDValue HiBigShift = Opc == ISD::SRA 5886 ? 
DAG.getNode(Opc, dl, VT, ShOpHi, 5887 DAG.getConstant(VTBits - 1, dl, VT)) 5888 : DAG.getConstant(0, dl, VT); 5889 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5890 ISD::SETGE, ARMcc, DAG, dl); 5891 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5892 ARMcc, CCR, CmpHi); 5893 5894 SDValue Ops[2] = { Lo, Hi }; 5895 return DAG.getMergeValues(Ops, dl); 5896 } 5897 5898 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5899 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5900 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 5901 SelectionDAG &DAG) const { 5902 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5903 EVT VT = Op.getValueType(); 5904 unsigned VTBits = VT.getSizeInBits(); 5905 SDLoc dl(Op); 5906 SDValue ShOpLo = Op.getOperand(0); 5907 SDValue ShOpHi = Op.getOperand(1); 5908 SDValue ShAmt = Op.getOperand(2); 5909 SDValue ARMcc; 5910 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5911 5912 assert(Op.getOpcode() == ISD::SHL_PARTS); 5913 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5914 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5915 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5916 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 5917 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5918 5919 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5920 DAG.getConstant(VTBits, dl, MVT::i32)); 5921 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 5922 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5923 ISD::SETGE, ARMcc, DAG, dl); 5924 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5925 ARMcc, CCR, CmpHi); 5926 5927 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5928 ISD::SETGE, ARMcc, DAG, dl); 5929 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5930 SDValue Lo = 
DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 5931 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 5932 5933 SDValue Ops[2] = { Lo, Hi }; 5934 return DAG.getMergeValues(Ops, dl); 5935 } 5936 5937 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 5938 SelectionDAG &DAG) const { 5939 // The rounding mode is in bits 23:22 of the FPSCR. 5940 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 5941 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 5942 // so that the shift + and get folded into a bitfield extract. 5943 SDLoc dl(Op); 5944 SDValue Chain = Op.getOperand(0); 5945 SDValue Ops[] = {Chain, 5946 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; 5947 5948 SDValue FPSCR = 5949 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); 5950 Chain = FPSCR.getValue(1); 5951 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5952 DAG.getConstant(1U << 22, dl, MVT::i32)); 5953 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5954 DAG.getConstant(22, dl, MVT::i32)); 5955 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5956 DAG.getConstant(3, dl, MVT::i32)); 5957 return DAG.getMergeValues({And, Chain}, dl); 5958 } 5959 5960 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5961 const ARMSubtarget *ST) { 5962 SDLoc dl(N); 5963 EVT VT = N->getValueType(0); 5964 if (VT.isVector() && ST->hasNEON()) { 5965 5966 // Compute the least significant set bit: LSB = X & -X 5967 SDValue X = N->getOperand(0); 5968 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5969 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5970 5971 EVT ElemTy = VT.getVectorElementType(); 5972 5973 if (ElemTy == MVT::i8) { 5974 // Compute with: cttz(x) = ctpop(lsb - 1) 5975 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5976 DAG.getTargetConstant(1, dl, ElemTy)); 5977 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5978 return DAG.getNode(ISD::CTPOP, 
dl, VT, Bits); 5979 } 5980 5981 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5982 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5983 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5984 unsigned NumBits = ElemTy.getSizeInBits(); 5985 SDValue WidthMinus1 = 5986 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5987 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5988 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5989 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5990 } 5991 5992 // Compute with: cttz(x) = ctpop(lsb - 1) 5993 5994 // Compute LSB - 1. 5995 SDValue Bits; 5996 if (ElemTy == MVT::i64) { 5997 // Load constant 0xffff'ffff'ffff'ffff to register. 5998 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5999 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 6000 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 6001 } else { 6002 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6003 DAG.getTargetConstant(1, dl, ElemTy)); 6004 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6005 } 6006 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6007 } 6008 6009 if (!ST->hasV6T2Ops()) 6010 return SDValue(); 6011 6012 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 6013 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6014 } 6015 6016 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6017 const ARMSubtarget *ST) { 6018 EVT VT = N->getValueType(0); 6019 SDLoc DL(N); 6020 6021 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6022 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6023 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6024 "Unexpected type for custom ctpop lowering"); 6025 6026 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6027 EVT VT8Bit = VT.is64BitVector() ? 
MVT::v8i8 : MVT::v16i8; 6028 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6029 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6030 6031 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6032 unsigned EltSize = 8; 6033 unsigned NumElts = VT.is64BitVector() ? 8 : 16; 6034 while (EltSize != VT.getScalarSizeInBits()) { 6035 SmallVector<SDValue, 8> Ops; 6036 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6037 TLI.getPointerTy(DAG.getDataLayout()))); 6038 Ops.push_back(Res); 6039 6040 EltSize *= 2; 6041 NumElts /= 2; 6042 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6043 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6044 } 6045 6046 return Res; 6047 } 6048 6049 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 6050 /// operand of a vector shift operation, where all the elements of the 6051 /// build_vector must have the same constant integer value. 6052 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6053 // Ignore bit_converts. 6054 while (Op.getOpcode() == ISD::BITCAST) 6055 Op = Op.getOperand(0); 6056 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6057 APInt SplatBits, SplatUndef; 6058 unsigned SplatBitSize; 6059 bool HasAnyUndefs; 6060 if (!BVN || 6061 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6062 ElementBits) || 6063 SplatBitSize > ElementBits) 6064 return false; 6065 Cnt = SplatBits.getSExtValue(); 6066 return true; 6067 } 6068 6069 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6070 /// operand of a vector shift left operation. That value must be in the range: 6071 /// 0 <= Value < ElementBits for a left shift; or 6072 /// 0 <= Value <= ElementBits for a long left shift. 
6073 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6074 assert(VT.isVector() && "vector shift count is not a vector type"); 6075 int64_t ElementBits = VT.getScalarSizeInBits(); 6076 if (!getVShiftImm(Op, ElementBits, Cnt)) 6077 return false; 6078 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); 6079 } 6080 6081 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6082 /// operand of a vector shift right operation. For a shift opcode, the value 6083 /// is positive, but for an intrinsic the value count must be negative. The 6084 /// absolute value must be in the range: 6085 /// 1 <= |Value| <= ElementBits for a right shift; or 6086 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6087 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6088 int64_t &Cnt) { 6089 assert(VT.isVector() && "vector shift count is not a vector type"); 6090 int64_t ElementBits = VT.getScalarSizeInBits(); 6091 if (!getVShiftImm(Op, ElementBits, Cnt)) 6092 return false; 6093 if (!isIntrinsic) 6094 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6095 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6096 Cnt = -Cnt; 6097 return true; 6098 } 6099 return false; 6100 } 6101 6102 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6103 const ARMSubtarget *ST) { 6104 EVT VT = N->getValueType(0); 6105 SDLoc dl(N); 6106 int64_t Cnt; 6107 6108 if (!VT.isVector()) 6109 return SDValue(); 6110 6111 // We essentially have two forms here. Shift by an immediate and shift by a 6112 // vector register (there are also shift by a gpr, but that is just handled 6113 // with a tablegen pattern). We cannot easily match shift by an immediate in 6114 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6115 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6116 // signed or unsigned, and a negative shift indicates a shift right). 
6117 if (N->getOpcode() == ISD::SHL) { 6118 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6119 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6120 DAG.getConstant(Cnt, dl, MVT::i32)); 6121 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6122 N->getOperand(1)); 6123 } 6124 6125 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6126 "unexpected vector shift opcode"); 6127 6128 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6129 unsigned VShiftOpc = 6130 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6131 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6132 DAG.getConstant(Cnt, dl, MVT::i32)); 6133 } 6134 6135 // Other right shifts we don't have operations for (we use a shift left by a 6136 // negative number). 6137 EVT ShiftVT = N->getOperand(1).getValueType(); 6138 SDValue NegatedCount = DAG.getNode( 6139 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6140 unsigned VShiftOpc = 6141 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHLs : ARMISD::VSHLu); 6142 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6143 } 6144 6145 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6146 const ARMSubtarget *ST) { 6147 EVT VT = N->getValueType(0); 6148 SDLoc dl(N); 6149 6150 // We can get here for a node like i32 = ISD::SHL i32, i64 6151 if (VT != MVT::i64) 6152 return SDValue(); 6153 6154 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6155 N->getOpcode() == ISD::SHL) && 6156 "Unknown shift to lower!"); 6157 6158 unsigned ShOpc = N->getOpcode(); 6159 if (ST->hasMVEIntegerOps()) { 6160 SDValue ShAmt = N->getOperand(1); 6161 unsigned ShPartsOpc = ARMISD::LSLL; 6162 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6163 6164 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6165 // then do the default optimisation 6166 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6167 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6168 return SDValue(); 6169 6170 // Extract the lower 32 bits of the shift amount if it's not an i32 6171 if (ShAmt->getValueType(0) != MVT::i32) 6172 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6173 6174 if (ShOpc == ISD::SRL) { 6175 if (!Con) 6176 // There is no t2LSRLr instruction so negate and perform an lsll if the 6177 // shift amount is in a register, emulating a right shift. 
6178 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6179 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6180 else 6181 // Else generate an lsrl on the immediate shift amount 6182 ShPartsOpc = ARMISD::LSRL; 6183 } else if (ShOpc == ISD::SRA) 6184 ShPartsOpc = ARMISD::ASRL; 6185 6186 // Lower 32 bits of the destination/source 6187 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6188 DAG.getConstant(0, dl, MVT::i32)); 6189 // Upper 32 bits of the destination/source 6190 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6191 DAG.getConstant(1, dl, MVT::i32)); 6192 6193 // Generate the shift operation as computed above 6194 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6195 ShAmt); 6196 // The upper 32 bits come from the second return value of lsll 6197 Hi = SDValue(Lo.getNode(), 1); 6198 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6199 } 6200 6201 // We only lower SRA, SRL of 1 here, all others use generic lowering. 6202 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6203 return SDValue(); 6204 6205 // If we are in thumb mode, we don't have RRX. 6206 if (ST->isThumb1Only()) 6207 return SDValue(); 6208 6209 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6210 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6211 DAG.getConstant(0, dl, MVT::i32)); 6212 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6213 DAG.getConstant(1, dl, MVT::i32)); 6214 6215 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6216 // captures the result into a carry flag. 6217 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6218 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6219 6220 // The low part is an ARMISD::RRX operand, which shifts the carry in. 
6221 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6222 6223 // Merge the pieces into a single i64 value. 6224 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6225 } 6226 6227 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6228 const ARMSubtarget *ST) { 6229 bool Invert = false; 6230 bool Swap = false; 6231 unsigned Opc = ARMCC::AL; 6232 6233 SDValue Op0 = Op.getOperand(0); 6234 SDValue Op1 = Op.getOperand(1); 6235 SDValue CC = Op.getOperand(2); 6236 EVT VT = Op.getValueType(); 6237 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6238 SDLoc dl(Op); 6239 6240 EVT CmpVT; 6241 if (ST->hasNEON()) 6242 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6243 else { 6244 assert(ST->hasMVEIntegerOps() && 6245 "No hardware support for integer vector comparison!"); 6246 6247 if (Op.getValueType().getVectorElementType() != MVT::i1) 6248 return SDValue(); 6249 6250 // Make sure we expand floating point setcc to scalar if we do not have 6251 // mve.fp, so that we can handle them from there. 6252 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6253 return SDValue(); 6254 6255 CmpVT = VT; 6256 } 6257 6258 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6259 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6260 // Special-case integer 64-bit equality comparisons. They aren't legal, 6261 // but they can be lowered with a few vector instructions. 
6262 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6263 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6264 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6265 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6266 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6267 DAG.getCondCode(ISD::SETEQ)); 6268 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6269 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6270 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6271 if (SetCCOpcode == ISD::SETNE) 6272 Merged = DAG.getNOT(dl, Merged, CmpVT); 6273 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6274 return Merged; 6275 } 6276 6277 if (CmpVT.getVectorElementType() == MVT::i64) 6278 // 64-bit comparisons are not legal in general. 6279 return SDValue(); 6280 6281 if (Op1.getValueType().isFloatingPoint()) { 6282 switch (SetCCOpcode) { 6283 default: llvm_unreachable("Illegal FP comparison"); 6284 case ISD::SETUNE: 6285 case ISD::SETNE: 6286 if (ST->hasMVEFloatOps()) { 6287 Opc = ARMCC::NE; break; 6288 } else { 6289 Invert = true; LLVM_FALLTHROUGH; 6290 } 6291 case ISD::SETOEQ: 6292 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6293 case ISD::SETOLT: 6294 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6295 case ISD::SETOGT: 6296 case ISD::SETGT: Opc = ARMCC::GT; break; 6297 case ISD::SETOLE: 6298 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6299 case ISD::SETOGE: 6300 case ISD::SETGE: Opc = ARMCC::GE; break; 6301 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6302 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6303 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6304 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6305 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6306 case ISD::SETONE: { 6307 // Expand this to (OLT | OGT). 
6308 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6309 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6310 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6311 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6312 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6313 if (Invert) 6314 Result = DAG.getNOT(dl, Result, VT); 6315 return Result; 6316 } 6317 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6318 case ISD::SETO: { 6319 // Expand this to (OLT | OGE). 6320 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6321 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6322 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6323 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6324 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6325 if (Invert) 6326 Result = DAG.getNOT(dl, Result, VT); 6327 return Result; 6328 } 6329 } 6330 } else { 6331 // Integer comparisons. 6332 switch (SetCCOpcode) { 6333 default: llvm_unreachable("Illegal integer comparison"); 6334 case ISD::SETNE: 6335 if (ST->hasMVEIntegerOps()) { 6336 Opc = ARMCC::NE; break; 6337 } else { 6338 Invert = true; LLVM_FALLTHROUGH; 6339 } 6340 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6341 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6342 case ISD::SETGT: Opc = ARMCC::GT; break; 6343 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6344 case ISD::SETGE: Opc = ARMCC::GE; break; 6345 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6346 case ISD::SETUGT: Opc = ARMCC::HI; break; 6347 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6348 case ISD::SETUGE: Opc = ARMCC::HS; break; 6349 } 6350 6351 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6352 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6353 SDValue AndOp; 6354 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6355 AndOp = Op0; 6356 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6357 AndOp = Op1; 6358 6359 // Ignore bitconvert. 
    // Ignore bitconvert.
    if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
      AndOp = AndOp.getOperand(0);

    if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
      Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
      Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
      SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
      // VTST has "ne" semantics; when the original comparison was EQ (Invert
      // not set) the VTST result must be negated.
      if (!Invert)
        Result = DAG.getNOT(dl, Result, VT);
      return Result;
    }
  }

  // Commute the operands when the condition-code table above asked for it.
  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    // Zero is on the left-hand side, so mirror the condition code.
    if (Opc == ARMCC::GE)
      Opc = ARMCC::LE;
    else if (Opc == ARMCC::GT)
      Opc = ARMCC::LT;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
                         DAG.getConstant(Opc, dl, MVT::i32));
  } else {
    Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
                         DAG.getConstant(Opc, dl, MVT::i32));
  }

  Result = DAG.getSExtOrTrunc(Result, dl, VT);

  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

/// Lower an ISD::SETCCCARRY node: compute LHS - RHS - borrow with
/// ARMISD::SUBE and then select between 0 and 1 with a CMOV on the
/// resulting flags.
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  // have to invert the carry first (Carry = 1 - Carry).
  Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                      DAG.getConstant(1, DL, MVT::i32), Carry);
  // This converts the boolean value carry into the carry flag.
  Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  // Materialize the boolean result: select TVal (1) when the ARM condition
  // derived from Cond holds on the subtraction's flags, FVal (0) otherwise.
  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  // Copy the flag result of the SUBE into CPSR so the CMOV can read it.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}

/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
/// On success \p VT is set to the vector type matching the chosen encoding.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, bool is128Bits,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK. Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Walk the 8 bytes; each immediate bit selects an all-ones byte.
    uint64_t BitMask = 0xff;
    uint64_t Val = 0;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Val |= BitMask;
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        // A byte that is neither all-ones nor all-zeros is not encodable.
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    if (DAG.getDataLayout().isBigEndian())
      // swap higher and lower 32 bit word
      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isVMOVModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}

/// Lower a ConstantFP node, trying immediate forms (VMOV.f32/f64, and the
/// NEON VMOV/VMVN modified-immediate encodings) before giving up and letting
/// the default constant-pool expansion run.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  EVT VT = Op.getValueType();
  bool IsDouble = (VT == MVT::f64);
  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  const APFloat &FPVal = CFP->getValueAPF();

  // Prevent floating-point constants from using literal loads
  // when execute-only is enabled.
  if (ST->genExecuteOnly()) {
    // If we can represent the constant as an immediate, don't lower it
    if (isFPImmLegal(FPVal, VT))
      return Op;
    // Otherwise, construct as integer, and move to float register
    APInt INTVal = FPVal.bitcastToAPInt();
    SDLoc DL(CFP);
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      llvm_unreachable("Unknown floating point type!");
      break;
    case MVT::f64: {
      // Build the double from two 32-bit GPR halves via VMOVDRR.
      SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
      SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
      if (!ST->isLittle())
        std::swap(Lo, Hi);
      return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
    }
    case MVT::f32:
      return DAG.getNode(ARMISD::VMOVSR, DL, VT,
                         DAG.getConstant(INTVal, DL, MVT::i32));
    }
  }

  if (!ST->hasVFP3Base())
    return SDValue();

  // Use the default (constant pool) lowering for double constants when we have
  // an SP-only FPU
  if (IsDouble && !Subtarget->hasFP64())
    return SDValue();

  // Try splatting with a VMOV.f32...
  int ImmVal = IsDouble ?
                   ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);

  if (ImmVal != -1) {
    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
      // We have code in place to select a valid ConstantFP already, no need to
      // do any mangling.
      return Op;
    }

    // It's a float and we are trying to use NEON operations where
    // possible. Lower it to a splat followed by an extract.
    SDLoc DL(Op);
    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // The rest of our options are NEON only, make sure that's allowed before
  // proceeding..
  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
    return SDValue();

  EVT VMovVT;
  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();

  // It wouldn't really be worth bothering for doubles except for one very
  // important value, which does happen to match: 0.0. So make sure we don't do
  // anything stupid.
  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
    return SDValue();

  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32: materialize the bitwise NOT of the value.
  NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // No immediate form matched; fall back to default expansion.
  return SDValue();
}

// check if an VEXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same. On success \p Imm is set to
// the VEXT start index (the first mask element).
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

// Check if a two-source shuffle mask matches a VEXT instruction. On success
// \p Imm is the VEXT start index and \p ReverseVEXT indicates whether the two
// source operands must be swapped.
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();
  ReverseVEXT = false;

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, it may still be
    // a VEXT but the source vectors must be swapped.
    ExpectedElt += 1;
    if (ExpectedElt == NumElts * 2) {
      ExpectedElt = 0;
      ReverseVEXT = true;
    }

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  // Adjust the index value if the source operands will be swapped.
  if (ReverseVEXT)
    Imm -= NumElts;

  return true;
}

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0) continue; // ignore UNDEF indices
    // Each element must map to the mirrored position within its block.
    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
      return false;
  }

  return true;
}

static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  // range, then 0 is placed into the resulting vector. So pretty much any mask
  // of 8 elements can work here.
  return VT == MVT::v8i8 && M.size() == 8;
}

// Helper for the two-result shuffle matchers below: decide whether the
// sub-mask starting at \p Index describes the first (0) or second (1) result.
static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                               unsigned Index) {
  if (Mask.size() == Elements * 2)
    return Index / Elements;
  return Mask[Index] == 0 ? 0 : 1;
}

// Checks whether the shuffle mask represents a vector transpose (VTRN) by
// checking that pairs of elements in the shuffle mask represent the same index
// in each vector, incrementing the expected index by 2 at each step.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
// v2={e,f,g,h}
// WhichResult gives the offset for each element in the mask based on which
// of the two results it belongs to.
//
// The transpose can be represented either as:
// result1 = shufflevector v1, v2, result1_shuffle_mask
// result2 = shufflevector v1, v2, result2_shuffle_mask
// where v1/v2 and the shuffle masks have the same number of elements
// (here WhichResult (see below) indicates which result is being checked)
//
// or as:
// results = shufflevector v1, v2, shuffle_mask
// where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
// want to check the low half and high half of the shuffle mask as if it were
// the other case
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  // 64-bit elements are handled by VTRN's scalar aliases, not here.
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  // If the mask is twice as long as the input vector then we need to check the
  // upper and lower parts of the mask with a matching value for WhichResult
  // FIXME: A mask with only even values will be rejected in case the first
  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  // M[0] is used to determine WhichResult
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      // Both sources are the same vector, so both halves of each pair must
      // reference the same (first-source) index.
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
// that the mask elements are either all even and in steps of size 2 or all odd
// and in steps of size 2.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
// v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with
// respect the how results are returned.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; ++j) {
      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  // With identical sources the pattern repeats every half vector: each half
  // must be <W, W+2, W+4, ...> where W is WhichResult.
  unsigned Half = NumElts / 2;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += Half) {
      unsigned Idx = WhichResult;
      for (unsigned k = 0; k < Half; ++k) {
        int MIdx = M[i + j + k];
        if (MIdx >= 0 && (unsigned) MIdx != Idx)
          return false;
        Idx += 2;
      }
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
// that pairs of elements of the shufflemask represent the same index in each
// vector incrementing sequentially through the vectors.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
// v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with respect the how results
// are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    // Zipping interleaves from the low (WhichResult==0) or high half.
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      // Both sources are the same vector, so each pair repeats one index.
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
/// \p WhichResult receives which of the two results is selected, and
/// \p isV_UNDEF is set when the match is of the "v, undef" canonical form.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on an vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

// Check if the mask matches an MVE VMOVN insertion pattern (even lanes from
// one source interleaved with lanes of the other).
static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // If Top
  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
  //   This inserts Input2 into Input1
  // else if not Top
  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
  //   This inserts Input1 into Input2
  unsigned Offset = Top ? 0 : 1;
  for (unsigned i = 0; i < NumElts; i+=2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
      return false;
  }

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    // Thumb1 MOV/MVN immediates are limited to 8 bits.
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    // ARM/Thumb2: check for a valid shifter-operand immediate (or its NOT).
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

// Lower a BUILD_VECTOR of i1 elements into an MVE predicate value, packing
// the known constant lanes into an i32 and inserting the remaining lanes.
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");

  // Each bool occupies 16/NumElts bits of the 16-bit predicate register.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BoolMask;
  unsigned BitsPerBool;
  if (NumElts == 4) {
    BitsPerBool = 4;
    BoolMask = 0xf;
  } else if (NumElts == 8) {
    BitsPerBool = 2;
    BoolMask = 0x3;
  } else if (NumElts == 16) {
    BitsPerBool = 1;
    BoolMask = 0x1;
  } else
    return SDValue();

  // If this is a single value copied into all lanes (a splat), we can just sign
  // extend that single value
  SDValue FirstOp = Op.getOperand(0);
  if (!isa<ConstantSDNode>(FirstOp) &&
      std::all_of(std::next(Op->op_begin()), Op->op_end(),
                  [&FirstOp](SDUse &U) {
                    return U.get().isUndef() || U.get() == FirstOp;
                  })) {
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
                              MVT::i32, FirstOp,
                              DAG.getValueType(MVT::i1));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  }

  // First create base with bits set where known
  unsigned Bits32 = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (!isa<ConstantSDNode>(V) && !V.isUndef())
      continue;
    // Undef lanes default to 0; constant lanes set their predicate bits.
    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
    if (BitSet)
      Bits32 |= BoolMask << (i * BitsPerBool);
  }

  // Add in unknown nodes
  SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
                             DAG.getConstant(Bits32, dl, MVT::i32));
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (isa<ConstantSDNode>(V) || V.isUndef())
      continue;
    Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
                       DAG.getConstant(i, dl, MVT::i32));
  }

  return Base;
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerBUILD_VECTOR_i1(Op, DAG, ST);

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if ((ST->hasNEON() && SplatBitSize <= 64) ||
        (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
      // Check if an immediate VMOV works.
      // (Interior of ARMTargetLowering::LowerBUILD_VECTOR — the constant
      // splat path. SplatBits/SplatUndef/SplatBitSize are presumably filled
      // in by an isConstantSplat() query earlier in the function; confirm
      // against the enclosing code above this view.)
      //
      // Try an immediate VMOV: if the splat value can be encoded as a VMOV
      // modified-immediate, emit ARMISD::VMOVIMM in the encoding's natural
      // type and bitcast back to VT.
      EVT VmovVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                      VMOVModImm);

      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      // The bitwise complement may be encodable even when the value itself is
      // not; VMVN materializes the complement and inverts it.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isVMOVModifiedImm(
          NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
          DAG, dl, VmovVT, VT.is128BitVector(),
          ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      // getFP32Imm returns -1 when the value is not representable as a VFP
      // 8-bit encoded floating-point immediate.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;  // only operand 0 is non-undef
  bool usesOnlyOneValue = true;  // a single distinct non-undef value appears
  bool hasDominantValue = false; // some value covers > NumElts/2 lanes
  bool isConstant = true;        // every non-undef operand is a constant

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    // insert() is a no-op for already-seen values; operator[] then yields a
    // stable reference to the per-value counter.
    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  // If no single value dominated, fall back to an arbitrary non-undef value.
  if (!Value.getNode() && !ValueCounts.empty())
    Value = ValueCounts.begin()->first;

  // All lanes undef: the whole vector is undef.
  if (ValueCounts.empty())
    return DAG.getUNDEF(VT);

  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we are hitting this case.
  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getScalarSizeInBits();

  // Use VDUP for non-constant splats. For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE. We can only do this if the lane being extracted
      // is at a constant index, as the VDUP from lane instructions only have
      // constant-index forms.
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create. We will insert the element
        // such that the register coalescer will remove unnecessary copies.
        if (VT != Value->getOperand(0).getValueType()) {
          // Wrap the lane index into VT's range so the VDUPLANE index stays
          // valid for the freshly built vector.
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                           VT.getVectorNumElements();
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                       Value, DAG.getConstant(index, dl, MVT::i32)),
                          DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    // Constant splat of a float type: bitcast the lanes to the same-width
    // integer type and recurse; the integer path may find a VMOV encoding.
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT FVT = VT.getVectorElementType().getSimpleVT();
      assert(FVT == MVT::f32 || FVT == MVT::f16);
      MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    // A constant splat whose scalar is materializable in one instruction can
    // be a mov + vdup rather than a constant-pool load.
    if (usesOnlyOneValue) {
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
    // If we haven't found an efficient lowering, try splitting a 128-bit vector
    // into two 64-bit vectors; we might discover a better way to lower it.
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
    EVT ExtVT = VT.getVectorElementType();
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
    // getBuildVector may constant-fold; only recurse if it stayed a
    // BUILD_VECTOR node.
    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
    if (Lower && Upper)
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  // Last resort inside LowerBUILD_VECTOR: insert each non-undef element one
  // at a time into an undef vector.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
//
// Returns a VECTOR_SHUFFLE-based lowering of a BUILD_VECTOR whose operands
// are all EXTRACT_VECTOR_ELTs with constant indices from at most two source
// vectors, or SDValue() if no such lowering is found.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Book-keeping for a single source vector of the eventual shuffle.
  struct ShuffleSourceInfo {
    SDValue Vec;
    // Smallest and largest lane extracted from Vec by this BUILD_VECTOR.
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    // Compare against the *original* source vector; this is what lets
    // llvm::find(Sources, SourceVec) below locate an existing entry.
    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  // How many shuffle lanes one result element spans, and the total lane
  // count of the shuffle type. NumElts is re-purposed from here on.
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      // Only an exact half-width source can be handled (padded with undef).
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    // Likewise only an exact double-width source is supported.
    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed: slide a window starting at MinElt across
      // the two halves of the wide source.
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
    // Each original element now spans WindowScale shuffle lanes; rescale the
    // window accordingly.
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    // Mask indices >= NumElts select from the second shuffle operand, hence
    // the (Src - Sources.begin()) bias.
    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                            ShuffleOps[1], Mask, DAG);
  if (!Shuffle)
    return SDValue();
  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}

// Operations encoded in the perfect-shuffle table (see PerfectShuffleTable
// and GeneratePerfectShuffle below).
enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,
  OP_VDUP0,
  OP_VDUP1,
  OP_VDUP2,
  OP_VDUP3,
  OP_VEXT1,
  OP_VEXT2,
  OP_VEXT3,
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};

// Returns true if the perfect-shuffle table entry's operation (stored in
// bits [29:26]) is one MVE can implement: copies, reversals and lane
// duplications. The VEXT/VUZP/VZIP/VTRN ops are NEON-only.
static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  switch (OpNum) {
  case OP_COPY:
  case OP_VREV:
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return true;
  }
  return false;
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
7636 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7637 if (VT.getVectorNumElements() == 4 && 7638 (VT.is128BitVector() || VT.is64BitVector())) { 7639 unsigned PFIndexes[4]; 7640 for (unsigned i = 0; i != 4; ++i) { 7641 if (M[i] < 0) 7642 PFIndexes[i] = 8; 7643 else 7644 PFIndexes[i] = M[i]; 7645 } 7646 7647 // Compute the index in the perfect shuffle table. 7648 unsigned PFTableIndex = 7649 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7650 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7651 unsigned Cost = (PFEntry >> 30); 7652 7653 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7654 return true; 7655 } 7656 7657 bool ReverseVEXT, isV_UNDEF; 7658 unsigned Imm, WhichResult; 7659 7660 unsigned EltSize = VT.getScalarSizeInBits(); 7661 if (EltSize >= 32 || 7662 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7663 ShuffleVectorInst::isIdentityMask(M) || 7664 isVREVMask(M, VT, 64) || 7665 isVREVMask(M, VT, 32) || 7666 isVREVMask(M, VT, 16)) 7667 return true; 7668 else if (Subtarget->hasNEON() && 7669 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7670 isVTBLMask(M, VT) || 7671 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7672 return true; 7673 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7674 isReverseMask(M, VT)) 7675 return true; 7676 else if (Subtarget->hasMVEIntegerOps() && 7677 (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1))) 7678 return true; 7679 else 7680 return false; 7681 } 7682 7683 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7684 /// the specified operations to build the shuffle. 
// A table entry packs: cost in bits [31:30], the op in [29:26] (see
// ShuffleOpCodes), and 13-bit left/right operand IDs in [25:13] and [12:0].
// The operand IDs index back into the table, so generation recurses.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  if (OpNum == OP_COPY) {
    // The IDs are base-9 encodings of the lane lists <0,1,2,3> (LHS) and
    // <4,5,6,7> (RHS).
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    // The lane to duplicate is encoded as the offset from OP_VDUP0.
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    // Likewise, the extraction amount (1..3) is the offset from OP_VEXT1 + 1.
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    // Two-result nodes: pick result 0 (left) or 1 (right).
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}

// Lower a v8i8 shuffle to VTBL1 (one table register) when the second operand
// is undef, otherwise VTBL2 (two table registers).
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  // Materialize the shuffle mask itself as a v8i8 constant vector; VTBL
  // consumes the indices from a register.
  SmallVector<SDValue, 8> VTBLMask;
  for (ArrayRef<int>::iterator
         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));

  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
                     DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
}

// Lower a full-vector reversal of a v8i16/v16i8 as VREV64 (reverse within
// each doubleword) followed by a VEXT that swaps the two doublewords.
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                      SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue OpLHS = Op.getOperand(0);
  EVT VT = OpLHS.getValueType();

  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect an v8i16/v16i8 type");
  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
  // extract the first 8 bytes into the top double word and the last 8 bytes
  // into the bottom double word. The v8i16 case is similar.
  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
                     DAG.getConstant(ExtractNum, DL, MVT::i32));
}

// Map an MVE predicate vector type to the same-lane-count 128-bit integer
// vector type.
static EVT getVectorTyFromPredicateVector(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4i1:
    return MVT::v4i32;
  case MVT::v8i1:
    return MVT::v8i16;
  case MVT::v16i1:
    return MVT::v16i8;
  default:
    llvm_unreachable("Unexpected vector predicate type");
  }
}

// Expand an MVE predicate vector (v4i1/v8i1/v16i1) into the corresponding
// integer vector with all-ones/all-zeroes lanes (see
// getVectorTyFromPredicateVector for the type mapping).
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
                                    SelectionDAG &DAG) {
  // Converting from boolean predicates to integers involves creating a vector
  // of all ones or all zeroes and selecting the lanes based upon the real
  // predicate.
  // createVMOVModImm(0xe, ...) encodes a per-byte splat modified-immediate;
  // 0xff/0x0 give all-ones/all-zeroes v16i8 vectors.
  SDValue AllOnes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);

  SDValue AllZeroes =
      DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);

  // Get full vector type from predicate type
  EVT NewVT = getVectorTyFromPredicateVector(VT);

  SDValue RecastV1;
  // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
  // this to a v16i1. This cannot be done with an ordinary bitcast because the
  // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
  // since we know in hardware the sizes are really the same.
  if (VT != MVT::v16i1)
    RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  else
    RecastV1 = Pred;

  // Select either all ones or zeroes depending upon the real predicate bits.
  SDValue PredAsVector =
      DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);

  // Recast our new predicate-as-integer v16i8 vector into something
  // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
}

// Lower a shuffle of MVE i1 predicate vectors.
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  ArrayRef<int> ShuffleMask = SVN->getMask();

  assert(ST->hasMVEIntegerOps() &&
         "No support for vector shuffle of boolean predicates");

  SDValue V1 = Op.getOperand(0);
  SDLoc dl(Op);
  if (isReverseMask(ShuffleMask, VT)) {
    // Reversing a predicate is reversing its bits: view it as an i32,
    // bit-reverse, and shift the (16-bit) predicate back down.
    SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
    SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
    SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
                              DAG.getConstant(16, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  }

  // Until we can come up with optimised cases for every single vector
  // shuffle in existence we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to a 8-bit integer, where
  // each predicate represents a byte. Then we fall back on a normal integer
  // vector shuffle and convert the result back into a predicate vector. In
  // many cases the generated code might be even better than scalar code
  // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  // fields in a register into 8 other arbitrary 2-bit fields!
  SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
  EVT NewVT = PredAsVector.getValueType();

  // Do the shuffle!
  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
                                          DAG.getUNDEF(NewVT), ShuffleMask);

  // Now return the result of comparing the shuffled vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole register movs as
  // possible. This is useful for types smaller than 32bits, which would
  // often otherwise become a series for grp movs.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");
  int NumElts = VT.getVectorNumElements();
  int QuarterSize = NumElts / 4;
  // The four final parts of the vector, as i32's
  // Default-constructed SDValues are null, so !Parts[i] below means "this
  // quarter was not matched as a whole-lane mov".
  SDValue Parts[4];

  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
  // <u,u,u,u>), returning the vmov lane index
  auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
    // Detect which mov lane this would be from the first non-undef element.
    int MovIdx = -1;
    for (int i = 0; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0) {
        if (ShuffleMask[Start + i] % Length != i)
          return -1;
        MovIdx = ShuffleMask[Start + i] / Length;
        break;
      }
    }
    // If all items are undef, leave this for other combines
    if (MovIdx == -1)
      return -1;
    // Check the remaining values are the correct part of the same mov
    for (int i = 1; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0 &&
          (ShuffleMask[Start + i] / Length != MovIdx ||
           ShuffleMask[Start + i] % Length != i))
        return -1;
    }
    return MovIdx;
  };

  for (int Part = 0; Part < 4; ++Part) {
    // Does this part look like a mov
    int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
    if (Elt != -1) {
      // Mov lanes 0..3 come from the first shuffle operand, 4..7 from the
      // second (shuffle mask indices span both inputs).
      SDValue Input = Op->getOperand(0);
      if (Elt >= 4) {
        Input = Op->getOperand(1);
        Elt -= 4;
      }
      // View the source as v4i32 and pull out the whole 32-bit lane.
      SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
                                DAG.getConstant(Elt, dl, MVT::i32));
    }
  }

  // Nothing interesting found, just return
  if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
    return SDValue();

  // The other parts need to be built with the old shuffle vector, cast to a
  // v4i32 and extract_vector_elts
  if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
    // Blank out (set to undef) the lanes already covered by whole-register
    // movs so the residual shuffle only handles the rest.
    SmallVector<int, 16> NewShuffleMask;
    for (int Part = 0; Part < 4; ++Part)
      for (int i = 0; i < QuarterSize; i++)
        NewShuffleMask.push_back(
            Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
    SDValue NewShuffle = DAG.getVectorShuffle(
        VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
    SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);

    for (int Part = 0; Part < 4; ++Part)
      if (!Parts[Part])
        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  BitCast, DAG.getConstant(Part, dl, MVT::i32));
  }
  // Build a vector out of the various parts and bitcast it back to the original
  // type.
  SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
  return DAG.getBitcast(VT, NewVec);
}

static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  unsigned EltSize = VT.getScalarSizeInBits();

  // MVE predicate vectors take a dedicated path.
  if (ST->hasMVEIntegerOps() && EltSize == 1)
    return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
7982 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7983 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7984 } 7985 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 7986 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 7987 // reaches it). 7988 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 7989 !isa<ConstantSDNode>(V1.getOperand(0))) { 7990 bool IsScalarToVector = true; 7991 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7992 if (!V1.getOperand(i).isUndef()) { 7993 IsScalarToVector = false; 7994 break; 7995 } 7996 if (IsScalarToVector) 7997 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7998 } 7999 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 8000 DAG.getConstant(Lane, dl, MVT::i32)); 8001 } 8002 8003 bool ReverseVEXT = false; 8004 unsigned Imm = 0; 8005 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 8006 if (ReverseVEXT) 8007 std::swap(V1, V2); 8008 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 8009 DAG.getConstant(Imm, dl, MVT::i32)); 8010 } 8011 8012 if (isVREVMask(ShuffleMask, VT, 64)) 8013 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 8014 if (isVREVMask(ShuffleMask, VT, 32)) 8015 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 8016 if (isVREVMask(ShuffleMask, VT, 16)) 8017 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 8018 8019 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 8020 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 8021 DAG.getConstant(Imm, dl, MVT::i32)); 8022 } 8023 8024 // Check for Neon shuffles that modify both input vectors in place. 8025 // If both results are used, i.e., if there are two shuffles with the same 8026 // source operands and with masks corresponding to both results of one of 8027 // these operations, DAG memoization will ensure that a single node is 8028 // used for both shuffles. 
8029 unsigned WhichResult = 0; 8030 bool isV_UNDEF = false; 8031 if (ST->hasNEON()) { 8032 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8033 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 8034 if (isV_UNDEF) 8035 V2 = V1; 8036 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 8037 .getValue(WhichResult); 8038 } 8039 } 8040 if (ST->hasMVEIntegerOps()) { 8041 if (isVMOVNMask(ShuffleMask, VT, 0)) 8042 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1, 8043 DAG.getConstant(0, dl, MVT::i32)); 8044 if (isVMOVNMask(ShuffleMask, VT, 1)) 8045 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2, 8046 DAG.getConstant(1, dl, MVT::i32)); 8047 } 8048 8049 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 8050 // shuffles that produce a result larger than their operands with: 8051 // shuffle(concat(v1, undef), concat(v2, undef)) 8052 // -> 8053 // shuffle(concat(v1, v2), undef) 8054 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 8055 // 8056 // This is useful in the general case, but there are special cases where 8057 // native shuffles produce larger results: the two-result ops. 8058 // 8059 // Look through the concat when lowering them: 8060 // shuffle(concat(v1, v2), undef) 8061 // -> 8062 // concat(VZIP(v1, v2):0, :1) 8063 // 8064 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 8065 SDValue SubV1 = V1->getOperand(0); 8066 SDValue SubV2 = V1->getOperand(1); 8067 EVT SubVT = SubV1.getValueType(); 8068 8069 // We expect these to have been canonicalized to -1. 
8070 assert(llvm::all_of(ShuffleMask, [&](int i) { 8071 return i < (int)VT.getVectorNumElements(); 8072 }) && "Unexpected shuffle index into UNDEF operand!"); 8073 8074 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 8075 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 8076 if (isV_UNDEF) 8077 SubV2 = SubV1; 8078 assert((WhichResult == 0) && 8079 "In-place shuffle of concat can only have one result!"); 8080 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 8081 SubV1, SubV2); 8082 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 8083 Res.getValue(1)); 8084 } 8085 } 8086 } 8087 8088 // If the shuffle is not directly supported and it has 4 elements, use 8089 // the PerfectShuffle-generated table to synthesize it from other shuffles. 8090 unsigned NumElts = VT.getVectorNumElements(); 8091 if (NumElts == 4) { 8092 unsigned PFIndexes[4]; 8093 for (unsigned i = 0; i != 4; ++i) { 8094 if (ShuffleMask[i] < 0) 8095 PFIndexes[i] = 8; 8096 else 8097 PFIndexes[i] = ShuffleMask[i]; 8098 } 8099 8100 // Compute the index in the perfect shuffle table. 8101 unsigned PFTableIndex = 8102 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8103 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8104 unsigned Cost = (PFEntry >> 30); 8105 8106 if (Cost <= 4) { 8107 if (ST->hasNEON()) 8108 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8109 else if (isLegalMVEShuffleOp(PFEntry)) { 8110 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8111 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8112 unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; 8113 unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; 8114 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) 8115 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8116 } 8117 } 8118 } 8119 8120 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        // Pick the source vector by whether the mask index addresses V1
        // (< NumElts) or V2, and reduce the index modulo NumElts.
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
      isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (ST->hasNEON() && VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  if (ST->hasMVEIntegerOps())
    if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
      return NewOp;

  return SDValue();
}

// Insert one lane into an MVE predicate vector: cast the v*i1 predicate to an
// i32, sign-extend-in-reg the new element from i1, BFI it into the lane's bit
// range (LaneWidth bits per lane), and cast the result back to a predicate.
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerINSERT_VECTOR_ELT_i1 called without MVE!");

  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  // Each predicate lane occupies one bit per byte of the corresponding
  // integer-vector element.
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
  SDValue Ext =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
                  Op.getOperand(1), DAG.getValueType(MVT::i1));
  SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
                            DAG.getConstant(~Mask, dl, MVT::i32));
  return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
}

SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();

  if (Subtarget->hasMVEIntegerOps() &&
      Op.getValueType().getScalarSizeInBits() == 1)
    return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);

  if (getTypeAction(*DAG.getContext(), EltVT) ==
      TargetLowering::TypePromoteFloat) {
    // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
    // but the type system will try to do that if we don't intervene.
    // Reinterpret any such vector-element insertion as one with the
    // corresponding integer types.
8197 8198 SDLoc dl(Op); 8199 8200 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8201 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8202 TargetLowering::TypePromoteFloat); 8203 8204 SDValue VecIn = Op.getOperand(0); 8205 EVT VecVT = VecIn.getValueType(); 8206 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8207 VecVT.getVectorNumElements()); 8208 8209 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8210 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8211 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8212 IVecIn, IElt, Lane); 8213 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8214 } 8215 8216 return Op; 8217 } 8218 8219 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8220 const ARMSubtarget *ST) { 8221 EVT VecVT = Op.getOperand(0).getValueType(); 8222 SDLoc dl(Op); 8223 8224 assert(ST->hasMVEIntegerOps() && 8225 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8226 8227 SDValue Conv = 8228 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8229 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8230 unsigned LaneWidth = 8231 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8232 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 8233 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8234 return Shift; 8235 } 8236 8237 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8238 const ARMSubtarget *ST) { 8239 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
8240 SDValue Lane = Op.getOperand(1); 8241 if (!isa<ConstantSDNode>(Lane)) 8242 return SDValue(); 8243 8244 SDValue Vec = Op.getOperand(0); 8245 EVT VT = Vec.getValueType(); 8246 8247 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8248 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); 8249 8250 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 8251 SDLoc dl(Op); 8252 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 8253 } 8254 8255 return Op; 8256 } 8257 8258 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, 8259 const ARMSubtarget *ST) { 8260 SDValue V1 = Op.getOperand(0); 8261 SDValue V2 = Op.getOperand(1); 8262 SDLoc dl(Op); 8263 EVT VT = Op.getValueType(); 8264 EVT Op1VT = V1.getValueType(); 8265 EVT Op2VT = V2.getValueType(); 8266 unsigned NumElts = VT.getVectorNumElements(); 8267 8268 assert(Op1VT == Op2VT && "Operand types don't match!"); 8269 assert(VT.getScalarSizeInBits() == 1 && 8270 "Unexpected custom CONCAT_VECTORS lowering"); 8271 assert(ST->hasMVEIntegerOps() && 8272 "CONCAT_VECTORS lowering only supported for MVE"); 8273 8274 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8275 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); 8276 8277 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets 8278 // promoted to v8i16, etc. 8279 8280 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8281 8282 // Extract the vector elements from Op1 and Op2 one by one and truncate them 8283 // to be the right size for the destination. For example, if Op1 is v4i1 then 8284 // the promoted vector is v4i32. The result of concatentation gives a v8i1, 8285 // which when promoted is v8i16. That means each i32 element from Op1 needs 8286 // truncating to i16 and inserting in the result. 
8287 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8288 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8289 auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8290 EVT NewVT = NewV.getValueType(); 8291 EVT ConcatVT = ConVec.getValueType(); 8292 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8293 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8294 DAG.getIntPtrConstant(i, dl)); 8295 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8296 DAG.getConstant(j, dl, MVT::i32)); 8297 } 8298 return ConVec; 8299 }; 8300 unsigned j = 0; 8301 ConVec = ExractInto(NewV1, ConVec, j); 8302 ConVec = ExractInto(NewV2, ConVec, j); 8303 8304 // Now return the result of comparing the subvector with zero, 8305 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8306 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8307 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8308 } 8309 8310 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8311 const ARMSubtarget *ST) { 8312 EVT VT = Op->getValueType(0); 8313 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8314 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8315 8316 // The only time a CONCAT_VECTORS operation can have legal types is when 8317 // two 64-bit vectors are concatenated to a 128-bit vector. 
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  // Build the result as a v2f64 (each 64-bit operand becomes one f64 lane)
  // and bitcast to the requested type at the end.
  SDValue Val = DAG.getUNDEF(MVT::v2f64);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  if (!Op0.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                      DAG.getIntPtrConstant(0, dl));
  if (!Op1.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                      DAG.getIntPtrConstant(1, dl));
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}

// Extract a subvector from an MVE predicate vector by promoting to an
// integer vector, copying the requested lanes, and comparing against zero to
// regenerate a (smaller) predicate.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();

  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom EXTRACT_SUBVECTOR lowering");
  assert(ST->hasMVEIntegerOps() &&
         "EXTRACT_SUBVECTOR lowering only supported for MVE");

  SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);

  // We now have Op1 promoted to a vector of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  // Copy lanes [Index, Index + NumElts) of the promoted vector into SubVec.
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                              DAG.getIntPtrConstant(i, dl));
    SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                         DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    // Each i64 element is a (lo, hi) pair of i32s; which i32 holds the low
    // half depends on endianness.
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      // Sign-extended: the high i32 must equal the sign bits of the low i32.
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      // Zero-extended: the high i32 of each element must be zero.
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Every element must be a constant that fits in half the element width.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    return false;
  }

  return true;
}

/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}

/// isZeroExtended - Check if a node is a vector value that is zero-extended
/// or a constant BUILD_VECTOR with zero-extended elements.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}

/// Return the vector type whose total size is 64 bits for a sub-64-bit input
/// type; types already >= 64 bits are returned unchanged.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}

/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                            const EVT &OrigTy,
                                            const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a sext/zext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}

/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    // Replace the extending load with a plain load plus an explicit extend,
    // rewiring both the value and the chain uses of the original load.
    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

/// Returns true if N is an ADD/SUB of two single-use, sign-extended operands.
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

/// Returns true if N is an ADD/SUB of two single-use, zero-extended operands.
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize so the add/sub is always in N0.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  // vmull q0, d4, d6
  // vmlal q0, d5, d6
  // is faster than
  // vaddl q0, d4, d5
  // vmovl q1, d6
  // vmul q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}

// Lower a v4i8 signed division via single-precision float reciprocal
// estimate; the bias constant below is tuned for the i8 value range.
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}

// Lower a v4i16 signed division via float reciprocal estimate with one
// Newton refinement step; the bias constant below is tuned for i16 inputs.
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen to v8i16, split each operand into low (N0/N1) and high (N2/N3)
    // v4i16 halves, divide each half, then reassemble and narrow.
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}

static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Zero-extend to v8i16, split into v4i16 halves, divide each half (the
    // signed v4i16 routine is usable here since zero-extended i8 values fit
    // in the i16 range), then reassemble and saturate back to v8i8.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

// Lower ISD::ADDCARRY/SUBCARRY to ARMISD::ADDE/SUBE, converting between the
// ISD boolean carry/borrow convention and the ARM carry flag.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::SUBCARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}

SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(ArgTy, ArgTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  // Under APCS the result is returned through an sret stack slot; otherwise
  // it comes back directly in registers.
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Entry.IsSRet = true;
    Args.push_back(Entry);
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args))
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  if (!ShouldUseSRet)
    return CallResult.first;

  // Load sin (first field) and cos (second field) back from the sret slot.
  SDValue LoadSin =
      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos =
      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}

SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  const char *Name = nullptr;
  if (Signed)
    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  else
    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  ARMTargetLowering::ArgListTy Args;

  // Push the operands in reverse order (divisor first) — presumably matching
  // the __rt_* helpers' argument order; confirm against the Windows RT ABI.
  for (auto AI : {1, 0}) {
    ArgListEntry Arg;
    Arg.Node = Op.getOperand(AI);
    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Arg);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
                 ES, std::move(Args));

  return LowerCallTo(CLI).first;
}

// This is a code size optimisation: return the original SDIV node to
// DAGCombiner when we don't want to expand SDIV into a sequence of
// instructions, and an empty node otherwise which will cause the
// SDIV to be expanded in DAGCombine.
/// Decide whether a signed divide by a power of two should be kept as an
/// SDIV (returns the original node) or expanded by DAGCombine (returns an
/// empty SDValue).  Keeping the SDIV is only done as a code-size
/// optimisation when the target has hardware divide and is compiled for
/// minimum size.
SDValue
ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // TODO: Support SREM
  if (N->getOpcode() != ISD::SDIV)
    return SDValue();

  const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
  const bool MinSize = ST.hasMinSize();
  // Hardware divide availability differs between ARM and Thumb encodings.
  const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
                                      : ST.hasDivideInARMMode();

  // Don't touch vector types; rewriting this may lead to scalarizing
  // the int divs.
  if (N->getOperand(0).getValueType().isVector())
    return SDValue();

  // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
  // hwdiv support for this to be really profitable.
  if (!(MinSize && HasDivide))
    return SDValue();

  // ARM mode is a bit simpler than Thumb: we can handle large power
  // of 2 immediates with 1 mov instruction; no further checks required,
  // just return the sdiv node.
  if (!ST.isThumb())
    return SDValue(N, 0);

  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
  // and thus lose the code size benefits of a MOVS that requires only 2.
  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
  // but as it's doing exactly this, it's not worth the trouble to get TTI.
  if (Divisor.sgt(128))
    return SDValue();

  return SDValue(N, 0);
}

/// Custom lowering for i32 division on Windows: guard the divisor with a
/// WIN__DBZCHK divide-by-zero check, then call the runtime helper.
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  // Chain the libcall after the divide-by-zero check on operand 1 (divisor).
  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                               DAG.getEntryNode(), Op.getOperand(1));

  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
}

/// Build a WIN__DBZCHK node checking the denominator (operand 1) of \p N.
/// For an i64 denominator the low and high halves are ORed together first,
/// so the check fires only when the whole 64-bit value is zero.
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(1, DL, MVT::i32));
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}

/// Expand an i64 division on Windows: divide-by-zero check, runtime libcall,
/// then split the i64 result into a (lo, hi) BUILD_PAIR for the caller.
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  // Split the 64-bit libcall result into two i32 halves.
  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}

/// Custom-lower a non-extending, unindexed load of an MVE predicate type
/// (v4i1/v8i1/v16i1): load the raw bits as an integer, cast them into a
/// v16i1 predicate, and extract the requested sub-predicate.
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected a unindexed load");

  // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
  // for BE).

  SDLoc dl(Op);
  // Load exactly MemVT.getSizeInBits() bits, anyext'ed into an i32.
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  // Result is (predicate value, output chain).
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}

/// Custom-lower a volatile i64 load into a single ARMISD::LDRD (on targets
/// with v5TE ops, excluding Thumb1), so the access stays one instruction.
/// Appends {value pair, chain} to \p Results; appends nothing if the load
/// does not match.
void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  assert(LD->isUnindexed() && "Loads should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile()) {
    SDLoc dl(N);
    SDValue Result = DAG.getMemIntrinsicNode(
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    // LDRD yields two i32 results; which one is the low half depends on
    // endianness.
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  }
}

/// Custom-lower a store of an MVE predicate type (v4i1/v8i1/v16i1): widen
/// narrow predicates to v16i1 with undef lanes, cast to an i32 GPR value and
/// emit a truncating scalar store of just the predicate's bits.
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  assert(ST->isUnindexed() && "Expected a unindexed store");

  // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
  // unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(I, dl, MVT::i32)));
    // Pad the remaining lanes with undef; only the low bits are stored below.
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  // Truncating store writes only MemVT.getSizeInBits() bits of the i32.
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}

/// Custom-lower stores: volatile i64 stores become a single ARMISD::STRD
/// (v5TE, non-Thumb1), and MVE predicate stores are delegated to
/// LowerPredicateStore.  Returns an empty SDValue when no custom lowering
/// applies.
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
                          const ARMSubtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert(ST->isUnindexed() && "Stores should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && ST->isVolatile()) {
    SDNode *N = Op.getNode();
    SDLoc dl(N);

    // Split the i64 value into the two i32 halves STRD expects; order
    // depends on endianness.
    SDValue Lo = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
                              MVT::i32));
    SDValue Hi = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
                              MVT::i32));

    return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
                                   {ST->getChain(), Lo, Hi, ST->getBasePtr()},
                                   MemVT, ST->getMemOperand());
  } else if (Subtarget->hasMVEIntegerOps() &&
             ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
               MemVT == MVT::v16i1))) {
    return LowerPredicateStore(Op, DAG);
  }

  return SDValue();
}

/// Return true if \p N is an all-zeros vector: either a BUILD_VECTOR of
/// zeros or an ARMISD::VMOVIMM with a zero immediate.
static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}

/// Custom-lower a masked load.  MVE masked loads use a zero passthru; if the
/// original passthru is already zero the node is legal as-is, otherwise load
/// with a zero passthru and select the original passthru for masked-off
/// lanes.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  if (isZeroVector(PassThru))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  // Skip the select when the passthru is undef or a bitcast of a zero vector
  // — in both cases the zero-passthru load already has the right value.
  if (!PassThru.isUndef() &&
      (PassThru.getOpcode() != ISD::BITCAST ||
       !isZeroVector(PassThru->getOperand(0))))
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

/// Atomic load/store lowering: monotonic (or weaker) orderings are legal;
/// anything stronger is returned as SDValue() so it gets expanded.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();

  // Monotonic load/store is legal for all targets.
  return Op;
}

/// Expand READCYCLECOUNTER to an MRC read of the PMU cycle counter,
/// zero-extended into the i64 the node returns.
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //    mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getTargetConstant(15, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32),
                    DAG.getTargetConstant(9, DL, MVT::i32),
                    DAG.getTargetConstant(13, DL, MVT::i32),
                    DAG.getTargetConstant(0, DL, MVT::i32)
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  // High 32 bits are zero: the counter register is only 32 bits wide.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}

/// Pack an i64 value into an untyped GPRPair register via REG_SEQUENCE
/// (halves swapped for big-endian targets).
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  // Split the i64 into its two i32 halves.
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
      dl, MVT::i32);
  bool isBigEndian = DAG.getDataLayout().isBigEndian();
  if (isBigEndian)
    std::swap (VLo, VHi);
  // Assemble the pair with REG_SEQUENCE: gsub_0 gets the first register,
  // gsub_1 the second.
  SDValue RegClass =
      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}

/// Expand a 64-bit ATOMIC_CMP_SWAP into the CMP_SWAP_64 pseudo, which works
/// on GPR pairs.  The expected and new values are packed into pairs, and the
/// i64 result is re-assembled from the pseudo's subregisters.
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> & Results,
                                      SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  // Operands: pointer, expected (pair), new (pair), chain.
  SDValue Ops[] = {N->getOperand(1),
                   createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  // Carry over the memory operand so the scheduler/AA see the access.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  bool isBigEndian = DAG.getDataLayout().isBigEndian();

  // Extract the two halves back out of the pair; which subreg is the low
  // half depends on endianness.
  SDValue Lo =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  SDValue Hi =
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
  Results.push_back(SDValue(CmpSwap, 2));
}

/// Lower STRICT_FSETCC / STRICT_FSETCCS.  Unsupported float types are
/// softened to a libcall + SETCC; otherwise the comparison becomes a VFP
/// compare feeding a CMOV selecting between 0 and 1 (two compares when the
/// condition needs a second ARM condition code).
SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;

  // If we don't have instructions of this float type then soften to a libcall
  // and use SETCC instead.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
    // Softening may fold the comparison into a single boolean result; then
    // the check becomes "result != 0".
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
    SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
                                 DAG.getCondCode(CC));
    return DAG.getMergeValues({Result, Chain}, dl);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
  // in CMPFP and CMPFPE, but instead it should be made explicit by these
  // instructions using a chain instead of glue. This would also fix the problem
  // here (and also in LowerSELECT_CC) where we generate two comparisons when
  // CondCode2 != AL.
  // Select 1 when the condition holds, else 0; chain a second compare+CMOV
  // if a second ARM condition code is required.
  SDValue True = DAG.getConstant(1, dl, VT);
  SDValue False = DAG.getConstant(0, dl, VT);
  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
    Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
  }
  return DAG.getMergeValues({Result, Chain}, dl);
}

/// Main custom-lowering dispatch: routes each opcode that was marked Custom
/// to its dedicated lowering routine.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  case ISD::UREM: return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, DAG);
  case ISD::SDIV:
    // Scalar division on Windows goes through the runtime helpers.
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG, Subtarget);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG, Subtarget);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return LowerSADDSUBSAT(Op, DAG, Subtarget);
  case ISD::LOAD:
    return LowerPredicateLoad(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG, Subtarget);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    if (Subtarget->isTargetWindows())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::STRICT_FP_EXTEND:
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}

/// Expand the arm_smlald/smlaldx/smlsld/smlsldx intrinsics (which return an
/// i64 accumulator) into the corresponding ARMISD long-multiply nodes that
/// operate on i32 halves; unknown intrinsics are left untouched.
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opc = 0;
  if (IntNo == Intrinsic::arm_smlald)
    Opc = ARMISD::SMLALD;
  else if (IntNo == Intrinsic::arm_smlaldx)
    Opc = ARMISD::SMLALDX;
  else if (IntNo == Intrinsic::arm_smlsld)
    Opc = ARMISD::SMLSLD;
  else if (IntNo == Intrinsic::arm_smlsldx)
    Opc = ARMISD::SMLSLDX;
  else
    return;

  SDLoc dl(N);
  // Split the i64 accumulator (operand 3) into two i32 halves.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(1, dl, MVT::i32));

  SDValue LongMul = DAG.getNode(Opc, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                N->getOperand(1), N->getOperand(2),
                                Lo, Hi);
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                LongMul.getValue(0), LongMul.getValue(1)));
}

/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
    break;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    return ReplaceLongIntrinsic(N, Results, DAG);
  case ISD::ABS:
     lowerABS(N, Results, DAG);
     return ;
  case ISD::LOAD:
    LowerLOAD(N, Results, DAG);
    break;
  }
  if (Res.getNode())
    Results.push_back(Res);
}

//===----------------------------------------------------------------------===//
//                           ARM Scheduler Hooks
//===----------------------------------------------------------------------===//

/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported with SjLj");
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function &F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  // Emit a PC-relative constant-pool entry holding the dispatch block's
  // address; PCAdj accounts for the implicit PC offset of the encoding.
  unsigned PCLabelId = AFI->createPICLabelUId();
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, 4);

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, 4);

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    // Set the low bit because of thumb mode.
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
      .addReg(NewVReg2, RegState::Kill)
      .addImm(PCLabelId);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
        .addReg(NewVReg3, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
      .addReg(NewVReg1, RegState::Kill)
      .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
        .addReg(ARM::CPSR, RegState::Define)
        .addImm(1)
        .add(predOps(ARMCC::AL));
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3, RegState::Kill)
        .add(predOps(ARMCC::AL));
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
            .addFrameIndex(FI)
            .addImm(36); // &jbuf[1] :: pc
    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg5, RegState::Kill)
        .addImm(0)
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else {
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addImm(0)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId)
        .add(predOps(ARMCC::AL));
    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
        .addReg(NewVReg2, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  }
}

void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  // Frame slot holding the SjLj function context built in the entry block.
  int FI = MFI.getFunctionContextIndex();

  // Thumb code must use low registers; otherwise any GPR except PC will do.
  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                        : &ARM::GPRnopcRegClass;

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
       ++BB) {
    if (!BB->isEHPad()) continue;

    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
    // pad.
    for (MachineBasicBlock::iterator
           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
      if (!II->isEHLabel()) continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MF->hasCallSiteLandingPad(Sym)) continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator
             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
      break;
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  // An out-of-range call-site index branches here.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs into the function.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));

  bool IsPositionIndependent = isPositionIndependent();
  unsigned NumLPads = LPadList.size();
  if (Subtarget->isThumb2()) {
    // Load the call-site index stored at FI+4.
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
          .addReg(NewVReg1)
          .addImm(LPadList.size())
          .add(predOps(ARMCC::AL));
    } else {
      // Count doesn't fit the compare immediate; materialize it with
      // MOVW (+ MOVT when the high half is non-zero).
      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    }

    // Unsigned-higher (index out of range) => trap.
    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    // Entry address = jump-table base + index * 4.
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg1)
        .addJumpTableIndex(MJTI);
  } else if (Subtarget->isThumb()) {
    // Thumb1: tLDRspi scales its immediate by 4, so imm 1 loads FI+4.
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
        .addFrameIndex(FI)
        .addImm(1)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else {
      // Thumb1 has no movw/movt; load the count from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
          .addReg(NewVReg1)
          .addReg(VReg1)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    // Scale the index by 4 to form a byte offset into the jump table.
    Register NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg1)
        .addImm(2)
        .add(predOps(ARMCC::AL));

    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);

    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
        .addReg(NewVReg4, RegState::Kill)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg6 = NewVReg5;
    if (IsPositionIndependent) {
      // PIC jump-table entries are offsets from the table base; add it back.
      NewVReg6 = MRI->createVirtualRegister(TRC);
      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
          .addReg(ARM::CPSR, RegState::Define)
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg3)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
        .addJumpTableIndex(MJTI);
  } else {
    // ARM mode.
    Register NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    } else {
      // No movw/movt available; load the count from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      Register VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg1, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    // Entry address = jump-table base + index * 4 (index shifted first).
    Register NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    Register NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
    Register NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg4)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    if (IsPositionIndependent) {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg4)
          .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
          .addReg(NewVReg5, RegState::Kill)
          .addJumpTableIndex(MJTI);
    }
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  for (std::vector<MachineBasicBlock*>::iterator
         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock*, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {

    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
                                                  BB->succ_end());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      // Record every register operand already on the call so we don't add a
      // duplicate implicit def below.
      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        // Skip callee-saved registers outside the register classes usable in
        // the current ISA mode.
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

// Return the successor of MBB other than Succ. MBB must have exactly two
// successors; otherwise this is unreachable.
static
MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
       E = MBB->succ_end(); I != E; ++I)
    if (*I != Succ)
      return *I;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned. Returns 0 for an unsupported size.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
10139 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10140 const TargetInstrInfo *TII, const DebugLoc &dl, 10141 unsigned LdSize, unsigned Data, unsigned AddrIn, 10142 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10143 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10144 assert(LdOpc != 0 && "Should have a load opcode"); 10145 if (LdSize >= 8) { 10146 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10147 .addReg(AddrOut, RegState::Define) 10148 .addReg(AddrIn) 10149 .addImm(0) 10150 .add(predOps(ARMCC::AL)); 10151 } else if (IsThumb1) { 10152 // load + update AddrIn 10153 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10154 .addReg(AddrIn) 10155 .addImm(0) 10156 .add(predOps(ARMCC::AL)); 10157 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10158 .add(t1CondCodeOp()) 10159 .addReg(AddrIn) 10160 .addImm(LdSize) 10161 .add(predOps(ARMCC::AL)); 10162 } else if (IsThumb2) { 10163 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10164 .addReg(AddrOut, RegState::Define) 10165 .addReg(AddrIn) 10166 .addImm(LdSize) 10167 .add(predOps(ARMCC::AL)); 10168 } else { // arm 10169 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10170 .addReg(AddrOut, RegState::Define) 10171 .addReg(AddrIn) 10172 .addReg(0) 10173 .addImm(LdSize) 10174 .add(predOps(ARMCC::AL)); 10175 } 10176 } 10177 10178 /// Emit a post-increment store operation with given size. The instructions 10179 /// will be added to BB at Pos. 
10180 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10181 const TargetInstrInfo *TII, const DebugLoc &dl, 10182 unsigned StSize, unsigned Data, unsigned AddrIn, 10183 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10184 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10185 assert(StOpc != 0 && "Should have a store opcode"); 10186 if (StSize >= 8) { 10187 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10188 .addReg(AddrIn) 10189 .addImm(0) 10190 .addReg(Data) 10191 .add(predOps(ARMCC::AL)); 10192 } else if (IsThumb1) { 10193 // store + update AddrIn 10194 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10195 .addReg(Data) 10196 .addReg(AddrIn) 10197 .addImm(0) 10198 .add(predOps(ARMCC::AL)); 10199 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10200 .add(t1CondCodeOp()) 10201 .addReg(AddrIn) 10202 .addImm(StSize) 10203 .add(predOps(ARMCC::AL)); 10204 } else if (IsThumb2) { 10205 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10206 .addReg(Data) 10207 .addReg(AddrIn) 10208 .addImm(StSize) 10209 .add(predOps(ARMCC::AL)); 10210 } else { // arm 10211 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10212 .addReg(Data) 10213 .addReg(AddrIn) 10214 .addReg(0) 10215 .addImm(StSize) 10216 .add(predOps(ARMCC::AL)); 10217 } 10218 } 10219 10220 MachineBasicBlock * 10221 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10222 MachineBasicBlock *BB) const { 10223 // This pseudo instruction has 3 operands: dst, src, size 10224 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10225 // Otherwise, we will generate unrolled scalar copies. 
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Align = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  // Pick the widest copy unit the alignment allows.
  if (Align & 1) {
    UnitSize = 1;
  } else if (Align & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Align % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Align % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Small copy: fully unroll.
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI.eraseFromParent(); // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  // ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd.
  Register varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt()) {
    // Materialize the trip count with MOVW (+ MOVT if the high half is set).
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
        .addImm(LoopSize & 0xFFFF)
        .add(predOps(ARMCC::AL));

    if ((LoopSize & 0xFFFF0000) != 0)
      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
          .addReg(Vtmp)
          .addImm(LoopSize >> 16)
          .add(predOps(ARMCC::AL));
  } else {
    // No movw/movt; load the trip count from the constant pool.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
    if (Align == 0)
      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
    MachineMemOperand *CPMMO =
        MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                 MachineMemOperand::MOLoad, 4, 4);

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL))
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    // Force the optional CC operand to actually define CPSR so the
    // following conditional branch sees the flags.
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
10464 return BB; 10465 } 10466 10467 MachineBasicBlock * 10468 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, 10469 MachineBasicBlock *MBB) const { 10470 const TargetMachine &TM = getTargetMachine(); 10471 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 10472 DebugLoc DL = MI.getDebugLoc(); 10473 10474 assert(Subtarget->isTargetWindows() && 10475 "__chkstk is only supported on Windows"); 10476 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); 10477 10478 // __chkstk takes the number of words to allocate on the stack in R4, and 10479 // returns the stack adjustment in number of bytes in R4. This will not 10480 // clober any other registers (other than the obvious lr). 10481 // 10482 // Although, technically, IP should be considered a register which may be 10483 // clobbered, the call itself will not touch it. Windows on ARM is a pure 10484 // thumb-2 environment, so there is no interworking required. As a result, we 10485 // do not expect a veneer to be emitted by the linker, clobbering IP. 10486 // 10487 // Each module receives its own copy of __chkstk, so no import thunk is 10488 // required, again, ensuring that IP is not clobbered. 10489 // 10490 // Finally, although some linkers may theoretically provide a trampoline for 10491 // out of range calls (which is quite common due to a 32M range limitation of 10492 // branches for Thumb), we can generate the long-call version via 10493 // -mcmodel=large, alleviating the need for the trampoline which may clobber 10494 // IP. 

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Direct BL to __chkstk; R4 carries the argument and receives the result.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__chkstk")
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large: {
    // Large code model: materialize the address and call through a register.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
        .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
        .add(predOps(ARMCC::AL))
        .addReg(Reg, RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  // SP -= R4, the byte adjustment returned by __chkstk.
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}

MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  // Split the block: everything after the check continues in ContBB.
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);

  // A zero divisor branches to a __brkdiv0 trap block.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  // Compare the divisor operand against zero and branch to the trap on EQ.
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}

// The CPSR operand of SelectItr might be missing a kill marker
// because there were multiple uses of CPSR, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
                                   MachineBasicBlock* BB,
                                   const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of CPSR.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(ARM::CPSR))
      return false;
    if (mi.definesRegister(ARM::CPSR))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether CPSR is live into a
  // successor.
10594 if (miI == BB->end()) { 10595 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 10596 sEnd = BB->succ_end(); 10597 sItr != sEnd; ++sItr) { 10598 MachineBasicBlock* succ = *sItr; 10599 if (succ->isLiveIn(ARM::CPSR)) 10600 return false; 10601 } 10602 } 10603 10604 // We found a def, or hit the end of the basic block and CPSR wasn't live 10605 // out. SelectMI should have a kill flag on CPSR. 10606 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 10607 return true; 10608 } 10609 10610 MachineBasicBlock * 10611 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10612 MachineBasicBlock *BB) const { 10613 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10614 DebugLoc dl = MI.getDebugLoc(); 10615 bool isThumb2 = Subtarget->isThumb2(); 10616 switch (MI.getOpcode()) { 10617 default: { 10618 MI.print(errs()); 10619 llvm_unreachable("Unexpected instr type to insert"); 10620 } 10621 10622 // Thumb1 post-indexed loads are really just single-register LDMs. 10623 case ARM::tLDR_postidx: { 10624 MachineOperand Def(MI.getOperand(1)); 10625 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 10626 .add(Def) // Rn_wb 10627 .add(MI.getOperand(2)) // Rn 10628 .add(MI.getOperand(3)) // PredImm 10629 .add(MI.getOperand(4)) // PredReg 10630 .add(MI.getOperand(0)) // Rt 10631 .cloneMemRefs(MI); 10632 MI.eraseFromParent(); 10633 return BB; 10634 } 10635 10636 // The Thumb2 pre-indexed stores have the same MI operands, they just 10637 // define them differently in the .td files from the isel patterns, so 10638 // they need pseudos. 10639 case ARM::t2STR_preidx: 10640 MI.setDesc(TII->get(ARM::t2STR_PRE)); 10641 return BB; 10642 case ARM::t2STRB_preidx: 10643 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 10644 return BB; 10645 case ARM::t2STRH_preidx: 10646 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 10647 return BB; 10648 10649 case ARM::STRi_preidx: 10650 case ARM::STRBi_preidx: { 10651 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? 
ARM::STR_PRE_IMM 10652 : ARM::STRB_PRE_IMM; 10653 // Decode the offset. 10654 unsigned Offset = MI.getOperand(4).getImm(); 10655 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 10656 Offset = ARM_AM::getAM2Offset(Offset); 10657 if (isSub) 10658 Offset = -Offset; 10659 10660 MachineMemOperand *MMO = *MI.memoperands_begin(); 10661 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 10662 .add(MI.getOperand(0)) // Rn_wb 10663 .add(MI.getOperand(1)) // Rt 10664 .add(MI.getOperand(2)) // Rn 10665 .addImm(Offset) // offset (skip GPR==zero_reg) 10666 .add(MI.getOperand(5)) // pred 10667 .add(MI.getOperand(6)) 10668 .addMemOperand(MMO); 10669 MI.eraseFromParent(); 10670 return BB; 10671 } 10672 case ARM::STRr_preidx: 10673 case ARM::STRBr_preidx: 10674 case ARM::STRH_preidx: { 10675 unsigned NewOpc; 10676 switch (MI.getOpcode()) { 10677 default: llvm_unreachable("unexpected opcode!"); 10678 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 10679 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 10680 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 10681 } 10682 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 10683 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 10684 MIB.add(MI.getOperand(i)); 10685 MI.eraseFromParent(); 10686 return BB; 10687 } 10688 10689 case ARM::tMOVCCr_pseudo: { 10690 // To "insert" a SELECT_CC instruction, we actually have to insert the 10691 // diamond control-flow pattern. The incoming instruction knows the 10692 // destination vreg to set, the condition code register to branch on, the 10693 // true/false values to select between, and a branch opcode to use. 10694 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10695 MachineFunction::iterator It = ++BB->getIterator(); 10696 10697 // thisMBB: 10698 // ... 10699 // TrueVal = ... 
10700 // cmpTY ccX, r1, r2 10701 // bCC copy1MBB 10702 // fallthrough --> copy0MBB 10703 MachineBasicBlock *thisMBB = BB; 10704 MachineFunction *F = BB->getParent(); 10705 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10706 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10707 F->insert(It, copy0MBB); 10708 F->insert(It, sinkMBB); 10709 10710 // Check whether CPSR is live past the tMOVCCr_pseudo. 10711 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 10712 if (!MI.killsRegister(ARM::CPSR) && 10713 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 10714 copy0MBB->addLiveIn(ARM::CPSR); 10715 sinkMBB->addLiveIn(ARM::CPSR); 10716 } 10717 10718 // Transfer the remainder of BB and its successor edges to sinkMBB. 10719 sinkMBB->splice(sinkMBB->begin(), BB, 10720 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10721 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10722 10723 BB->addSuccessor(copy0MBB); 10724 BB->addSuccessor(sinkMBB); 10725 10726 BuildMI(BB, dl, TII->get(ARM::tBcc)) 10727 .addMBB(sinkMBB) 10728 .addImm(MI.getOperand(3).getImm()) 10729 .addReg(MI.getOperand(4).getReg()); 10730 10731 // copy0MBB: 10732 // %FalseValue = ... 10733 // # fallthrough to sinkMBB 10734 BB = copy0MBB; 10735 10736 // Update machine-CFG edges 10737 BB->addSuccessor(sinkMBB); 10738 10739 // sinkMBB: 10740 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10741 // ... 10742 BB = sinkMBB; 10743 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 10744 .addReg(MI.getOperand(1).getReg()) 10745 .addMBB(copy0MBB) 10746 .addReg(MI.getOperand(2).getReg()) 10747 .addMBB(thisMBB); 10748 10749 MI.eraseFromParent(); // The pseudo instruction is gone now. 10750 return BB; 10751 } 10752 10753 case ARM::BCCi64: 10754 case ARM::BCCZi64: { 10755 // If there is an unconditional branch to the other successor, remove it. 
10756 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10757 10758 // Compare both parts that make up the double comparison separately for 10759 // equality. 10760 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 10761 10762 Register LHS1 = MI.getOperand(1).getReg(); 10763 Register LHS2 = MI.getOperand(2).getReg(); 10764 if (RHSisZero) { 10765 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10766 .addReg(LHS1) 10767 .addImm(0) 10768 .add(predOps(ARMCC::AL)); 10769 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10770 .addReg(LHS2).addImm(0) 10771 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10772 } else { 10773 Register RHS1 = MI.getOperand(3).getReg(); 10774 Register RHS2 = MI.getOperand(4).getReg(); 10775 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10776 .addReg(LHS1) 10777 .addReg(RHS1) 10778 .add(predOps(ARMCC::AL)); 10779 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 10780 .addReg(LHS2).addReg(RHS2) 10781 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 10782 } 10783 10784 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 10785 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 10786 if (MI.getOperand(0).getImm() == ARMCC::NE) 10787 std::swap(destMBB, exitMBB); 10788 10789 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10790 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 10791 if (isThumb2) 10792 BuildMI(BB, dl, TII->get(ARM::t2B)) 10793 .addMBB(exitMBB) 10794 .add(predOps(ARMCC::AL)); 10795 else 10796 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 10797 10798 MI.eraseFromParent(); // The pseudo instruction is gone now. 
10799 return BB; 10800 } 10801 10802 case ARM::Int_eh_sjlj_setjmp: 10803 case ARM::Int_eh_sjlj_setjmp_nofp: 10804 case ARM::tInt_eh_sjlj_setjmp: 10805 case ARM::t2Int_eh_sjlj_setjmp: 10806 case ARM::t2Int_eh_sjlj_setjmp_nofp: 10807 return BB; 10808 10809 case ARM::Int_eh_sjlj_setup_dispatch: 10810 EmitSjLjDispatchBlock(MI, BB); 10811 return BB; 10812 10813 case ARM::ABS: 10814 case ARM::t2ABS: { 10815 // To insert an ABS instruction, we have to insert the 10816 // diamond control-flow pattern. The incoming instruction knows the 10817 // source vreg to test against 0, the destination vreg to set, 10818 // the condition code register to branch on, the 10819 // true/false values to select between, and a branch opcode to use. 10820 // It transforms 10821 // V1 = ABS V0 10822 // into 10823 // V2 = MOVS V0 10824 // BCC (branch to SinkBB if V0 >= 0) 10825 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 10826 // SinkBB: V1 = PHI(V2, V3) 10827 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10828 MachineFunction::iterator BBI = ++BB->getIterator(); 10829 MachineFunction *Fn = BB->getParent(); 10830 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10831 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 10832 Fn->insert(BBI, RSBBB); 10833 Fn->insert(BBI, SinkBB); 10834 10835 Register ABSSrcReg = MI.getOperand(1).getReg(); 10836 Register ABSDstReg = MI.getOperand(0).getReg(); 10837 bool ABSSrcKIll = MI.getOperand(1).isKill(); 10838 bool isThumb2 = Subtarget->isThumb2(); 10839 MachineRegisterInfo &MRI = Fn->getRegInfo(); 10840 // In Thumb mode S must not be specified if source register is the SP or 10841 // PC and if destination register is the SP, so restrict register class 10842 Register NewRsbDstReg = MRI.createVirtualRegister( 10843 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 10844 10845 // Transfer the remainder of BB and its successor edges to sinkMBB. 
10846 SinkBB->splice(SinkBB->begin(), BB, 10847 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10848 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 10849 10850 BB->addSuccessor(RSBBB); 10851 BB->addSuccessor(SinkBB); 10852 10853 // fall through to SinkMBB 10854 RSBBB->addSuccessor(SinkBB); 10855 10856 // insert a cmp at the end of BB 10857 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 10858 .addReg(ABSSrcReg) 10859 .addImm(0) 10860 .add(predOps(ARMCC::AL)); 10861 10862 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 10863 BuildMI(BB, dl, 10864 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 10865 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 10866 10867 // insert rsbri in RSBBB 10868 // Note: BCC and rsbri will be converted into predicated rsbmi 10869 // by if-conversion pass 10870 BuildMI(*RSBBB, RSBBB->begin(), dl, 10871 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 10872 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 10873 .addImm(0) 10874 .add(predOps(ARMCC::AL)) 10875 .add(condCodeOp()); 10876 10877 // insert PHI in SinkBB, 10878 // reuse ABSDstReg to not change uses of ABS instruction 10879 BuildMI(*SinkBB, SinkBB->begin(), dl, 10880 TII->get(ARM::PHI), ABSDstReg) 10881 .addReg(NewRsbDstReg).addMBB(RSBBB) 10882 .addReg(ABSSrcReg).addMBB(BB); 10883 10884 // remove ABS instruction 10885 MI.eraseFromParent(); 10886 10887 // return last added BB 10888 return SinkBB; 10889 } 10890 case ARM::COPY_STRUCT_BYVAL_I32: 10891 ++NumLoopByVals; 10892 return EmitStructByval(MI, BB); 10893 case ARM::WIN__CHKSTK: 10894 return EmitLowered__chkstk(MI, BB); 10895 case ARM::WIN__DBZCHK: 10896 return EmitLowered__dbzchk(MI, BB); 10897 } 10898 } 10899 10900 /// Attaches vregs to MEMCPY that it will use as scratch registers 10901 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 10902 /// instead of as a custom inserter because we need the use list from the SDNode. 
// Give the MEMCPY pseudo the scratch vregs it needs for its LDM/STM
// expansion; the count of scratch registers comes from operand 4.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
                                    MachineInstr &MI, const SDNode *Node) {
  bool isThumb1 = Subtarget->isThumb1Only();

  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstrBuilder MIB(*MF, MI);

  // If the new dst/src is unused mark it as dead.
  if (!Node->hasAnyUseOfValue(0)) {
    MI.getOperand(0).setIsDead(true);
  }
  if (!Node->hasAnyUseOfValue(1)) {
    MI.getOperand(1).setIsDead(true);
  }

  // The MEMCPY both defines and kills the scratch registers.
  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
    Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
                                                         : &ARM::GPRRegClass);
    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
  }
}

/// Post-isel hook: fix up MEMCPY scratch registers and normalize the
/// optional cc_out (S-bit) operand of flag-setting ADC/SBC/RSB/RSC pseudos.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                      SDNode *Node) const {
  if (MI.getOpcode() == ARM::MEMCPY) {
    attachMEMCPYScratchRegs(Subtarget, MI, Node);
    return;
  }

  const MCInstrDesc *MCID = &MI.getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  unsigned ccOutIdx;
  if (NewOpc) {
    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
    MCID = &TII->get(NewOpc);

    assert(MCID->getNumOperands() ==
           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
           && "converted opcode should be the same except for cc_out"
           " (and, on Thumb1, pred)");

    MI.setDesc(*MCID);

    // Add the optional cc_out operand
    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));

    // On Thumb1, move all input operands to the end, then add the predicate
    if (Subtarget->isThumb1Only()) {
      // Rotate the inputs: repeatedly re-append operand 1 and drop it from
      // the front until only defs + cc_out remain before the inputs.
      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
        MI.addOperand(MI.getOperand(1));
        MI.RemoveOperand(1);
      }

      // Restore the ties
      for (unsigned i = MI.getNumOperands(); i--;) {
        const MachineOperand& op = MI.getOperand(i);
        if (op.isReg() && op.isUse()) {
          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
          if (DefIdx != -1)
            MI.tieOperands(DefIdx, i);
        }
      }

      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
      ccOutIdx = 1;
    } else
      ccOutIdx = MCID->getNumOperands() - 1;
  } else
    ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
       ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      MI.RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    assert(!MI.getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    // Thumb1 instructions must have the S bit even if the CPSR is dead.
    if (!Subtarget->isThumb1Only())
      return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI.getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}

//===----------------------------------------------------------------------===//
//                           ARM Optimization Hooks
//===----------------------------------------------------------------------===//

// Helper function that checks if N is a null or all ones constant.
static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
}

// Return true if N is conditionally 0 or all ones.
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)       [AllOnes=0]
//   (select cc y, 0)       [AllOnes=0]
//   (zext cc)              [AllOnes=0]
//   (sext cc)              [AllOnes=0/1]
//   (select cc -1, y)      [AllOnes=1]
//   (select cc y, -1)      [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
// OtherOp is set to the alternative value of N.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  switch (N->getOpcode()) {
  default: return false;
  case ISD::SELECT: {
    CC = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    if (isZeroOrAllOnes(N1, AllOnes)) {
      Invert = false;
      OtherOp = N2;
      return true;
    }
    if (isZeroOrAllOnes(N2, AllOnes)) {
      Invert = true;
      OtherOp = N1;
      return true;
    }
    return false;
  }
  case ISD::ZERO_EXTEND:
    // (zext cc) can never be the all ones value.
    if (AllOnes)
      return false;
    LLVM_FALLTHROUGH;
  case ISD::SIGN_EXTEND: {
    SDLoc dl(N);
    EVT VT = N->getValueType(0);
    CC = N->getOperand(0);
    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
      return false;
    Invert = !AllOnes;
    if (AllOnes)
      // When looking for an AllOnes constant, N is an sext, and the 'other'
      // value is 0.
      OtherOp = DAG.getConstant(0, dl, VT);
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      // When looking for a 0 constant, N can be zext or sext.
      OtherOp = DAG.getConstant(1, dl, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                                VT);
    return true;
  }
  }
}

// Combine a constant select operand into its use:
//
//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
//   (add (zext cc), x) -> (select cc (add x, 1), x)
//   (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}

// Attempt combineSelectAndUse on each operand of a commutative operator N.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}

// Returns true if N is a node that VPADD-style combines treat as a VUZP.
static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}

// Fold ADD(VUZP.0, VUZP.1) of the same unzip into a NEON vpadd intrinsic.
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1).
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// Fold ADD(ext(VUZP.0), ext(VUZP.1)) into a NEON vpaddl intrinsic, choosing
// the signed or unsigned variant from the kind of extension.
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expected both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operands to the ADD which are BUILD_VECTORs,
  // check to see if each of their operands are an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify its the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify its correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

// Returns V itself when it is a U/SMUL_LOHI node, otherwise a null SDValue.
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

// Match an ADDC/ADDE pair accumulating a 16x16 multiply into a 64-bit value
// and rewrite it as one of the SMLALBB/BT/TB/TT nodes (requires DSP).
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulates the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values
  // (addc (adde (mul 16bit, 16bit), lo), hi)
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  // The high half must be the multiply shifted down by exactly 31 bits.
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  // Select the B/T variant per operand: bottom halves are 16-bit
  // sign-extended values, top halves are >>16 (isSRA16).
  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //        loAdd ->  ADDC      |
  //                 \ :carry  /
  //                  V       V
  //                    ADDE <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
  // a constant with the exact value of 0x80000000, we recognize we are dealing
  // with a "rounded multiply and add" (or subtract) and transform it into
  // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
11466 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 11467 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 11468 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 11469 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 11470 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 11471 return SDValue(); 11472 11473 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 11474 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 11475 11476 // Check if the two operands are from the same mul_lohi node. 11477 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 11478 return SDValue(); 11479 11480 assert(AddcSubcNode->getNumValues() == 2 && 11481 AddcSubcNode->getValueType(0) == MVT::i32 && 11482 "Expect ADDC with two result values. First: i32"); 11483 11484 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 11485 // maybe a SMLAL which multiplies two 16-bit values. 11486 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 11487 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 11488 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 11489 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 11490 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 11491 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 11492 11493 // Check for the triangle shape. 11494 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 11495 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 11496 11497 // Make sure that the ADDE/SUBE operands are not coming from the same node. 11498 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 11499 return SDValue(); 11500 11501 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 11502 bool IsLeftOperandMUL = false; 11503 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 11504 if (MULOp == SDValue()) 11505 MULOp = findMUL_LOHI(AddeSubeOp1); 11506 else 11507 IsLeftOperandMUL = true; 11508 if (MULOp == SDValue()) 11509 return SDValue(); 11510 11511 // Figure out the right opcode. 
11512 unsigned Opc = MULOp->getOpcode(); 11513 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 11514 11515 // Figure out the high and low input values to the MLAL node. 11516 SDValue *HiAddSub = nullptr; 11517 SDValue *LoMul = nullptr; 11518 SDValue *LowAddSub = nullptr; 11519 11520 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 11521 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 11522 return SDValue(); 11523 11524 if (IsLeftOperandMUL) 11525 HiAddSub = &AddeSubeOp1; 11526 else 11527 HiAddSub = &AddeSubeOp0; 11528 11529 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 11530 // whose low result is fed to the ADDC/SUBC we are checking. 11531 11532 if (AddcSubcOp0 == MULOp.getValue(0)) { 11533 LoMul = &AddcSubcOp0; 11534 LowAddSub = &AddcSubcOp1; 11535 } 11536 if (AddcSubcOp1 == MULOp.getValue(0)) { 11537 LoMul = &AddcSubcOp1; 11538 LowAddSub = &AddcSubcOp0; 11539 } 11540 11541 if (!LoMul) 11542 return SDValue(); 11543 11544 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 11545 // the replacement below will create a cycle. 11546 if (AddcSubcNode == HiAddSub->getNode() || 11547 AddcSubcNode->isPredecessorOf(HiAddSub->getNode())) 11548 return SDValue(); 11549 11550 // Create the merged node. 11551 SelectionDAG &DAG = DCI.DAG; 11552 11553 // Start building operand list. 11554 SmallVector<SDValue, 8> Ops; 11555 Ops.push_back(LoMul->getOperand(0)); 11556 Ops.push_back(LoMul->getOperand(1)); 11557 11558 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be 11559 // the case, we must be doing signed multiplication and only use the higher 11560 // part of the result of the MLAL, furthermore the LowAddSub must be a constant 11561 // addition or subtraction with the value of 0x800000. 
11562 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() && 11563 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) && 11564 LowAddSub->getNode()->getOpcode() == ISD::Constant && 11565 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() == 11566 0x80000000) { 11567 Ops.push_back(*HiAddSub); 11568 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) { 11569 FinalOpc = ARMISD::SMMLSR; 11570 } else { 11571 FinalOpc = ARMISD::SMMLAR; 11572 } 11573 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops); 11574 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode); 11575 11576 return SDValue(AddeSubeNode, 0); 11577 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC) 11578 // SMMLS is generated during instruction selection and the rest of this 11579 // function can not handle the case where AddcSubcNode is a SUBC. 11580 return SDValue(); 11581 11582 // Finish building the operand list for {U/S}MLAL 11583 Ops.push_back(*LowAddSub); 11584 Ops.push_back(*HiAddSub); 11585 11586 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), 11587 DAG.getVTList(MVT::i32, MVT::i32), Ops); 11588 11589 // Replace the ADDs' nodes uses by the MLA node's values. 11590 SDValue HiMLALResult(MLALNode.getNode(), 1); 11591 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult); 11592 11593 SDValue LoMLALResult(MLALNode.getNode(), 0); 11594 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult); 11595 11596 // Return original node to notify the driver to stop replacing. 11597 return SDValue(AddeSubeNode, 0); 11598 } 11599 11600 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, 11601 TargetLowering::DAGCombinerInfo &DCI, 11602 const ARMSubtarget *Subtarget) { 11603 // UMAAL is similar to UMLAL except that it adds two unsigned values. 11604 // While trying to combine for the other MLAL nodes, first search for the 11605 // chance to use UMAAL. 
  // Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.

  // Without DSP the MLAL fallback is the only option.
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    // UMAAL operands: the UMLAL's multiplicands and accumulator, plus the
    // extra addend that fed the ADDC.
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADDs' nodes uses by the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
                                  SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
                                  SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}

/// Fold an ARMISD::UMLAL whose addend comes from a zero-extended ADDC/ADDE
/// carry pair into a single ARMISD::UMAAL node.
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
  SDNode *AddcNode = N->getOperand(2).getNode();
  SDNode *AddeNode = N->getOperand(3).getNode();
  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
      (AddeNode->getOpcode() == ARMISD::ADDE) &&
      isNullConstant(AddeNode->getOperand(0)) &&
      isNullConstant(AddeNode->getOperand(1)) &&
      (AddeNode->getOperand(2).getNode() == AddcNode))
    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                       DAG.getVTList(MVT::i32, MVT::i32),
                       {N->getOperand(0), N->getOperand(1),
                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
  else
    return SDValue();
}

/// DAG combine for ARMISD::ADDC/SUBC: fold the (SUBC (ADDE 0, 0, C), 1) -> C
/// pattern and, on Thumb1, flip ADDC <-> SUBC when the immediate is negative.
static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG(DCI.DAG);

  if (N->getOpcode() == ARMISD::SUBC) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS->getOpcode() == ARMISD::ADDE &&
        isNullConstant(LHS->getOperand(0)) &&
        isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
      return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
    }
  }

  if (Subtarget->isThumb1Only()) {
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int32_t imm =
          C->getSExtValue();
      // Negative immediates are expensive on Thumb1: swap the opcode and
      // negate the constant instead. INT_MIN cannot be negated, so skip it.
      if (imm < 0 && imm > std::numeric_limits<int>::min()) {
        SDLoc DL(N);
        RHS = DAG.getConstant(-imm, DL, MVT::i32);
        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
                                                           : ARMISD::ADDC;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
      }
    }
  }

  return SDValue();
}

/// DAG combine for ARMISD::ADDE/SUBE. On Thumb1, swap ADDE <-> SUBE for
/// negative immediates; otherwise try the 64-bit MLAL combine when the
/// second operand is an ISD::SMUL_LOHI.
static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb1Only()) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int64_t imm = C->getSExtValue();
      if (imm < 0) {
        SDLoc DL(N);

        // The with-carry-in form matches bitwise not instead of the negation.
        // Effectively, the inverse interpretation of the carry flag already
        // accounts for part of the negation.
        RHS = DAG.getConstant(~imm, DL, MVT::i32);

        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
                                                           : ARMISD::ADDE;
        return DAG.getNode(Opcode, DL, N->getVTList(),
                           N->getOperand(0), RHS, N->getOperand(2));
      }
    }
  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  }
  return SDValue();
}

/// DAG combine for ISD::ABS: expand via TargetLowering::expandABS when the
/// operation is not legal for this type.
static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue res;
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Keep legal ABS nodes for instruction selection.
  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  if (!TLI.expandABS(N, res, DAG))
    return SDValue();

  return res;
}

/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
  if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget){
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}

/// Fold an i64 add of a (build_pair'd) MVE long vector reduction
/// (ARMISD::VADDLV*/VMLALV*) into the accumulating form of the reduction
/// (ARMISD::VADDLVA*/VMLALVA*).
static SDValue PerformADDVecReduce(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // We also need to check for sext / zext and commutitive adds.
  //
  // MakeVecReduce checks that NB is build_pair of both results of a reduction
  // with opcode `Opcode`, then rebuilds it as the accumulating opcode
  // `OpcodeA` with NA (split into lo/hi i32 halves) as the accumulator.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    SDLoc dl(N);
    SDValue Lo = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                                 DCI.DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                                 DCI.DAG.getConstant(1, dl, MVT::i32));
    // Forward the reduction's one or two vector operands unchanged.
    SDValue Red =
        VecRed->getNumOperands() == 1
            ? DCI.DAG.getNode(OpcodeA, dl,
                              DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
                              VecRed->getOperand(0))
            : DCI.DAG.getNode(OpcodeA, dl,
                              DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi,
                              VecRed->getOperand(0), VecRed->getOperand(1));
    return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                           SDValue(Red.getNode(), 1));
  };

  // Try both signed/unsigned variants and both operand orders.
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  return SDValue();
}

bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
11868 if (N->getOpcode() != ISD::SHL) 11869 return true; 11870 SDValue N1 = N->getOperand(0); 11871 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 11872 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 11873 return true; 11874 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 11875 if (Const->getAPIntValue().ult(256)) 11876 return false; 11877 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 11878 Const->getAPIntValue().sgt(-256)) 11879 return false; 11880 } 11881 return true; 11882 } 11883 11884 // Turn off commute-with-shift transform after legalization, so it doesn't 11885 // conflict with PerformSHLSimplify. (We could try to detect when 11886 // PerformSHLSimplify would trigger more precisely, but it isn't 11887 // really necessary.) 11888 return false; 11889 } 11890 11891 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 11892 const SDNode *N, CombineLevel Level) const { 11893 if (!Subtarget->isThumb1Only()) 11894 return true; 11895 11896 if (Level == BeforeLegalizeTypes) 11897 return true; 11898 11899 return false; 11900 } 11901 11902 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 11903 if (!Subtarget->hasNEON()) { 11904 if (Subtarget->isThumb1Only()) 11905 return VT.getScalarSizeInBits() <= 32; 11906 return true; 11907 } 11908 return VT.isScalarInteger(); 11909 } 11910 11911 static SDValue PerformSHLSimplify(SDNode *N, 11912 TargetLowering::DAGCombinerInfo &DCI, 11913 const ARMSubtarget *ST) { 11914 // Allow the generic combiner to identify potential bswaps. 
11915 if (DCI.isBeforeLegalize()) 11916 return SDValue(); 11917 11918 // DAG combiner will fold: 11919 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 11920 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 11921 // Other code patterns that can be also be modified have the following form: 11922 // b + ((a << 1) | 510) 11923 // b + ((a << 1) & 510) 11924 // b + ((a << 1) ^ 510) 11925 // b + ((a << 1) + 510) 11926 11927 // Many instructions can perform the shift for free, but it requires both 11928 // the operands to be registers. If c1 << c2 is too large, a mov immediate 11929 // instruction will needed. So, unfold back to the original pattern if: 11930 // - if c1 and c2 are small enough that they don't require mov imms. 11931 // - the user(s) of the node can perform an shl 11932 11933 // No shifted operands for 16-bit instructions. 11934 if (ST->isThumb() && ST->isThumb1Only()) 11935 return SDValue(); 11936 11937 // Check that all the users could perform the shl themselves. 11938 for (auto U : N->uses()) { 11939 switch(U->getOpcode()) { 11940 default: 11941 return SDValue(); 11942 case ISD::SUB: 11943 case ISD::ADD: 11944 case ISD::AND: 11945 case ISD::OR: 11946 case ISD::XOR: 11947 case ISD::SETCC: 11948 case ARMISD::CMP: 11949 // Check that the user isn't already using a constant because there 11950 // aren't any instructions that support an immediate operand and a 11951 // shifted operand. 11952 if (isa<ConstantSDNode>(U->getOperand(0)) || 11953 isa<ConstantSDNode>(U->getOperand(1))) 11954 return SDValue(); 11955 11956 // Check that it's not already using a shift. 
11957 if (U->getOperand(0).getOpcode() == ISD::SHL || 11958 U->getOperand(1).getOpcode() == ISD::SHL) 11959 return SDValue(); 11960 break; 11961 } 11962 } 11963 11964 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 11965 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 11966 return SDValue(); 11967 11968 if (N->getOperand(0).getOpcode() != ISD::SHL) 11969 return SDValue(); 11970 11971 SDValue SHL = N->getOperand(0); 11972 11973 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11974 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 11975 if (!C1ShlC2 || !C2) 11976 return SDValue(); 11977 11978 APInt C2Int = C2->getAPIntValue(); 11979 APInt C1Int = C1ShlC2->getAPIntValue(); 11980 11981 // Check that performing a lshr will not lose any information. 11982 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 11983 C2Int.getBitWidth() - C2->getZExtValue()); 11984 if ((C1Int & Mask) != C1Int) 11985 return SDValue(); 11986 11987 // Shift the first constant. 11988 C1Int.lshrInPlace(C2Int); 11989 11990 // The immediates are encoded as an 8-bit value that can be rotated. 11991 auto LargeImm = [](const APInt &Imm) { 11992 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 11993 return Imm.getBitWidth() - Zeros > 8; 11994 }; 11995 11996 if (LargeImm(C1Int) || LargeImm(C2Int)) 11997 return SDValue(); 11998 11999 SelectionDAG &DAG = DCI.DAG; 12000 SDLoc dl(N); 12001 SDValue X = SHL.getOperand(0); 12002 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 12003 DAG.getConstant(C1Int, dl, MVT::i32)); 12004 // Shift left to compensate for the lshr of C1Int. 
12005 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 12006 12007 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 12008 SHL.dump(); N->dump()); 12009 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 12010 return Res; 12011 } 12012 12013 12014 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 12015 /// 12016 static SDValue PerformADDCombine(SDNode *N, 12017 TargetLowering::DAGCombinerInfo &DCI, 12018 const ARMSubtarget *Subtarget) { 12019 SDValue N0 = N->getOperand(0); 12020 SDValue N1 = N->getOperand(1); 12021 12022 // Only works one way, because it needs an immediate operand. 12023 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12024 return Result; 12025 12026 if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget)) 12027 return Result; 12028 12029 // First try with the default operand order. 12030 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 12031 return Result; 12032 12033 // If that didn't work, try again with the operands commuted. 12034 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 12035 } 12036 12037 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 12038 /// 12039 static SDValue PerformSUBCombine(SDNode *N, 12040 TargetLowering::DAGCombinerInfo &DCI, 12041 const ARMSubtarget *Subtarget) { 12042 SDValue N0 = N->getOperand(0); 12043 SDValue N1 = N->getOperand(1); 12044 12045 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 12046 if (N1.getNode()->hasOneUse()) 12047 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 12048 return Result; 12049 12050 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 12051 return SDValue(); 12052 12053 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 12054 // so that we can readily pattern match more mve instructions which can use 12055 // a scalar operand. 
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  // Look through a bitcast on the zero vector operand.
  SDValue VMov = N->getOperand(0);
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  // Negate the scalar before duplicating it: (sub 0, dup x) == dup (sub 0, x).
  SDLoc dl(N);
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}

/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
//  However, for (A + B) * (A + B),
//    vadd d2, d0, d1
//    vmul d3, d0, d2
//    vmla d3, d1, d2
//  is slower than
//    vadd d2, d0, d1
//    vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  // The add/sub may be either multiply operand; canonicalize it into N0.
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  // (A + B) * (A + B) is faster without distribution (see above).
  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

/// DAG combine for ISD::MUL: lower multiplies by (shifted) 2^N +/- 1
/// constants into shift/add/sub sequences; vectors go to PerformVMULCombine.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  // Factor out the power-of-two part of the multiplier; it is re-applied as a
  // final left shift below.
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  // Re-apply the power-of-two factor that was stripped off above.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}

/// Thumb1 combine for (and (shl/srl x, c2), c1): rewrite the and-of-shift as
/// a pair of shifts so the (shifted) mask constant never needs materializing.
static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask.
Try to 12250 // transform to a pair of shifts, to save materializing c1. 12251 12252 // First pattern: right shift, then mask off leading bits. 12253 // FIXME: Use demanded bits? 12254 if (!LeftShift && isMask_32(C1)) { 12255 uint32_t C3 = countLeadingZeros(C1); 12256 if (C2 < C3) { 12257 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12258 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12259 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12260 DAG.getConstant(C3, DL, MVT::i32)); 12261 } 12262 } 12263 12264 // First pattern, reversed: left shift, then mask off trailing bits. 12265 if (LeftShift && isMask_32(~C1)) { 12266 uint32_t C3 = countTrailingZeros(C1); 12267 if (C2 < C3) { 12268 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 12269 DAG.getConstant(C3 - C2, DL, MVT::i32)); 12270 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 12271 DAG.getConstant(C3, DL, MVT::i32)); 12272 } 12273 } 12274 12275 // Second pattern: left shift, then mask off leading bits. 12276 // FIXME: Use demanded bits? 12277 if (LeftShift && isShiftedMask_32(C1)) { 12278 uint32_t Trailing = countTrailingZeros(C1); 12279 uint32_t C3 = countLeadingZeros(C1); 12280 if (Trailing == C2 && C2 + C3 < 32) { 12281 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12282 DAG.getConstant(C2 + C3, DL, MVT::i32)); 12283 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 12284 DAG.getConstant(C3, DL, MVT::i32)); 12285 } 12286 } 12287 12288 // Second pattern, reversed: right shift, then mask off trailing bits. 12289 // FIXME: Handle other patterns of known/demanded bits. 
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = countLeadingZeros(C1);
    uint32_t C3 = countTrailingZeros(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate
  //        than c1.
  return SDValue();
}

/// PerformANDCombine - Target-specific dag combine xforms for ISD::AND.
static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC (AND with the inverted immediate).
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;
      // VBIC clears the bits set in its immediate, so check the inverted
      // splat against the supported VMOV-style modified immediates.
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}

// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  // SMULW[B|T] requires ARMv6 (for ARM mode) or Thumb2 with DSP.
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // The OR is commutative; try both operand orders.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

/// Try to turn an OR of masked values into an ARMISD::BFI (bitfield insert).
/// The caller guarantees N->getOperand(0) is an ISD::AND with one use.
static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

/// Return true if condition code CC can be used directly by an MVE VCMP/VCMPZ.
/// HS/HI (unsigned comparisons) are only valid for integer compares.
static bool isValidMVECond(unsigned CC, bool IsFloat) {
  switch (CC) {
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::LE:
  case ARMCC::GT:
  case ARMCC::GE:
  case ARMCC::LT:
    return true;
  case ARMCC::HS:
  case ARMCC::HI:
    return !IsFloat;
  default:
    return false;
  };
}

static SDValue PerformORCombine_i1(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Extract the condition codes from the two compares; ARMCC::AL is used as a
  // sentinel for "operand is not a VCMP/VCMPZ".
  ARMCC::CondCodes CondCode0 = ARMCC::AL;
  ARMCC::CondCodes CondCode1 = ARMCC::AL;
  if (N0->getOpcode() == ARMISD::VCMP)
    CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
                    ->getZExtValue();
  else if (N0->getOpcode() == ARMISD::VCMPZ)
    CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
                    ->getZExtValue();
  if (N1->getOpcode() == ARMISD::VCMP)
    CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
                    ->getZExtValue();
  else if (N1->getOpcode() == ARMISD::VCMPZ)
    CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
                    ->getZExtValue();

  if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
    return SDValue();

  unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
  unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);

  // Both inverted conditions must be representable by an MVE compare.
  if (!isValidMVECond(Opposite0,
                      N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
      !isValidMVECond(Opposite1,
                      N1->getOperand(0)->getValueType(0).isFloatingPoint()))
    return SDValue();

  // Rebuild each compare with its inverted condition code (VCMP has one more
  // operand than VCMPZ).
  SmallVector<SDValue, 4> Ops0;
  Ops0.push_back(N0->getOperand(0));
  if (N0->getOpcode() == ARMISD::VCMP)
    Ops0.push_back(N0->getOperand(1));
  Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
  SmallVector<SDValue, 4> Ops1;
  Ops1.push_back(N1->getOperand(0));
  if (N1->getOpcode() == ARMISD::VCMP)
    Ops1.push_back(N1->getOperand(1));
  Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));

  // De Morgan: or A, B == ~(and ~A, ~B); the final XOR with all-ones restores
  // the original polarity.
  SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
  SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
  SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
  return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
                         DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  if (Subtarget->hasMVEIntegerOps() &&
      (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
    return PerformORCombine_i1(N, DCI, Subtarget);

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

/// PerformXORCombine - Target-specific dag combine xforms for ISD::XOR.
static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  return SDValue();
}

// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and
// return it, and fill in FromMask and ToMask with (consecutive) bits in "from"
// to be extracted and their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  // Operand 2 is the *inverted* insertion mask; invert it back so ToMask has
  // ones in the destination bit positions being written.
  SDValue From = N->getOperand(1);
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  // If the Base came from a SHR #C, we can deduce that it is really testing bit
  // #C in the base of the SHR.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B may be zero.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA = A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can
  // combine with, if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    // Check both orderings, since either BFI may supply the high half.
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}

/// Target-specific dag combine xforms for ARMISD::BFI: drop a redundant AND
/// on the inserted value, or merge two BFIs that insert adjacent bits of the
/// same base value.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the combined source bits don't start at bit 0, shift them down first.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
        ISD::SRL, dl, VT, From1,
        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlignment(), LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    // The second half lives 4 bytes above the first; its alignment cannot
    // exceed 4 regardless of the original load's alignment.
    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 std::min(4U, LD->getAlignment()),
                                 LD->getMemOperand()->getFlags());

    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    // On big-endian targets the register halves are swapped.
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
12914 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 12915 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 12916 SDValue Op0 = N->getOperand(0); 12917 SDValue Op1 = N->getOperand(1); 12918 if (Op0.getOpcode() == ISD::BITCAST) 12919 Op0 = Op0.getOperand(0); 12920 if (Op1.getOpcode() == ISD::BITCAST) 12921 Op1 = Op1.getOperand(0); 12922 if (Op0.getOpcode() == ARMISD::VMOVRRD && 12923 Op0.getNode() == Op1.getNode() && 12924 Op0.getResNo() == 0 && Op1.getResNo() == 1) 12925 return DAG.getNode(ISD::BITCAST, SDLoc(N), 12926 N->getValueType(0), Op0.getOperand(0)); 12927 return SDValue(); 12928 } 12929 12930 static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12931 // FullFP16: half values are passed in S-registers, and we don't 12932 // need any of the bitcast and moves: 12933 // 12934 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 12935 // t5: i32 = bitcast t2 12936 // t18: f16 = ARMISD::VMOVhr t5 12937 SDValue BC = N->getOperand(0); 12938 if (BC->getOpcode() != ISD::BITCAST) 12939 return SDValue(); 12940 SDValue Copy = BC->getOperand(0); 12941 if (Copy.getValueType() != MVT::f32 || Copy->getOpcode() != ISD::CopyFromReg) 12942 return SDValue(); 12943 12944 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; 12945 SDValue NewCopy = DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), MVT::f16, Ops); 12946 return NewCopy; 12947 } 12948 12949 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 12950 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 12951 /// i64 vector to have f64 elements, since the value can then be loaded 12952 /// directly into a VFP register. 
12953 static bool hasNormalLoadOperand(SDNode *N) { 12954 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 12955 for (unsigned i = 0; i < NumElts; ++i) { 12956 SDNode *Elt = N->getOperand(i).getNode(); 12957 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 12958 return true; 12959 } 12960 return false; 12961 } 12962 12963 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 12964 /// ISD::BUILD_VECTOR. 12965 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 12966 TargetLowering::DAGCombinerInfo &DCI, 12967 const ARMSubtarget *Subtarget) { 12968 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 12969 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 12970 // into a pair of GPRs, which is fine when the value is used as a scalar, 12971 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 12972 SelectionDAG &DAG = DCI.DAG; 12973 if (N->getNumOperands() == 2) 12974 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 12975 return RV; 12976 12977 // Load i64 elements as f64 values so that type legalization does not split 12978 // them up into i32 values. 12979 EVT VT = N->getValueType(0); 12980 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 12981 return SDValue(); 12982 SDLoc dl(N); 12983 SmallVector<SDValue, 8> Ops; 12984 unsigned NumElts = VT.getVectorNumElements(); 12985 for (unsigned i = 0; i < NumElts; ++i) { 12986 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 12987 Ops.push_back(V); 12988 // Make the DAGCombiner fold the bitcast. 12989 DCI.AddToWorklist(V.getNode()); 12990 } 12991 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 12992 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 12993 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 12994 } 12995 12996 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 
12997 static SDValue 12998 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12999 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 13000 // At that time, we may have inserted bitcasts from integer to float. 13001 // If these bitcasts have survived DAGCombine, change the lowering of this 13002 // BUILD_VECTOR in something more vector friendly, i.e., that does not 13003 // force to use floating point types. 13004 13005 // Make sure we can change the type of the vector. 13006 // This is possible iff: 13007 // 1. The vector is only used in a bitcast to a integer type. I.e., 13008 // 1.1. Vector is used only once. 13009 // 1.2. Use is a bit convert to an integer type. 13010 // 2. The size of its operands are 32-bits (64-bits are not legal). 13011 EVT VT = N->getValueType(0); 13012 EVT EltVT = VT.getVectorElementType(); 13013 13014 // Check 1.1. and 2. 13015 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 13016 return SDValue(); 13017 13018 // By construction, the input type must be float. 13019 assert(EltVT == MVT::f32 && "Unexpected type!"); 13020 13021 // Check 1.2. 13022 SDNode *Use = *N->use_begin(); 13023 if (Use->getOpcode() != ISD::BITCAST || 13024 Use->getValueType(0).isFloatingPoint()) 13025 return SDValue(); 13026 13027 // Check profitability. 13028 // Model is, if more than half of the relevant operands are bitcast from 13029 // i32, turn the build_vector into a sequence of insert_vector_elt. 13030 // Relevant operands are everything that is not statically 13031 // (i.e., at compile time) bitcasted. 13032 unsigned NumOfBitCastedElts = 0; 13033 unsigned NumElts = VT.getVectorNumElements(); 13034 unsigned NumOfRelevantElts = NumElts; 13035 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 13036 SDValue Elt = N->getOperand(Idx); 13037 if (Elt->getOpcode() == ISD::BITCAST) { 13038 // Assume only bit cast to i32 will go away. 
13039 if (Elt->getOperand(0).getValueType() == MVT::i32) 13040 ++NumOfBitCastedElts; 13041 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 13042 // Constants are statically casted, thus do not count them as 13043 // relevant operands. 13044 --NumOfRelevantElts; 13045 } 13046 13047 // Check if more than half of the elements require a non-free bitcast. 13048 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 13049 return SDValue(); 13050 13051 SelectionDAG &DAG = DCI.DAG; 13052 // Create the new vector type. 13053 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 13054 // Check if the type is legal. 13055 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13056 if (!TLI.isTypeLegal(VecVT)) 13057 return SDValue(); 13058 13059 // Combine: 13060 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 13061 // => BITCAST INSERT_VECTOR_ELT 13062 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 13063 // (BITCAST EN), N. 13064 SDValue Vec = DAG.getUNDEF(VecVT); 13065 SDLoc dl(N); 13066 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 13067 SDValue V = N->getOperand(Idx); 13068 if (V.isUndef()) 13069 continue; 13070 if (V.getOpcode() == ISD::BITCAST && 13071 V->getOperand(0).getValueType() == MVT::i32) 13072 // Fold obvious case. 13073 V = V.getOperand(0); 13074 else { 13075 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 13076 // Make the DAGCombiner fold the bitcasts. 13077 DCI.AddToWorklist(V.getNode()); 13078 } 13079 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 13080 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 13081 } 13082 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 13083 // Make the DAGCombiner fold the bitcasts. 
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}

/// PerformPREDICATE_CASTCombine - Fold redundant chains of
/// ARMISD::PREDICATE_CAST: a cast of a cast collapses to a single cast (or to
/// nothing when the innermost type already matches the result type).
static SDValue
PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
  if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
                           Op->getOperand(0).getValueType(), Op->getOperand(0));
  }

  return SDValue();
}

/// PerformVECTOR_REG_CASTCombine - Simplify ARMISD::VECTOR_REG_CAST nodes:
/// on little-endian targets the cast is just a bitcast, and casts of casts
/// collapse to a single cast.
static SDValue
PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  if (ST->isLittle())
    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);

  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  }

  return SDValue();
}

/// PerformVCMPCombine - Canonicalize MVE ARMISD::VCMP nodes: compares against
/// a zero vector become VCMPZ, and a splatted (VDUP) or zero left-hand operand
/// is moved to the right by swapping the condition code when the swapped
/// condition is still a valid MVE condition.
static SDValue PerformVCMPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Operand 2 carries the condition code as a constant.
  ARMCC::CondCodes Cond =
      (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  SDLoc dl(N);

  // vcmp X, 0, cc -> vcmpz X, cc
  if (isZeroVector(Op1))
    return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
                           N->getOperand(2));

  unsigned SwappedCond = getSwappedCondition(Cond);
  if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
    // vcmp 0, X, cc -> vcmpz X, reversed(cc)
    if (isZeroVector(Op0))
      return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
                             DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
    // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
    if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
      return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
                             DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
  }

  return SDValue();
}

/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
  EVT VT = N->getValueType(0);
  SDNode *Elt = N->getOperand(1).getNode();
  // Only handle non-volatile plain loads of i64 elements.
  if (VT.getVectorElementType() != MVT::i64 ||
      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 VT.getVectorNumElements());
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(V.getNode());
  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                               Vec, V, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}

/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors.  That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector.
  // Do that
  // transformation here:
  //    shuffle(concat(v1, undef), concat(v2, undef)) ->
  //    shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Both operands must be two-operand concats whose second halves are undef.
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask: elements selecting the (undef) upper halves of
  // the old operands map to -1; elements from the second operand are rebased
  // into the upper half of the new combined concat.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts/2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask);
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  // Intrinsics and stores carry (chain, intrinsic-id/value, addr, ...);
  // loads carry (chain, addr, ...).
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2dup:
      case Intrinsic::arm_neon_vld3dup:
      case Intrinsic::arm_neon_vld4dup:
        // TODO: Support updating VLDxDUP nodes. For now, we just skip
        // combining base updates for such intrinsics.
        continue;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    // Lane (and dup) operations only touch one element per vector.
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics).  We need to change the
    //   memory type to match the explicit alignment.  That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    // Up to 4 result vectors, plus the write-back address and the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value.  Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is an non-standard-aligned LOAD, the first result is the loaded
    // value.  Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    // The ADD user is replaced by the write-back address result.
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

/// PerformVLDCombine - Try to merge a base-address update into a NEON
/// VLDxDUP / load-store intrinsic node, once legalization is done.
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// PerformMVEVLDCombine - Try to fold a post-increment of the address into
/// MVE vld2q/vld4q/vst2q/vst4q intrinsics, producing _UPD nodes.
static SDValue PerformMVEVLDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Addr = N->getOperand(2);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // For the stores, where there are multiple intrinsics we only actually want
  // to post-inc the last of them.
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_mve_vst2q &&
      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
    return SDValue();
  if (IntNo == Intrinsic::arm_mve_vst4q &&
      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
    return SDValue();

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
    case Intrinsic::arm_mve_vld2q:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_mve_vld4q:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_mve_vst2q:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_mve_vst4q:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else {
      VecTy = N->getOperand(3).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (!CInc || CInc->getZExtValue() != NumBytes)
      continue;

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    // Up to 4 result vectors, plus the write-back address and the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(2)); // ptr
    Ops.push_back(Inc);

    // Copy the remaining intrinsic operands unchanged.
    for (unsigned i = 3; i < N->getNumOperands(); ++i)
      Ops.push_back(N->getOperand(i));

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    // The ADD user is replaced by the write-back address result.
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }

  return SDValue();
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  // Up to 4 result vectors, plus the chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    // Each VDUPLANE user is replaced by the corresponding vldN-dup result.
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op = N->getOperand(0);

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant.  Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  // A zero immediate decodes to 0; treat it as an 8-bit element splat.
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  EVT VT = N->getValueType(0);
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = N->getOperand(0);

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
                                             Ops, LD->getMemoryVT(),
                                             LD->getMemOperand());
    // Re-route the load's chain users to the new node's chain result.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

/// PerformLOADCombine - Target-specific dag combine xforms for ISD::LOAD.
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

// Optimize trunc store (of multiple scalars) to shuffle and store.  First,
// pack all of the elements in one place.  Next, store to memory in fewer
// chunks.
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
                                             SelectionDAG &DAG) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (!St->isTruncatingStore() || !VT.isVector())
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT StVT = St->getMemoryVT();
  unsigned NumElems = VT.getVectorNumElements();
  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromEltSz = VT.getScalarSizeInBits();
  unsigned ToEltSz = StVT.getScalarSizeInBits();

  // From, To sizes and ElemCount must be pow of two
  if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
    return SDValue();

  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  if (0 != (NumElems * FromEltSz) % ToEltSz)
    return SDValue();

  unsigned SizeRatio = FromEltSz / ToEltSz;
  assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                   NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDLoc DL(St);
  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  // Select the narrow sub-element holding each truncated value: the last
  // sub-element of each group on big-endian, the first on little-endian.
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
                                                      : i * SizeRatio;

  // Can't shuffle using an illegal type.
  if (!TLI.isTypeLegal(WideVecVT))
    return SDValue();

  SDValue Shuff = DAG.getVectorShuffle(
      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  // At this point all of the data is stored at the bottom of the
  // register. We now need to save it to mem.

  // Find the largest store unit
  MVT StoreType = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
      StoreType = Tp;
  }
  // Didn't find a legal store type.
  if (!TLI.isTypeLegal(StoreType))
    return SDValue();

  // Bitcast the original vector into a vector of store-size units
  EVT StoreVecVT =
      EVT::getVectorVT(*DAG.getContext(), StoreType,
                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  SmallVector<SDValue, 8> Chains;
  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue BasePtr = St->getBasePtr();

  // Perform one or more big stores into memory.
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  // Join the chains of the individual stores.
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from a truncate (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::TRUNCATE)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  // Pick the per-chunk element count for the narrowing stores; only
  // i32->i16/i8 and i16->i8 truncations are handled.
  unsigned NumElements = 0;
  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
    NumElements = 4;
  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
    NumElements = 8;
  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  unsigned Alignment = St->getOriginalAlignment();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);

  // Emit one truncating store per NumElements-sized chunk of the source.
  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));
    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment, MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();

  if (Subtarget->hasNEON())
    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
      return Store;

  if (Subtarget->hasMVEIntegerOps())
    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
      return NewToken;

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    // Store the low half at the base address...
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    // ...and the high half 4 bytes further on (the second store can only
    // guarantee half the original alignment, capped at 4).
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  // This combine produces NEON fixed-point conversion intrinsics only.
  if (!Subtarget->hasNEON())
    return SDValue();

  // Match fp_to_sint/fp_to_uint (N) fed by an FMUL whose second operand is a
  // build_vector constant.
  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  // The multiplier must be a splat of 2^C with 1 <= C <= 32; C then becomes
  // the #fbits immediate of the fixed-point conversion.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  // The intrinsic always produces i32 lanes; truncate back down if the
  // original conversion targeted a narrower integer element type.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
14019 /// 14020 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 14021 /// vcvt.f32.s32 d16, d16 14022 /// vdiv.f32 d16, d17, d16 14023 /// becomes: 14024 /// vcvt.f32.s32 d16, d16, #3 14025 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 14026 const ARMSubtarget *Subtarget) { 14027 if (!Subtarget->hasNEON()) 14028 return SDValue(); 14029 14030 SDValue Op = N->getOperand(0); 14031 unsigned OpOpcode = Op.getNode()->getOpcode(); 14032 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 14033 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 14034 return SDValue(); 14035 14036 SDValue ConstVec = N->getOperand(1); 14037 if (!isa<BuildVectorSDNode>(ConstVec)) 14038 return SDValue(); 14039 14040 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 14041 uint32_t FloatBits = FloatTy.getSizeInBits(); 14042 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 14043 uint32_t IntBits = IntTy.getSizeInBits(); 14044 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 14045 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 14046 // These instructions only exist converting from i32 to f32. We can handle 14047 // smaller integers by generating an extra extend, but larger ones would 14048 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 14049 // these intructions only support v2i32/v4i32 types. 14050 return SDValue(); 14051 } 14052 14053 BitVector UndefElements; 14054 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 14055 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 14056 if (C == -1 || C == 0 || C > 32) 14057 return SDValue(); 14058 14059 SDLoc dl(N); 14060 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 14061 SDValue ConvInput = Op.getOperand(0); 14062 if (IntBits < FloatBits) 14063 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 14064 dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, 14065 ConvInput); 14066 14067 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 14068 Intrinsic::arm_neon_vcvtfxu2fp; 14069 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 14070 Op.getValueType(), 14071 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 14072 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 14073 } 14074 14075 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 14076 const ARMSubtarget *ST) { 14077 if (!ST->hasMVEIntegerOps()) 14078 return SDValue(); 14079 14080 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 14081 EVT ResVT = N->getValueType(0); 14082 SDValue N0 = N->getOperand(0); 14083 SDLoc dl(N); 14084 14085 // We are looking for something that will have illegal types if left alone, 14086 // but that we can convert to a single instruction undef MVE. For example 14087 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 14088 // or 14089 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 14090 14091 // Cases: 14092 // VADDV u/s 8/16/32 14093 // VMLAV u/s 8/16/32 14094 // VADDLV u/s 32 14095 // VMLALV u/s 16/32 14096 14097 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 14098 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 14099 return SDValue(); 14100 SDValue A = N0->getOperand(0); 14101 if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) 14102 return A; 14103 return SDValue(); 14104 }; 14105 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 14106 SDValue &A, SDValue &B) { 14107 if (ResVT != RetTy || N0->getOpcode() != ISD::MUL) 14108 return false; 14109 SDValue ExtA = N0->getOperand(0); 14110 SDValue ExtB = N0->getOperand(1); 14111 if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) 14112 return false; 14113 A = ExtA->getOperand(0); 14114 B = ExtB->getOperand(0); 14115 if (A.getValueType() == B.getValueType() && 14116 llvm::any_of(ExtTypes, [&A](MVT Ty) { return 
A.getValueType() == Ty; })) 14117 return true; 14118 return false; 14119 }; 14120 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 14121 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 14122 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 14123 SDValue(Node.getNode(), 1)); 14124 }; 14125 14126 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 14127 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 14128 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 14129 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 14130 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32})) 14131 return Create64bitNode(ARMISD::VADDLVs, {A}); 14132 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32})) 14133 return Create64bitNode(ARMISD::VADDLVu, {A}); 14134 14135 SDValue A, B; 14136 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 14137 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 14138 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 14139 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 14140 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) 14141 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 14142 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B)) 14143 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 14144 return SDValue(); 14145 } 14146 14147 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) { 14148 SDLoc DL(N); 14149 SDValue Op0 = N->getOperand(0); 14150 SDValue Op1 = N->getOperand(1); 14151 14152 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from 14153 // uses of the intrinsics. 
  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    int ShiftAmt = C->getSExtValue();
    if (ShiftAmt == 0) {
      // Shifting by zero is the identity: forward both input halves on
      // unchanged.
      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
      DAG.ReplaceAllUsesWith(N, Merge.getNode());
      return SDValue();
    }

    if (ShiftAmt >= -32 && ShiftAmt < 0) {
      // Negate the shift amount and flip the direction (LSLL <-> LSRL).
      unsigned NewOpcode =
          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
      return NewShift;
    }
  }

  return SDValue();
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First switch: validate that the shift amount is a legal immediate for
    // this intrinsic (and for vshift, pick the opcode, since it can be either
    // a left or a right shift).
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      // vqshlu only exists with an immediate shift count, so a non-immediate
      // count here is a front-end/IR error.
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second switch: map the intrinsic to the target shift node (vshift was
    // already handled above because its direction depends on the count).
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
    }
  }

  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
14362 SDValue N0 = N->getOperand(0); 14363 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14364 if (!ShiftAmtNode) 14365 return SDValue(); 14366 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 14367 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 14368 if (!AndMaskNode) 14369 return SDValue(); 14370 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 14371 // Don't transform uxtb/uxth. 14372 if (AndMask == 255 || AndMask == 65535) 14373 return SDValue(); 14374 if (isMask_32(AndMask)) { 14375 uint32_t MaskedBits = countLeadingZeros(AndMask); 14376 if (MaskedBits > ShiftAmt) { 14377 SDLoc DL(N); 14378 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 14379 DAG.getConstant(MaskedBits, DL, MVT::i32)); 14380 return DAG.getNode( 14381 ISD::SRL, DL, MVT::i32, SHL, 14382 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 14383 } 14384 } 14385 } 14386 14387 // Nothing to be done for scalar shifts. 14388 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14389 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 14390 return SDValue(); 14391 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 14392 return SDValue(); 14393 14394 int64_t Cnt; 14395 14396 switch (N->getOpcode()) { 14397 default: llvm_unreachable("unexpected shift opcode"); 14398 14399 case ISD::SHL: 14400 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 14401 SDLoc dl(N); 14402 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 14403 DAG.getConstant(Cnt, dl, MVT::i32)); 14404 } 14405 break; 14406 14407 case ISD::SRA: 14408 case ISD::SRL: 14409 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 14410 unsigned VShiftOpc = 14411 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

// Look for a sign/zero extend of a larger than legal load. This can be split
// into two extending loads, which are simpler to deal with than an arbitrary
// sign extend.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::LOAD)
    return SDValue();
  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
  // Only simple, single-use, unindexed, non-extending loads are safe to
  // split and re-extend here.
  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();
  EVT FromVT = LD->getValueType(0);
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  // NumElements selects the element-pair shapes we handle; bail out unless
  // the source vector is strictly larger than, and evenly divisible into,
  // pieces of that many lanes (so splitting in half actually helps).
  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
    NumElements = 4;
  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
    NumElements = 8;
  if (NumElements == 0 ||
      FromVT.getVectorNumElements() == NumElements ||
      FromVT.getVectorNumElements() % NumElements != 0 ||
      !isPowerOf2_32(NumElements))
    return SDValue();

  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  unsigned Alignment = LD->getOriginalAlignment();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

  ISD::LoadExtType NewExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  // Byte offset of the second half within the original load.
  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);

  // Split the load in half, each side of which is extended separately. This
  // is good enough, as legalisation will take it from there. They are either
  // already legal or they will be split further into something that is
  // legal.
  SDValue NewLoad1 =
      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
                  LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
  SDValue NewLoad2 =
      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                  Alignment, MMOFlags, AAInfo);

  // Join the two loads' output chains so users of the original load's chain
  // are ordered against both halves.
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                 SDValue(NewLoad1.getNode(), 1),
                                 SDValue(NewLoad2.getNode(), 1));
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        // any_extend can use the (cheaper) zeroing form since the high bits
        // are unspecified anyway.
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}

// Returns a pointer to the constant's APInt value if V is a power-of-2
// integer constant, otherwise nullptr.
static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  // CN (the tested mask) must be a single bit.
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  // The false value of the canonicalized CMOV must be the OR's input,
  // i.e. the pattern really is "y |= CM when bit set, else y".
  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first so the tested bit sits in bit 0.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  // Insert the (shifted) tested bit of X into each set bit position of the
  // OR mask, one BFI per bit.
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    // An xor-with-1 inverts the condition; record that and look through it.
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    // Record the compared-against constant (0 or 1) and the condition, then
    // keep searching through the setcc's operand.
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isNullValue())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_set_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check that how the brcond is using the result of each
  // of the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  // Extract condition, compared-against immediate and destination from
  // either a BRCOND or a BR_CC node.
  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isNullValue())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);

  // True when the branch is taken if the intrinsic's counter value is zero.
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
          && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_set_loop_iterations) {
    SDValue Res;
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = { Chain, Elements, Dest };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = { Chain, Elements, OtherTarget };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
    return Res;
  } else {
    SDValue Size = DAG.getTargetConstant(
      cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  return SDValue();
}

/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
14799 return SDValue(); 14800 14801 EVT VT = N->getValueType(0); 14802 SDLoc dl(N); 14803 SDValue LHS = Cmp.getOperand(0); 14804 SDValue RHS = Cmp.getOperand(1); 14805 SDValue Chain = N->getOperand(0); 14806 SDValue BB = N->getOperand(1); 14807 SDValue ARMcc = N->getOperand(2); 14808 ARMCC::CondCodes CC = 14809 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 14810 14811 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 14812 // -> (brcond Chain BB CC CPSR Cmp) 14813 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 14814 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 14815 LHS->getOperand(0)->hasOneUse()) { 14816 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 14817 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 14818 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 14819 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 14820 if ((LHS00C && LHS00C->getZExtValue() == 0) && 14821 (LHS01C && LHS01C->getZExtValue() == 1) && 14822 (LHS1C && LHS1C->getZExtValue() == 1) && 14823 (RHSC && RHSC->getZExtValue() == 0)) { 14824 return DAG.getNode( 14825 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 14826 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 14827 } 14828 } 14829 14830 return SDValue(); 14831 } 14832 14833 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 14834 SDValue 14835 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 14836 SDValue Cmp = N->getOperand(4); 14837 if (Cmp.getOpcode() != ARMISD::CMPZ) 14838 // Only looking at EQ and NE cases. 
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    // False value equals the compared-against value: select LHS on NE.
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    // Dual case: rebuild the compare with SETNE and swap in FalseVal.
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // Collapse a CMOV whose condition is itself a materialized i1 from a CMOV:
  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  if (!VT.isInteger())
      return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //   (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //   x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  //   t1 = (USUBO (SUB x, y), 1)
  //   t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  //   Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially, the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  //   t1 = (USUBO x, 1)
  //   t2 = (SUBCARRY x, t1:0, t1:1)
  //   Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    // Tag the replacement with an AssertZext matching the known zero mask of
    // the original CMOV, so later combines keep the zero-extension facts.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}

/// Central dispatch for all target-specific DAG combines: routes each opcode
/// to its Perform*Combine helper, and applies demanded-bits simplification
/// inline for the narrow-multiply/saturating nodes that only read 8 or 16
/// bits of their operands.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI, Subtarget);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ISD::BRCOND:
  case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ARMISD::VMOVhr:  return PerformVMOVhrCombine(N, DCI);
  case ISD::STORE:      return PerformSTORECombine(N, DCI, Subtarget);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP:    return PerformVDUPCombine(N, DCI, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return PerformShiftCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD: return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ARMISD::PREDICATE_CAST:
    return PerformPREDICATE_CASTCombine(N, DCI);
  case ARMISD::VECTOR_REG_CAST:
    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
  case ARMISD::VCMP:
    return PerformVCMPCombine(N, DCI, Subtarget);
  case ISD::VECREDUCE_ADD:
    return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
  case ARMISD::ASRL:
  case ARMISD::LSRL:
  case ARMISD::LSLL:
    return PerformLongShiftCombine(N, DCI.DAG);
  case ARMISD::SMULWB: {
    // SMULWB only reads the low 16 bits of operand 1.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    // SMULWT only reads the high 16 bits of operand 1.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB:
  case ARMISD::QADD16b:
  case ARMISD::QSUB16b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    // Bottom of operand 0, top of operand 1.
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    // Top of operand 0, bottom of operand 1.
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::QADD8b:
  case ARMISD::QSUB8b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    case Intrinsic::arm_mve_vld2q:
    case Intrinsic::arm_mve_vld4q:
    case Intrinsic::arm_mve_vst2q:
    case Intrinsic::arm_mve_vst4q:
      return PerformMVEVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}

/// Only f32 load/store is worth turning into an integer op (it fits a GPR).
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LRDB, LRDH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case. But can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = true;
    return true;
  }

  return false;
}


/// Pick a wide vector type for memcpy/memset expansion when NEON is present
/// and the access is either aligned or cheap to do misaligned.
EVT ARMTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Op.size() >= 8 &&
               (Op.isAligned(Align(8)) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  // Only the i64 -> i32 truncation is free (see comment above).
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  // EVT flavour of the above; vectors are never a free truncate.
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  // Zero-extension is only free when it can be folded into a load.
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::f16:
    return Subtarget->hasFullFP16();
  }

  return false;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  // An extend "doubles" if its result element width is exactly twice its
  // source element width.
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  if (!I->getType()->isVectorTy())
    return false;

  if (Subtarget->hasNEON()) {
    switch (I->getOpcode()) {
    case Instruction::Sub:
    case Instruction::Add: {
      // Sink widening extends so they fold into vaddl/vsubl-style ops.
      if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
        return false;
      Ops.push_back(&I->getOperandUse(0));
      Ops.push_back(&I->getOperandUse(1));
      return true;
    }
    default:
      return false;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // MVE: which opcodes can take a splatted (duplicated scalar) operand, and
  // in which operand position.
  auto IsSinker = [](Instruction *I, int Operand) {
    switch (I->getOpcode()) {
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::ICmp:
      return true;
    case Instruction::Sub:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      // Non-commutative: only the right-hand operand can be the splat.
      return Operand == 1;
    default:
      return false;
    }
  };

  int Op = 0;
  if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
    Op = 1;
  if (!IsSinker(I, Op))
    return false;
  // Match the canonical splat: insertelement into undef at lane 0, then a
  // zero-mask shufflevector.
  if (!match(I->getOperand(Op),
             m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
                             m_Undef(), m_Zero()))) {
    return false;
  }
  Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
  // All uses of the shuffle should be sunk to avoid duplicating it across gpr
  // and vector registers
  for (Use &U : Shuffle->uses()) {
    Instruction *Insn = cast<Instruction>(U.getUser());
    if (!IsSinker(Insn, U.getOperandNo()))
      return false;
  }
  Ops.push_back(&Shuffle->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(Op));
  return true;
}

bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  // Expanding masked loads can't fold the extension.
  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
    if (Ld->isExpandingLoad())
      return false;
  }

  if (Subtarget->hasMVEIntegerOps())
    return true;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what. There can be two uses by the same instruction.
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // -1 means the mode is illegal; otherwise scaled addressing is free except
  // on FPAO cores, where negative offsets cost an extra cycle.
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
15484 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 15485 EVT VT) const { 15486 if (!VT.isSimple()) 15487 return false; 15488 15489 switch (VT.getSimpleVT().SimpleTy) { 15490 case MVT::v4f32: 15491 case MVT::v8f16: 15492 return Subtarget->hasMVEFloatOps(); 15493 case MVT::f16: 15494 return Subtarget->useFPVFMx16(); 15495 case MVT::f32: 15496 return Subtarget->useFPVFMx(); 15497 case MVT::f64: 15498 return Subtarget->useFPVFMx64(); 15499 default: 15500 break; 15501 } 15502 15503 return false; 15504 } 15505 15506 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 15507 if (V < 0) 15508 return false; 15509 15510 unsigned Scale = 1; 15511 switch (VT.getSimpleVT().SimpleTy) { 15512 case MVT::i1: 15513 case MVT::i8: 15514 // Scale == 1; 15515 break; 15516 case MVT::i16: 15517 // Scale == 2; 15518 Scale = 2; 15519 break; 15520 default: 15521 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR 15522 // Scale == 4; 15523 Scale = 4; 15524 break; 15525 } 15526 15527 if ((V & (Scale - 1)) != 0) 15528 return false; 15529 return isUInt<5>(V / Scale); 15530 } 15531 15532 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 15533 const ARMSubtarget *Subtarget) { 15534 if (!VT.isInteger() && !VT.isFloatingPoint()) 15535 return false; 15536 if (VT.isVector() && Subtarget->hasNEON()) 15537 return false; 15538 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && 15539 !Subtarget->hasMVEFloatOps()) 15540 return false; 15541 15542 bool IsNeg = false; 15543 if (V < 0) { 15544 IsNeg = true; 15545 V = -V; 15546 } 15547 15548 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U); 15549 15550 // MVE: size * imm7 15551 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { 15552 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { 15553 case MVT::i32: 15554 case MVT::f32: 15555 return isShiftedUInt<7,2>(V); 15556 case MVT::i16: 15557 case MVT::f16: 15558 return isShiftedUInt<7,1>(V); 15559 
case MVT::i8: 15560 return isUInt<7>(V); 15561 default: 15562 return false; 15563 } 15564 } 15565 15566 // half VLDR: 2 * imm8 15567 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) 15568 return isShiftedUInt<8, 1>(V); 15569 // VLDR and LDRD: 4 * imm8 15570 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) 15571 return isShiftedUInt<8, 2>(V); 15572 15573 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 15574 // + imm12 or - imm8 15575 if (IsNeg) 15576 return isUInt<8>(V); 15577 return isUInt<12>(V); 15578 } 15579 15580 return false; 15581 } 15582 15583 /// isLegalAddressImmediate - Return true if the integer value can be used 15584 /// as the offset of the target addressing mode for load / store of the 15585 /// given type. 15586 static bool isLegalAddressImmediate(int64_t V, EVT VT, 15587 const ARMSubtarget *Subtarget) { 15588 if (V == 0) 15589 return true; 15590 15591 if (!VT.isSimple()) 15592 return false; 15593 15594 if (Subtarget->isThumb1Only()) 15595 return isLegalT1AddressImmediate(V, VT); 15596 else if (Subtarget->isThumb2()) 15597 return isLegalT2AddressImmediate(V, VT, Subtarget); 15598 15599 // ARM mode. 15600 if (V < 0) 15601 V = - V; 15602 switch (VT.getSimpleVT().SimpleTy) { 15603 default: return false; 15604 case MVT::i1: 15605 case MVT::i8: 15606 case MVT::i32: 15607 // +- imm12 15608 return isUInt<12>(V); 15609 case MVT::i16: 15610 // +- imm8 15611 return isUInt<8>(V); 15612 case MVT::f32: 15613 case MVT::f64: 15614 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 
15615 return false; 15616 return isShiftedUInt<8, 2>(V); 15617 } 15618 } 15619 15620 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 15621 EVT VT) const { 15622 int Scale = AM.Scale; 15623 if (Scale < 0) 15624 return false; 15625 15626 switch (VT.getSimpleVT().SimpleTy) { 15627 default: return false; 15628 case MVT::i1: 15629 case MVT::i8: 15630 case MVT::i16: 15631 case MVT::i32: 15632 if (Scale == 1) 15633 return true; 15634 // r + r << imm 15635 Scale = Scale & ~1; 15636 return Scale == 2 || Scale == 4 || Scale == 8; 15637 case MVT::i64: 15638 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 15639 // version in Thumb mode. 15640 // r + r 15641 if (Scale == 1) 15642 return true; 15643 // r * 2 (this can be lowered to r + r). 15644 if (!AM.HasBaseReg && Scale == 2) 15645 return true; 15646 return false; 15647 case MVT::isVoid: 15648 // Note, we allow "void" uses (basically, uses that aren't loads or 15649 // stores), because arm allows folding a scale into many arithmetic 15650 // operations. This should be made more precise and revisited later. 15651 15652 // Allow r << imm, but the imm has to be a multiple of two. 15653 if (Scale & 1) return false; 15654 return isPowerOf2_32(Scale); 15655 } 15656 } 15657 15658 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 15659 EVT VT) const { 15660 const int Scale = AM.Scale; 15661 15662 // Negative scales are not supported in Thumb1. 15663 if (Scale < 0) 15664 return false; 15665 15666 // Thumb1 addressing modes do not support register scaling excepting the 15667 // following cases: 15668 // 1. Scale == 1 means no scaling. 15669 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 15670 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 15671 } 15672 15673 /// isLegalAddressingMode - Return true if the addressing mode represented 15674 /// by AM is legal for this target, for a load/store of the specified type. 
15675 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 15676 const AddrMode &AM, Type *Ty, 15677 unsigned AS, Instruction *I) const { 15678 EVT VT = getValueType(DL, Ty, true); 15679 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 15680 return false; 15681 15682 // Can never fold addr of global into load/store. 15683 if (AM.BaseGV) 15684 return false; 15685 15686 switch (AM.Scale) { 15687 case 0: // no scale reg, must be "r+i" or "r", or "i". 15688 break; 15689 default: 15690 // ARM doesn't support any R+R*scale+imm addr modes. 15691 if (AM.BaseOffs) 15692 return false; 15693 15694 if (!VT.isSimple()) 15695 return false; 15696 15697 if (Subtarget->isThumb1Only()) 15698 return isLegalT1ScaledAddressingMode(AM, VT); 15699 15700 if (Subtarget->isThumb2()) 15701 return isLegalT2ScaledAddressingMode(AM, VT); 15702 15703 int Scale = AM.Scale; 15704 switch (VT.getSimpleVT().SimpleTy) { 15705 default: return false; 15706 case MVT::i1: 15707 case MVT::i8: 15708 case MVT::i32: 15709 if (Scale < 0) Scale = -Scale; 15710 if (Scale == 1) 15711 return true; 15712 // r + r << imm 15713 return isPowerOf2_32(Scale & ~1); 15714 case MVT::i16: 15715 case MVT::i64: 15716 // r +/- r 15717 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 15718 return true; 15719 // r * 2 (this can be lowered to r + r). 15720 if (!AM.HasBaseReg && Scale == 2) 15721 return true; 15722 return false; 15723 15724 case MVT::isVoid: 15725 // Note, we allow "void" uses (basically, uses that aren't loads or 15726 // stores), because arm allows folding a scale into many arithmetic 15727 // operations. This should be made more precise and revisited later. 15728 15729 // Allow r << imm, but the imm has to be a multiple of two. 
15730 if (Scale & 1) return false; 15731 return isPowerOf2_32(Scale); 15732 } 15733 } 15734 return true; 15735 } 15736 15737 /// isLegalICmpImmediate - Return true if the specified immediate is legal 15738 /// icmp immediate, that is the target has icmp instructions which can compare 15739 /// a register against the immediate without having to materialize the 15740 /// immediate into a register. 15741 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 15742 // Thumb2 and ARM modes can use cmn for negative immediates. 15743 if (!Subtarget->isThumb()) 15744 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 15745 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 15746 if (Subtarget->isThumb2()) 15747 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 15748 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 15749 // Thumb1 doesn't have cmn, and only 8-bit immediates. 15750 return Imm >= 0 && Imm <= 255; 15751 } 15752 15753 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 15754 /// *or sub* immediate, that is the target has add or sub instructions which can 15755 /// add a register with the immediate without having to materialize the 15756 /// immediate into a register. 15757 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 15758 // Same encoding for add/sub, just flip the sign. 15759 int64_t AbsImm = std::abs(Imm); 15760 if (!Subtarget->isThumb()) 15761 return ARM_AM::getSOImmVal(AbsImm) != -1; 15762 if (Subtarget->isThumb2()) 15763 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 15764 // Thumb1 only has 8-bit unsigned immediate. 
15765 return AbsImm >= 0 && AbsImm <= 255; 15766 } 15767 15768 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 15769 bool isSEXTLoad, SDValue &Base, 15770 SDValue &Offset, bool &isInc, 15771 SelectionDAG &DAG) { 15772 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15773 return false; 15774 15775 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 15776 // AddressingMode 3 15777 Base = Ptr->getOperand(0); 15778 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15779 int RHSC = (int)RHS->getZExtValue(); 15780 if (RHSC < 0 && RHSC > -256) { 15781 assert(Ptr->getOpcode() == ISD::ADD); 15782 isInc = false; 15783 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15784 return true; 15785 } 15786 } 15787 isInc = (Ptr->getOpcode() == ISD::ADD); 15788 Offset = Ptr->getOperand(1); 15789 return true; 15790 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 15791 // AddressingMode 2 15792 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15793 int RHSC = (int)RHS->getZExtValue(); 15794 if (RHSC < 0 && RHSC > -0x1000) { 15795 assert(Ptr->getOpcode() == ISD::ADD); 15796 isInc = false; 15797 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15798 Base = Ptr->getOperand(0); 15799 return true; 15800 } 15801 } 15802 15803 if (Ptr->getOpcode() == ISD::ADD) { 15804 isInc = true; 15805 ARM_AM::ShiftOpc ShOpcVal= 15806 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 15807 if (ShOpcVal != ARM_AM::no_shift) { 15808 Base = Ptr->getOperand(1); 15809 Offset = Ptr->getOperand(0); 15810 } else { 15811 Base = Ptr->getOperand(0); 15812 Offset = Ptr->getOperand(1); 15813 } 15814 return true; 15815 } 15816 15817 isInc = (Ptr->getOpcode() == ISD::ADD); 15818 Base = Ptr->getOperand(0); 15819 Offset = Ptr->getOperand(1); 15820 return true; 15821 } 15822 15823 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 
15824 return false; 15825 } 15826 15827 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 15828 bool isSEXTLoad, SDValue &Base, 15829 SDValue &Offset, bool &isInc, 15830 SelectionDAG &DAG) { 15831 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15832 return false; 15833 15834 Base = Ptr->getOperand(0); 15835 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 15836 int RHSC = (int)RHS->getZExtValue(); 15837 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 15838 assert(Ptr->getOpcode() == ISD::ADD); 15839 isInc = false; 15840 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15841 return true; 15842 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 15843 isInc = Ptr->getOpcode() == ISD::ADD; 15844 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15845 return true; 15846 } 15847 } 15848 15849 return false; 15850 } 15851 15852 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, 15853 bool isSEXTLoad, bool IsMasked, bool isLE, 15854 SDValue &Base, SDValue &Offset, 15855 bool &isInc, SelectionDAG &DAG) { 15856 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 15857 return false; 15858 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 15859 return false; 15860 15861 // We allow LE non-masked loads to change the type (for example use a vldrb.8 15862 // as opposed to a vldrw.32). This can allow extra addressing modes or 15863 // alignments for what is otherwise an equivalent instruction. 
15864 bool CanChangeType = isLE && !IsMasked; 15865 15866 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 15867 int RHSC = (int)RHS->getZExtValue(); 15868 15869 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 15870 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 15871 assert(Ptr->getOpcode() == ISD::ADD); 15872 isInc = false; 15873 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15874 return true; 15875 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 15876 isInc = Ptr->getOpcode() == ISD::ADD; 15877 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 15878 return true; 15879 } 15880 return false; 15881 }; 15882 15883 // Try to find a matching instruction based on s/zext, Alignment, Offset and 15884 // (in BE/masked) type. 15885 Base = Ptr->getOperand(0); 15886 if (VT == MVT::v4i16) { 15887 if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) 15888 return true; 15889 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 15890 if (IsInRange(RHSC, 0x80, 1)) 15891 return true; 15892 } else if (Align >= 4 && 15893 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 15894 IsInRange(RHSC, 0x80, 4)) 15895 return true; 15896 else if (Align >= 2 && 15897 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 15898 IsInRange(RHSC, 0x80, 2)) 15899 return true; 15900 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 15901 return true; 15902 return false; 15903 } 15904 15905 /// getPreIndexedAddressParts - returns true by value, base pointer and 15906 /// offset pointer and addressing mode by reference if the node's address 15907 /// can be legally represented as pre-indexed load / store address. 
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                             SDValue &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) const {
  // Thumb1 has no pre-indexed forms.
  if (Subtarget->isThumb1Only())
    return false;

  // Collect the memory operands from whichever memory node N is.
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Align = ST->getAlignment();
    IsMasked = true;
  } else
    return false;

  // Dispatch to the per-ISA helper to validate and split the address.
  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
                                        IsMasked, Subtarget->isLittle(), Base,
                                        Offset, isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                         Offset, isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
                                          Offset, isInc, DAG);
  }
  if (!isLegal)
    return false;

  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Collect the memory operands from whichever memory node N is.
  EVT VT;
  SDValue Ptr;
  unsigned Align;
  bool isSEXTLoad = false, isNonExt;
  bool IsMasked = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
  } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    Align = LD->getAlignment();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
    IsMasked = true;
  } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    Align = ST->getAlignment();
    isNonExt = !ST->isTruncatingStore();
    IsMasked = true;
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  // Dispatch to the per-ISA helper to validate and split the address.
  bool isInc;
  bool isLegal = false;
  if (VT.isVector())
    isLegal = Subtarget->hasMVEIntegerOps() &&
              getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
                                        Subtarget->isLittle(), Base, Offset,
                                        isInc, DAG);
  else {
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                         isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
  }
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

/// Compute known zero/one bits for ARM-specific nodes (and a few generic
/// intrinsics) so the generic combiner can fold around them.
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // An exclusive load zero-extends from the memory width.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    // Note: every path through the inner switch returns, so control never
    // falls through into the BFI case below.
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  case ARMISD::VGETLANEs:
  case ARMISD::VGETLANEu: {
    const SDValue &SrcSV = Op.getOperand(0);
    EVT VecVT = SrcSV.getValueType();
    assert(VecVT.isVector() && "VGETLANE expected a vector type");
    const unsigned NumSrcElts = VecVT.getVectorNumElements();
    ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
    assert(Pos->getAPIntValue().ult(NumSrcElts) &&
           "VGETLANE index out of bounds");
    unsigned Idx = Pos->getZExtValue();
    // Only the extracted lane contributes bits.
    APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
    Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

    EVT VT = Op.getValueType();
    const unsigned DstSz = VT.getScalarSizeInBits();
    const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
    (void)SrcSz;
    assert(SrcSz == Known.getBitWidth());
    assert(DstSz > SrcSz);
    // Widen the known bits with the signedness of the lane extract.
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else {
      Known = Known.zext(DstSz);
    }
    assert(DstSz == Known.getBitWidth());
    break;
  }
  }
}

/// Try to replace the constant mask of an AND with a cheaper-to-materialize
/// mask that covers the same demanded bits.
bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &DemandedAPInt,
                                                TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  // ShrunkMask keeps only demanded set bits; ExpandedMask additionally sets
  // every non-demanded bit. Any mask between the two is equivalent.
  unsigned Demanded = DemandedAPInt.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}

/// ARM-specific demanded-bits simplification; falls back to the generic
/// TargetLowering implementation for unhandled nodes.
bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  case ARMISD::ASRL:
  case ARMISD::LSRL: {
    // If this is result 0 and the other result is unused, see if the demand
    // bits allow us to shrink this long shift into a standard small shift in
    // the opposite direction.
    if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
        isa<ConstantSDNode>(Op->getOperand(2))) {
      unsigned ShAmt = Op->getConstantOperandVal(2);
      if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
                            APInt::getAllOnesValue(32) << (32 - ShAmt)))
        return TLO.CombineTo(
            Op, TLO.DAG.getNode(
                    ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
                    TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
    }
    break;
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = std::string(AsmPieces[0]);
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    // Only expand when the output/input constraints are exactly "=l,l".
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
16319 ARMTargetLowering::ConstraintType 16320 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 16321 unsigned S = Constraint.size(); 16322 if (S == 1) { 16323 switch (Constraint[0]) { 16324 default: break; 16325 case 'l': return C_RegisterClass; 16326 case 'w': return C_RegisterClass; 16327 case 'h': return C_RegisterClass; 16328 case 'x': return C_RegisterClass; 16329 case 't': return C_RegisterClass; 16330 case 'j': return C_Immediate; // Constant for movw. 16331 // An address with a single base register. Due to the way we 16332 // currently handle addresses it is the same as an 'r' memory constraint. 16333 case 'Q': return C_Memory; 16334 } 16335 } else if (S == 2) { 16336 switch (Constraint[0]) { 16337 default: break; 16338 case 'T': return C_RegisterClass; 16339 // All 'U+' constraints are addresses. 16340 case 'U': return C_Memory; 16341 } 16342 } 16343 return TargetLowering::getConstraintType(Constraint); 16344 } 16345 16346 /// Examine constraint type and operand type and determine a weight value. 16347 /// This object must already have been set up with the operand type 16348 /// and the current alternative constraint selected. 16349 TargetLowering::ConstraintWeight 16350 ARMTargetLowering::getSingleConstraintMatchWeight( 16351 AsmOperandInfo &info, const char *constraint) const { 16352 ConstraintWeight weight = CW_Invalid; 16353 Value *CallOperandVal = info.CallOperandVal; 16354 // If we don't have a value, we can't do a match, 16355 // but allow it at the lowest weight. 16356 if (!CallOperandVal) 16357 return CW_Default; 16358 Type *type = CallOperandVal->getType(); 16359 // Look at the constraint type. 
16360 switch (*constraint) { 16361 default: 16362 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 16363 break; 16364 case 'l': 16365 if (type->isIntegerTy()) { 16366 if (Subtarget->isThumb()) 16367 weight = CW_SpecificReg; 16368 else 16369 weight = CW_Register; 16370 } 16371 break; 16372 case 'w': 16373 if (type->isFloatingPointTy()) 16374 weight = CW_Register; 16375 break; 16376 } 16377 return weight; 16378 } 16379 16380 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 16381 16382 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 16383 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 16384 switch (Constraint.size()) { 16385 case 1: 16386 // GCC ARM Constraint Letters 16387 switch (Constraint[0]) { 16388 case 'l': // Low regs or general regs. 16389 if (Subtarget->isThumb()) 16390 return RCPair(0U, &ARM::tGPRRegClass); 16391 return RCPair(0U, &ARM::GPRRegClass); 16392 case 'h': // High regs or no regs. 16393 if (Subtarget->isThumb()) 16394 return RCPair(0U, &ARM::hGPRRegClass); 16395 break; 16396 case 'r': 16397 if (Subtarget->isThumb1Only()) 16398 return RCPair(0U, &ARM::tGPRRegClass); 16399 return RCPair(0U, &ARM::GPRRegClass); 16400 case 'w': 16401 if (VT == MVT::Other) 16402 break; 16403 if (VT == MVT::f32) 16404 return RCPair(0U, &ARM::SPRRegClass); 16405 if (VT.getSizeInBits() == 64) 16406 return RCPair(0U, &ARM::DPRRegClass); 16407 if (VT.getSizeInBits() == 128) 16408 return RCPair(0U, &ARM::QPRRegClass); 16409 break; 16410 case 'x': 16411 if (VT == MVT::Other) 16412 break; 16413 if (VT == MVT::f32) 16414 return RCPair(0U, &ARM::SPR_8RegClass); 16415 if (VT.getSizeInBits() == 64) 16416 return RCPair(0U, &ARM::DPR_8RegClass); 16417 if (VT.getSizeInBits() == 128) 16418 return RCPair(0U, &ARM::QPR_8RegClass); 16419 break; 16420 case 't': 16421 if (VT == MVT::Other) 16422 break; 16423 if (VT == MVT::f32 || VT == MVT::i32) 16424 return RCPair(0U, &ARM::SPRRegClass); 16425 if (VT.getSizeInBits() 
== 64) 16426 return RCPair(0U, &ARM::DPR_VFP2RegClass); 16427 if (VT.getSizeInBits() == 128) 16428 return RCPair(0U, &ARM::QPR_VFP2RegClass); 16429 break; 16430 } 16431 break; 16432 16433 case 2: 16434 if (Constraint[0] == 'T') { 16435 switch (Constraint[1]) { 16436 default: 16437 break; 16438 case 'e': 16439 return RCPair(0U, &ARM::tGPREvenRegClass); 16440 case 'o': 16441 return RCPair(0U, &ARM::tGPROddRegClass); 16442 } 16443 } 16444 break; 16445 16446 default: 16447 break; 16448 } 16449 16450 if (StringRef("{cc}").equals_lower(Constraint)) 16451 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 16452 16453 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 16454 } 16455 16456 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 16457 /// vector. If it is invalid, don't add anything to Ops. 16458 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16459 std::string &Constraint, 16460 std::vector<SDValue>&Ops, 16461 SelectionDAG &DAG) const { 16462 SDValue Result; 16463 16464 // Currently only support length 1 constraints. 16465 if (Constraint.length() != 1) return; 16466 16467 char ConstraintLetter = Constraint[0]; 16468 switch (ConstraintLetter) { 16469 default: break; 16470 case 'j': 16471 case 'I': case 'J': case 'K': case 'L': 16472 case 'M': case 'N': case 'O': 16473 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 16474 if (!C) 16475 return; 16476 16477 int64_t CVal64 = C->getSExtValue(); 16478 int CVal = (int) CVal64; 16479 // None of these constraints allow values larger than 32 bits. Check 16480 // that the value fits in an int. 16481 if (CVal != CVal64) 16482 return; 16483 16484 switch (ConstraintLetter) { 16485 case 'j': 16486 // Constant suitable for movw, must be between 0 and 16487 // 65535. 
16488 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 16489 if (CVal >= 0 && CVal <= 65535) 16490 break; 16491 return; 16492 case 'I': 16493 if (Subtarget->isThumb1Only()) { 16494 // This must be a constant between 0 and 255, for ADD 16495 // immediates. 16496 if (CVal >= 0 && CVal <= 255) 16497 break; 16498 } else if (Subtarget->isThumb2()) { 16499 // A constant that can be used as an immediate value in a 16500 // data-processing instruction. 16501 if (ARM_AM::getT2SOImmVal(CVal) != -1) 16502 break; 16503 } else { 16504 // A constant that can be used as an immediate value in a 16505 // data-processing instruction. 16506 if (ARM_AM::getSOImmVal(CVal) != -1) 16507 break; 16508 } 16509 return; 16510 16511 case 'J': 16512 if (Subtarget->isThumb1Only()) { 16513 // This must be a constant between -255 and -1, for negated ADD 16514 // immediates. This can be used in GCC with an "n" modifier that 16515 // prints the negated value, for use with SUB instructions. It is 16516 // not useful otherwise but is implemented for compatibility. 16517 if (CVal >= -255 && CVal <= -1) 16518 break; 16519 } else { 16520 // This must be a constant between -4095 and 4095. It is not clear 16521 // what this constraint is intended for. Implemented for 16522 // compatibility with GCC. 16523 if (CVal >= -4095 && CVal <= 4095) 16524 break; 16525 } 16526 return; 16527 16528 case 'K': 16529 if (Subtarget->isThumb1Only()) { 16530 // A 32-bit value where only one byte has a nonzero value. Exclude 16531 // zero to match GCC. This constraint is used by GCC internally for 16532 // constants that can be loaded with a move/shift combination. 16533 // It is not useful otherwise but is implemented for compatibility. 16534 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 16535 break; 16536 } else if (Subtarget->isThumb2()) { 16537 // A constant whose bitwise inverse can be used as an immediate 16538 // value in a data-processing instruction. 
This can be used in GCC 16539 // with a "B" modifier that prints the inverted value, for use with 16540 // BIC and MVN instructions. It is not useful otherwise but is 16541 // implemented for compatibility. 16542 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 16543 break; 16544 } else { 16545 // A constant whose bitwise inverse can be used as an immediate 16546 // value in a data-processing instruction. This can be used in GCC 16547 // with a "B" modifier that prints the inverted value, for use with 16548 // BIC and MVN instructions. It is not useful otherwise but is 16549 // implemented for compatibility. 16550 if (ARM_AM::getSOImmVal(~CVal) != -1) 16551 break; 16552 } 16553 return; 16554 16555 case 'L': 16556 if (Subtarget->isThumb1Only()) { 16557 // This must be a constant between -7 and 7, 16558 // for 3-operand ADD/SUB immediate instructions. 16559 if (CVal >= -7 && CVal < 7) 16560 break; 16561 } else if (Subtarget->isThumb2()) { 16562 // A constant whose negation can be used as an immediate value in a 16563 // data-processing instruction. This can be used in GCC with an "n" 16564 // modifier that prints the negated value, for use with SUB 16565 // instructions. It is not useful otherwise but is implemented for 16566 // compatibility. 16567 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 16568 break; 16569 } else { 16570 // A constant whose negation can be used as an immediate value in a 16571 // data-processing instruction. This can be used in GCC with an "n" 16572 // modifier that prints the negated value, for use with SUB 16573 // instructions. It is not useful otherwise but is implemented for 16574 // compatibility. 16575 if (ARM_AM::getSOImmVal(-CVal) != -1) 16576 break; 16577 } 16578 return; 16579 16580 case 'M': 16581 if (Subtarget->isThumb1Only()) { 16582 // This must be a multiple of 4 between 0 and 1020, for 16583 // ADD sp + immediate. 
16584 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 16585 break; 16586 } else { 16587 // A power of two or a constant between 0 and 32. This is used in 16588 // GCC for the shift amount on shifted register operands, but it is 16589 // useful in general for any shift amounts. 16590 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 16591 break; 16592 } 16593 return; 16594 16595 case 'N': 16596 if (Subtarget->isThumb1Only()) { 16597 // This must be a constant between 0 and 31, for shift amounts. 16598 if (CVal >= 0 && CVal <= 31) 16599 break; 16600 } 16601 return; 16602 16603 case 'O': 16604 if (Subtarget->isThumb1Only()) { 16605 // This must be a multiple of 4 between -508 and 508, for 16606 // ADD/SUB sp = sp + immediate. 16607 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 16608 break; 16609 } 16610 return; 16611 } 16612 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 16613 break; 16614 } 16615 16616 if (Result.getNode()) { 16617 Ops.push_back(Result); 16618 return; 16619 } 16620 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16621 } 16622 16623 static RTLIB::Libcall getDivRemLibcall( 16624 const SDNode *N, MVT::SimpleValueType SVT) { 16625 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 16626 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 16627 "Unhandled Opcode in getDivRemLibcall"); 16628 bool isSigned = N->getOpcode() == ISD::SDIVREM || 16629 N->getOpcode() == ISD::SREM; 16630 RTLIB::Libcall LC; 16631 switch (SVT) { 16632 default: llvm_unreachable("Unexpected request for libcall!"); 16633 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 16634 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 16635 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 16636 case MVT::i64: LC = isSigned ? 
                                 RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}

/// Build the argument list for a {s,u}divrem/{s,u}rem runtime call from the
/// operands of \p N. Sign/zero-extension attributes on each argument follow
/// the signedness of the opcode (SDIVREM/SREM are signed).
/// NOTE(review): on Windows the first two arguments are swapped — the Windows
/// RT divide helpers appear to take (divisor, dividend); confirm against the
/// Windows RT ABI before relying on this elsewhere.
static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  // One call argument per operand of the node, each carrying the matching
  // extension attribute for the libcall's calling convention.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}

/// Lower ISD::SDIVREM / ISD::UDIVREM either to hardware divide + MLS, or to
/// the AEABI/Windows divrem runtime helper that returns {div, rem}.
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //     div = a / b
  //     rem = a - b * div
  //     return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  // No usable hardware divide: fall back to the divrem runtime helper.
  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // The helper returns both results packed as a two-element struct {div, rem}.
  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}

// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:   RetTyElement =
                  Type::getInt8Ty(*DAG.getContext()); break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  // The divmod helper returns {div, rem}: a struct of two identical elements.
  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                       SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // On Windows, chain in the denominator check helper before the call.
  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}

/// Windows-only lowering of dynamic stack allocation: either a simple SP
/// adjustment (when stack probing is disabled via "no-stack-arg-probe"), or a
/// call to the chkstk machinery via the ARMISD::WIN__CHKSTK node.
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    // No probing requested: just bump SP down by Size and re-align if asked.
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  // The allocation size is passed in R4 as a count of 4-byte words (Size >> 2).
  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  // chkstk has adjusted SP; read the new value back out.
  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}

/// Custom-lower FP_EXTEND / STRICT_FP_EXTEND for subtargets missing some of
/// FP16/FP64 support: widen step-by-step (f16->f32->f64), using hardware
/// conversions where available and libcalls otherwise.
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ?
                              1 : 0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  // Converting from 32 -> 64 is valid if we have FP64.
  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
    // FIXME: Remove this when we have strict fp instruction selection patterns
    if (IsStrict) {
      SDLoc Loc(Op);
      SDValue Result = DAG.getNode(ISD::FP_EXTEND,
                                   Loc, Op.getValueType(), SrcVal);
      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
    }
    return Op;
  }

  // Either we are converting from 16 -> 64, without FP16 and/or
  // FP.double-precision or without Armv8-fp. So we must do it in two
  // steps.
  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
  // without FP16. So we must do a function call.
  SDLoc Loc(Op);
  RTLIB::Libcall LC;
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  // Widen one step per iteration (16->32, then 32->64 as needed), keeping the
  // strict-FP chain threaded through both hardware nodes and libcalls.
  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
    if (Supported) {
      if (IsStrict) {
        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
                             {DstVT, MVT::Other}, {Chain, SrcVal});
        Chain = SrcVal.getValue(1);
      } else {
        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
      }
    } else {
      LC = RTLIB::getFPEXT(SrcVT, DstVT);
      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                            Loc, Chain);
    }
  }

  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
}

/// Custom-lower FP_ROUND / STRICT_FP_ROUND: use the hardware 32->16
/// conversion when FP16 is available, otherwise emit the truncation libcall.
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = SrcVT.getSizeInBits();
  (void)DstSz;
  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");

  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // Instruction from 32 -> 16 if hasFP16 is valid
  if (SrcSz == 32 && Subtarget->hasFP16())
    return Op;

  // Lib call from 32 -> 16 / 64 -> [32, 16]
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  MakeLibCallOptions CallOptions;
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Result;
  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
                                        Loc, Chain);
  return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
}

/// Expand a 64-bit ABS into 32-bit halves using the identity
///   abs(x) = (x + sign) ^ sign, where sign = x >> 63,
/// implemented with UADDO/ADDCARRY across the two i32 halves.
/// Pushes {Lo, Hi} of the result into \p Results; pushes nothing if the
/// required carry operations are not legal/custom for i32.
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);
  SDValue Hi, Lo, Tmp;

  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
    return ;

  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(0, dl, HalfT));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                   DAG.getConstant(1, dl, HalfT));

  // Tmp = all-ones if negative, all-zeros otherwise (arithmetic shift of Hi).
  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
                    DAG.getConstant(OpTypeBits - 1, dl,
                                    getShiftAmountTy(HalfT, DAG.getDataLayout())));
  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
                   SDValue(Lo.getNode(), 1));
  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);

  Results.push_back(Lo);
  Results.push_back(Hi);
}

bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

/// Return true if ~v is a shifted run of ones, i.e. v is a mask with 0s in
/// the "inside" and (possibly) 1s on either outside — the form BFI/BFC can
/// invert. All-ones is excluded.
bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // there can be 1's on either or both "outsides", all the "inside"
  // bits must be 0's
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  // VMOV-immediate encodings require at least VFPv3.
  if (!Subtarget->hasVFP3Base())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && Subtarget->hasFP64())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // These intrinsics carry their alignment as the trailing argument.
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // Unlike the plain vldN intrinsics, vld1xN takes the pointer last and has
    // no explicit alignment operand.
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    // Sum the sizes of the leading vector arguments (args after the pointer,
    // up to the first non-vector argument).
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vld2q:
  case Intrinsic::arm_mve_vld4q: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    Type *VecTy = cast<CompositeType>(I.getType())->getTypeAtIndex(1);
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile loads with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vst2q:
  case Intrinsic::arm_mve_vst4q: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    Type *VecTy = I.getArgOperand(1)->getType();
    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
    // volatile stores with MVE intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    // Exclusive loads: memVT comes from the pointee type of the address arg.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    // Exclusive stores: operand 0 is the value, operand 1 the address.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  // Only integers of 1..32 bits are considered cheap to rematerialize.
  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

/// Extracting a subvector is considered cheap only at the two ends of the
/// source vector (index 0 or the upper half).
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

/// Emit a data memory barrier for \p Domain, falling back to the ARMv6
/// CP15 MCR form on subtargets without the DMB instruction.
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      // mcr p15, 0, <Rd>, c7, c10, 5 — the ARMv6 CP15 barrier encoding.
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

// Counterpart of emitLeadingFence: barrier emitted after the atomic access
// (same C++0x mapping reference as above).
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // FP atomics have no LL/SC form; go through a cmpxchg loop instead.
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

/// Declare the MSVC CRT stack-protector symbols on Windows/MSVC targets;
/// defer to the generic implementation everywhere else.
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    // The cookie argument is passed in a register.
    F->addAttribute(1, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

/// Return true (with \p Cost set to 0) when a store of an extracted vector
/// element can be folded into the store itself: requires NEON, an integer
/// vector, a constant lane index, and a vector that fits a D or Q register.
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  // CTTZ lowers to RBIT+CLZ, which requires v6T2.
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

/// Emit an ldrex/ldaex (acquire if \p Ord is acquire-or-stronger) load of
/// \p Addr's pointee. 64-bit loads use ldrexd/ldaexd, which return
/// {i32, i32}; the two halves are recombined into a single i64.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // Big-endian targets return the halves in the opposite order.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ?
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 17388 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 17389 17390 return Builder.CreateTruncOrBitCast( 17391 Builder.CreateCall(Ldrex, Addr), 17392 cast<PointerType>(Addr->getType())->getElementType()); 17393 } 17394 17395 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 17396 IRBuilder<> &Builder) const { 17397 if (!Subtarget->hasV7Ops()) 17398 return; 17399 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 17400 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 17401 } 17402 17403 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 17404 Value *Addr, 17405 AtomicOrdering Ord) const { 17406 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 17407 bool IsRelease = isReleaseOrStronger(Ord); 17408 17409 // Since the intrinsics must have legal type, the i64 intrinsics take two 17410 // parameters: "i32, i32". We must marshal Val into the appropriate form 17411 // before the call. 17412 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 17413 Intrinsic::ID Int = 17414 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 17415 Function *Strex = Intrinsic::getDeclaration(M, Int); 17416 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 17417 17418 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 17419 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 17420 if (!Subtarget->isLittle()) 17421 std::swap(Lo, Hi); 17422 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 17423 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 17424 } 17425 17426 Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; 17427 Type *Tys[] = { Addr->getType() }; 17428 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 17429 17430 return Builder.CreateCall( 17431 Strex, {Builder.CreateZExtOrBitCast( 17432 Val, Strex->getFunctionType()->getParamType(0)), 17433 Addr}); 17434 } 17435 17436 17437 bool ARMTargetLowering::alignLoopsWithOptSize() const { 17438 return Subtarget->isMClass(); 17439 } 17440 17441 /// A helper function for determining the number of interleaved accesses we 17442 /// will generate when lowering accesses of the given type. 17443 unsigned 17444 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 17445 const DataLayout &DL) const { 17446 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 17447 } 17448 17449 bool ARMTargetLowering::isLegalInterleavedAccessType( 17450 unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { 17451 17452 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 17453 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 17454 17455 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) 17456 return false; 17457 17458 // Ensure the vector doesn't have f16 elements. Even though we could do an 17459 // i16 vldN, we can't hold the f16 vectors and will end up converting via 17460 // f32. 17461 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) 17462 return false; 17463 if (Subtarget->hasMVEIntegerOps() && Factor == 3) 17464 return false; 17465 17466 // Ensure the number of vector elements is greater than 1. 17467 if (VecTy->getNumElements() < 2) 17468 return false; 17469 17470 // Ensure the element type is legal. 17471 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 17472 return false; 17473 17474 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 17475 // 128 will be split into multiple interleaved accesses. 
  // A single D register (64 bits) is only usable with NEON; otherwise the
  // size must divide evenly into 128-bit Q-register chunks.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

/// Maximum interleave factor the target can lower directly: NEON has
/// vld2/vld3/vld4, MVE supports the factors in
/// MVEMaxSupportedInterleaveFactor, and everything else falls back to the
/// generic default.
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  // Build one vldN call at the given address. NEON vldN takes an i8* plus an
  // explicit alignment operand; MVE vld2q/vld4q take a typed element pointer
  // and no alignment operand.
  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
          LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g.
/// Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  // Build the vstN call(s) for one legal-sized chunk. NEON vstN takes
  // (i8* addr, lanes..., align); MVE vst2q/vst4q take a typed element pointer
  // and are called once per factor with a trailing lane-index operand.
  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      for (auto S : Shuffles)
        Ops.push_back(S);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      for (auto S : Shuffles)
        Ops.push_back(S);
      // One call per field, distinguished by the trailing lane index.
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        // Defined first lane: take LaneLen sequential elements starting there.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        // First lane is undef: derive the start from the first defined lane
        // of this field, if any.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

// Base element classification for AAPCS-VFP homogeneous aggregates: all
// members must share one of these base types.
enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

/// Recursively determine whether \p Ty is a homogeneous aggregate: a (possibly
/// nested) struct/array whose leaves are all float, all double, all 64-bit
/// vectors, or all 128-bit vectors, with at most 4 members in total.
/// \p Base accumulates the leaf base type across the recursion; \p Members
/// accumulates the leaf count.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    // A struct is homogeneous if every field is, with a consistent base type.
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    // An array contributes NumElements copies of its element's member count.
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    // Vectors are classified purely by bit width (64 -> D, 128 -> Q).
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  // AAPCS limits homogeneous aggregates to between 1 and 4 members.
  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
/// NOTE(review): DataLayout is taken by value here, presumably to match the
/// virtual base signature -- confirm before "fixing" to a const reference.
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                       DataLayout DL) const {
  const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  // Only the AAPCS-VFP (hard-float) convention uses consecutive registers
  // for these types.
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

/// Record in ARMFunctionInfo that this function uses split callee-saved
/// register handling (CXX_FAST_TLS-style conventions).
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

/// For each callee-saved register handled "via copy", insert a COPY into a
/// fresh virtual register at the function entry and a COPY back into the
/// physical register before the terminator of every exit block.
void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  // IStart is a null-terminated list of physical registers.
  for (const MCPhysReg *I = IStart; *I; ++I) {
    // Pick a register class: only GPRs and DPRs are expected here.
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

/// Target hook run after instruction selection: fix the maximum call frame
/// size before the generic finalization runs.
void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}