1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the interfaces that ARM uses to lower LLVM code into a 11 // selection DAG. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "ARMISelLowering.h" 16 #include "ARMBaseInstrInfo.h" 17 #include "ARMBaseRegisterInfo.h" 18 #include "ARMCallingConv.h" 19 #include "ARMConstantPoolValue.h" 20 #include "ARMMachineFunctionInfo.h" 21 #include "ARMPerfectShuffle.h" 22 #include "ARMRegisterInfo.h" 23 #include "ARMSelectionDAGInfo.h" 24 #include "ARMSubtarget.h" 25 #include "MCTargetDesc/ARMAddressingModes.h" 26 #include "MCTargetDesc/ARMBaseInfo.h" 27 #include "Utils/ARMBaseInfo.h" 28 #include "llvm/ADT/APFloat.h" 29 #include "llvm/ADT/APInt.h" 30 #include "llvm/ADT/ArrayRef.h" 31 #include "llvm/ADT/BitVector.h" 32 #include "llvm/ADT/DenseMap.h" 33 #include "llvm/ADT/STLExtras.h" 34 #include "llvm/ADT/SmallPtrSet.h" 35 #include "llvm/ADT/SmallVector.h" 36 #include "llvm/ADT/Statistic.h" 37 #include "llvm/ADT/StringExtras.h" 38 #include "llvm/ADT/StringRef.h" 39 #include "llvm/ADT/StringSwitch.h" 40 #include "llvm/ADT/Triple.h" 41 #include "llvm/ADT/Twine.h" 42 #include "llvm/Analysis/VectorUtils.h" 43 #include "llvm/CodeGen/CallingConvLower.h" 44 #include "llvm/CodeGen/ISDOpcodes.h" 45 #include "llvm/CodeGen/IntrinsicLowering.h" 46 #include "llvm/CodeGen/MachineBasicBlock.h" 47 #include "llvm/CodeGen/MachineConstantPool.h" 48 #include "llvm/CodeGen/MachineFrameInfo.h" 49 #include "llvm/CodeGen/MachineFunction.h" 50 #include "llvm/CodeGen/MachineInstr.h" 51 #include "llvm/CodeGen/MachineInstrBuilder.h" 52 #include "llvm/CodeGen/MachineJumpTableInfo.h" 53 #include 
"llvm/CodeGen/MachineMemOperand.h" 54 #include "llvm/CodeGen/MachineOperand.h" 55 #include "llvm/CodeGen/MachineRegisterInfo.h" 56 #include "llvm/CodeGen/RuntimeLibcalls.h" 57 #include "llvm/CodeGen/SelectionDAG.h" 58 #include "llvm/CodeGen/SelectionDAGNodes.h" 59 #include "llvm/CodeGen/TargetInstrInfo.h" 60 #include "llvm/CodeGen/TargetLowering.h" 61 #include "llvm/CodeGen/TargetOpcodes.h" 62 #include "llvm/CodeGen/TargetRegisterInfo.h" 63 #include "llvm/CodeGen/TargetSubtargetInfo.h" 64 #include "llvm/CodeGen/ValueTypes.h" 65 #include "llvm/IR/Attributes.h" 66 #include "llvm/IR/CallingConv.h" 67 #include "llvm/IR/Constant.h" 68 #include "llvm/IR/Constants.h" 69 #include "llvm/IR/DataLayout.h" 70 #include "llvm/IR/DebugLoc.h" 71 #include "llvm/IR/DerivedTypes.h" 72 #include "llvm/IR/Function.h" 73 #include "llvm/IR/GlobalAlias.h" 74 #include "llvm/IR/GlobalValue.h" 75 #include "llvm/IR/GlobalVariable.h" 76 #include "llvm/IR/IRBuilder.h" 77 #include "llvm/IR/InlineAsm.h" 78 #include "llvm/IR/Instruction.h" 79 #include "llvm/IR/Instructions.h" 80 #include "llvm/IR/IntrinsicInst.h" 81 #include "llvm/IR/Intrinsics.h" 82 #include "llvm/IR/Module.h" 83 #include "llvm/IR/Type.h" 84 #include "llvm/IR/User.h" 85 #include "llvm/IR/Value.h" 86 #include "llvm/MC/MCInstrDesc.h" 87 #include "llvm/MC/MCInstrItineraries.h" 88 #include "llvm/MC/MCRegisterInfo.h" 89 #include "llvm/MC/MCSchedule.h" 90 #include "llvm/Support/AtomicOrdering.h" 91 #include "llvm/Support/BranchProbability.h" 92 #include "llvm/Support/Casting.h" 93 #include "llvm/Support/CodeGen.h" 94 #include "llvm/Support/CommandLine.h" 95 #include "llvm/Support/Compiler.h" 96 #include "llvm/Support/Debug.h" 97 #include "llvm/Support/ErrorHandling.h" 98 #include "llvm/Support/KnownBits.h" 99 #include "llvm/Support/MachineValueType.h" 100 #include "llvm/Support/MathExtras.h" 101 #include "llvm/Support/raw_ostream.h" 102 #include "llvm/Target/TargetMachine.h" 103 #include "llvm/Target/TargetOptions.h" 104 #include 
<algorithm> 105 #include <cassert> 106 #include <cstdint> 107 #include <cstdlib> 108 #include <iterator> 109 #include <limits> 110 #include <string> 111 #include <tuple> 112 #include <utility> 113 #include <vector> 114 115 using namespace llvm; 116 117 #define DEBUG_TYPE "arm-isel" 118 119 STATISTIC(NumTailCalls, "Number of tail calls"); 120 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 121 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 122 STATISTIC(NumConstpoolPromoted, 123 "Number of constants with their storage promoted into constant pools"); 124 125 static cl::opt<bool> 126 ARMInterworking("arm-interworking", cl::Hidden, 127 cl::desc("Enable / disable ARM interworking (for debugging only)"), 128 cl::init(true)); 129 130 static cl::opt<bool> EnableConstpoolPromotion( 131 "arm-promote-constant", cl::Hidden, 132 cl::desc("Enable / disable promotion of unnamed_addr constants into " 133 "constant pools"), 134 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 135 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 136 "arm-promote-constant-max-size", cl::Hidden, 137 cl::desc("Maximum size of constant to promote into a constant pool"), 138 cl::init(64)); 139 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 140 "arm-promote-constant-max-total", cl::Hidden, 141 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 142 cl::init(128)); 143 144 // The APCS parameter registers. 
145 static const MCPhysReg GPRArgRegs[] = { 146 ARM::R0, ARM::R1, ARM::R2, ARM::R3 147 }; 148 149 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, 150 MVT PromotedBitwiseVT) { 151 if (VT != PromotedLdStVT) { 152 setOperationAction(ISD::LOAD, VT, Promote); 153 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 154 155 setOperationAction(ISD::STORE, VT, Promote); 156 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 157 } 158 159 MVT ElemTy = VT.getVectorElementType(); 160 if (ElemTy != MVT::f64) 161 setOperationAction(ISD::SETCC, VT, Custom); 162 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 163 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 164 if (ElemTy == MVT::i32) { 165 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 166 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 167 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 168 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 169 } else { 170 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 171 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 172 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 173 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 174 } 175 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 176 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 177 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 178 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 179 setOperationAction(ISD::SELECT, VT, Expand); 180 setOperationAction(ISD::SELECT_CC, VT, Expand); 181 setOperationAction(ISD::VSELECT, VT, Expand); 182 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 183 if (VT.isInteger()) { 184 setOperationAction(ISD::SHL, VT, Custom); 185 setOperationAction(ISD::SRA, VT, Custom); 186 setOperationAction(ISD::SRL, VT, Custom); 187 } 188 189 // Promote all bit-wise operations. 
190 if (VT.isInteger() && VT != PromotedBitwiseVT) { 191 setOperationAction(ISD::AND, VT, Promote); 192 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); 193 setOperationAction(ISD::OR, VT, Promote); 194 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); 195 setOperationAction(ISD::XOR, VT, Promote); 196 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); 197 } 198 199 // Neon does not support vector divide/remainder operations. 200 setOperationAction(ISD::SDIV, VT, Expand); 201 setOperationAction(ISD::UDIV, VT, Expand); 202 setOperationAction(ISD::FDIV, VT, Expand); 203 setOperationAction(ISD::SREM, VT, Expand); 204 setOperationAction(ISD::UREM, VT, Expand); 205 setOperationAction(ISD::FREM, VT, Expand); 206 207 if (!VT.isFloatingPoint() && 208 VT != MVT::v2i64 && VT != MVT::v1i64) 209 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 210 setOperationAction(Opcode, VT, Legal); 211 } 212 213 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 214 addRegisterClass(VT, &ARM::DPRRegClass); 215 addTypeForNEON(VT, MVT::f64, MVT::v2i32); 216 } 217 218 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 219 addRegisterClass(VT, &ARM::DPairRegClass); 220 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); 221 } 222 223 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 224 const ARMSubtarget &STI) 225 : TargetLowering(TM), Subtarget(&STI) { 226 RegInfo = Subtarget->getRegisterInfo(); 227 Itins = Subtarget->getInstrItineraryData(); 228 229 setBooleanContents(ZeroOrOneBooleanContent); 230 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 231 232 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 233 !Subtarget->isTargetWatchOS()) { 234 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 235 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 236 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 237 IsHFTarget ? 
CallingConv::ARM_AAPCS_VFP 238 : CallingConv::ARM_AAPCS); 239 } 240 241 if (Subtarget->isTargetMachO()) { 242 // Uses VFP for Thumb libfuncs if available. 243 if (Subtarget->isThumb() && Subtarget->hasVFP2() && 244 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 245 static const struct { 246 const RTLIB::Libcall Op; 247 const char * const Name; 248 const ISD::CondCode Cond; 249 } LibraryCalls[] = { 250 // Single-precision floating-point arithmetic. 251 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 252 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 253 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 254 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 255 256 // Double-precision floating-point arithmetic. 257 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 258 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 259 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 260 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 261 262 // Single-precision comparisons. 263 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 264 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 265 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 266 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 267 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 268 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 269 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 270 { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, 271 272 // Double-precision comparisons. 273 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 274 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 275 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 276 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 277 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 278 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 279 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 280 { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, 281 282 // Floating-point to integer conversions. 
283 // i64 conversions are done via library routines even when generating VFP 284 // instructions, so use the same ones. 285 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 286 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 287 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 288 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 289 290 // Conversions between floating types. 291 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 292 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 293 294 // Integer to floating-point conversions. 295 // i64 conversions are done via library routines even when generating VFP 296 // instructions, so use the same ones. 297 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 298 // e.g., __floatunsidf vs. __floatunssidfvfp. 299 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 300 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 301 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 302 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 303 }; 304 305 for (const auto &LC : LibraryCalls) { 306 setLibcallName(LC.Op, LC.Name); 307 if (LC.Cond != ISD::SETCC_INVALID) 308 setCmpLibcallCC(LC.Op, LC.Cond); 309 } 310 } 311 } 312 313 // These libcalls are not available in 32-bit. 
314 setLibcallName(RTLIB::SHL_I128, nullptr); 315 setLibcallName(RTLIB::SRL_I128, nullptr); 316 setLibcallName(RTLIB::SRA_I128, nullptr); 317 318 // RTLIB 319 if (Subtarget->isAAPCS_ABI() && 320 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 321 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 322 static const struct { 323 const RTLIB::Libcall Op; 324 const char * const Name; 325 const CallingConv::ID CC; 326 const ISD::CondCode Cond; 327 } LibraryCalls[] = { 328 // Double-precision floating-point arithmetic helper functions 329 // RTABI chapter 4.1.2, Table 2 330 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 331 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 332 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 333 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 334 335 // Double-precision floating-point comparison helper functions 336 // RTABI chapter 4.1.2, Table 3 337 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 338 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 339 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 340 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 341 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 342 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 343 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 344 { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, 345 346 // Single-precision floating-point arithmetic helper functions 347 // RTABI chapter 4.1.2, Table 4 348 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 349 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 350 { RTLIB::MUL_F32, "__aeabi_fmul", 
CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 351 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 352 353 // Single-precision floating-point comparison helper functions 354 // RTABI chapter 4.1.2, Table 5 355 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 356 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 357 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 358 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 359 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 360 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 361 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 362 { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, 363 364 // Floating-point to integer conversions. 365 // RTABI chapter 4.1.2, Table 6 366 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 367 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 368 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 369 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 370 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 371 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 372 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 373 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 374 375 // Conversions between floating types. 
376 // RTABI chapter 4.1.2, Table 7 377 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 378 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 379 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 380 381 // Integer to floating-point conversions. 382 // RTABI chapter 4.1.2, Table 8 383 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 384 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 385 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 386 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 387 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 388 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 389 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 390 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 391 392 // Long long helper functions 393 // RTABI chapter 4.2, Table 9 394 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 395 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 396 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 397 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 398 399 // Integer division functions 400 // RTABI chapter 4.3.1 401 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 402 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 403 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 404 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 405 { RTLIB::UDIV_I8, "__aeabi_uidiv", 
CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 406 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 407 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 408 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 409 }; 410 411 for (const auto &LC : LibraryCalls) { 412 setLibcallName(LC.Op, LC.Name); 413 setLibcallCallingConv(LC.Op, LC.CC); 414 if (LC.Cond != ISD::SETCC_INVALID) 415 setCmpLibcallCC(LC.Op, LC.Cond); 416 } 417 418 // EABI dependent RTLIB 419 if (TM.Options.EABIVersion == EABI::EABI4 || 420 TM.Options.EABIVersion == EABI::EABI5) { 421 static const struct { 422 const RTLIB::Libcall Op; 423 const char *const Name; 424 const CallingConv::ID CC; 425 const ISD::CondCode Cond; 426 } MemOpsLibraryCalls[] = { 427 // Memory operations 428 // RTABI chapter 4.3.4 429 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 430 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 431 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 432 }; 433 434 for (const auto &LC : MemOpsLibraryCalls) { 435 setLibcallName(LC.Op, LC.Name); 436 setLibcallCallingConv(LC.Op, LC.CC); 437 if (LC.Cond != ISD::SETCC_INVALID) 438 setCmpLibcallCC(LC.Op, LC.Cond); 439 } 440 } 441 } 442 443 if (Subtarget->isTargetWindows()) { 444 static const struct { 445 const RTLIB::Libcall Op; 446 const char * const Name; 447 const CallingConv::ID CC; 448 } LibraryCalls[] = { 449 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 450 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 451 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 452 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 453 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 454 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 455 { 
RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 456 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 457 }; 458 459 for (const auto &LC : LibraryCalls) { 460 setLibcallName(LC.Op, LC.Name); 461 setLibcallCallingConv(LC.Op, LC.CC); 462 } 463 } 464 465 // Use divmod compiler-rt calls for iOS 5.0 and later. 466 if (Subtarget->isTargetMachO() && 467 !(Subtarget->isTargetIOS() && 468 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 469 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 470 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 471 } 472 473 // The half <-> float conversion functions are always soft-float on 474 // non-watchos platforms, but are needed for some targets which use a 475 // hard-float calling convention by default. 476 if (!Subtarget->isTargetWatchABI()) { 477 if (Subtarget->isAAPCS_ABI()) { 478 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 479 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 480 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 481 } else { 482 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 483 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 484 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 485 } 486 } 487 488 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 489 // a __gnu_ prefix (which is the default). 
490 if (Subtarget->isTargetAEABI()) { 491 static const struct { 492 const RTLIB::Libcall Op; 493 const char * const Name; 494 const CallingConv::ID CC; 495 } LibraryCalls[] = { 496 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 497 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 498 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 499 }; 500 501 for (const auto &LC : LibraryCalls) { 502 setLibcallName(LC.Op, LC.Name); 503 setLibcallCallingConv(LC.Op, LC.CC); 504 } 505 } 506 507 if (Subtarget->isThumb1Only()) 508 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 509 else 510 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 511 512 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 513 !Subtarget->isThumb1Only()) { 514 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 515 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 516 } 517 518 if (Subtarget->hasFullFP16()) { 519 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 520 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 521 setOperationAction(ISD::BITCAST, MVT::i32, Custom); 522 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 523 524 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 525 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 526 } 527 528 for (MVT VT : MVT::vector_valuetypes()) { 529 for (MVT InnerVT : MVT::vector_valuetypes()) { 530 setTruncStoreAction(VT, InnerVT, Expand); 531 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 532 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 533 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 534 } 535 536 setOperationAction(ISD::MULHS, VT, Expand); 537 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 538 setOperationAction(ISD::MULHU, VT, Expand); 539 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 540 541 setOperationAction(ISD::BSWAP, VT, Expand); 542 } 543 544 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 545 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 546 547 
setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 548 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 549 550 if (Subtarget->hasNEON()) { 551 addDRTypeForNEON(MVT::v2f32); 552 addDRTypeForNEON(MVT::v8i8); 553 addDRTypeForNEON(MVT::v4i16); 554 addDRTypeForNEON(MVT::v2i32); 555 addDRTypeForNEON(MVT::v1i64); 556 557 addQRTypeForNEON(MVT::v4f32); 558 addQRTypeForNEON(MVT::v2f64); 559 addQRTypeForNEON(MVT::v16i8); 560 addQRTypeForNEON(MVT::v8i16); 561 addQRTypeForNEON(MVT::v4i32); 562 addQRTypeForNEON(MVT::v2i64); 563 564 if (Subtarget->hasFullFP16()) { 565 addQRTypeForNEON(MVT::v8f16); 566 addDRTypeForNEON(MVT::v4f16); 567 } 568 569 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 570 // neither Neon nor VFP support any arithmetic operations on it. 571 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 572 // supported for v4f32. 573 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 574 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 575 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 576 // FIXME: Code duplication: FDIV and FREM are expanded always, see 577 // ARMTargetLowering::addTypeForNEON method for details. 578 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 579 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 580 // FIXME: Create unittest. 581 // In another words, find a way when "copysign" appears in DAG with vector 582 // operands. 583 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 584 // FIXME: Code duplication: SETCC has custom operation action, see 585 // ARMTargetLowering::addTypeForNEON method for details. 586 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 587 // FIXME: Create unittest for FNEG and for FABS. 
588 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 589 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 590 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 591 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 592 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 593 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 594 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 595 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 596 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 597 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 598 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 599 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 600 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 601 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 602 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 603 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 604 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 605 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 606 607 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 608 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 609 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 610 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 611 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 612 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 613 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 614 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 615 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 616 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 617 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 618 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 619 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 620 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 621 622 // Mark v2f32 intrinsics. 
623 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 624 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 625 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 626 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 627 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 628 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 629 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 630 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 631 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 632 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 633 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 634 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 635 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 636 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 637 638 // Neon does not support some operations on v1i64 and v2i64 types. 639 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 640 // Custom handling for some quad-vector types to detect VMULL. 641 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 642 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 643 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 644 // Custom handling for some vector types to avoid expensive expansions 645 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 646 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 647 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 648 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 649 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 650 // a destination type that is wider than the source, and nor does 651 // it have a FP_TO_[SU]INT instruction with a narrower destination than 652 // source. 
653 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 654 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 655 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 656 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 657 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 658 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 659 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 660 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 661 662 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 663 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 664 665 // NEON does not have single instruction CTPOP for vectors with element 666 // types wider than 8-bits. However, custom lowering can leverage the 667 // v8i8/v16i8 vcnt instruction. 668 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 669 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 670 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 671 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 672 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 673 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 674 675 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 676 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 677 678 // NEON does not have single instruction CTTZ for vectors. 
679 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 680 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 681 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 682 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 683 684 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 685 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 686 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 687 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 688 689 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 690 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 691 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 692 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 693 694 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 695 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 696 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 697 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 698 699 // NEON only has FMA instructions as of VFP4. 700 if (!Subtarget->hasVFP4()) { 701 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 702 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 703 } 704 705 setTargetDAGCombine(ISD::INTRINSIC_VOID); 706 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 707 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 708 setTargetDAGCombine(ISD::SHL); 709 setTargetDAGCombine(ISD::SRL); 710 setTargetDAGCombine(ISD::SRA); 711 setTargetDAGCombine(ISD::SIGN_EXTEND); 712 setTargetDAGCombine(ISD::ZERO_EXTEND); 713 setTargetDAGCombine(ISD::ANY_EXTEND); 714 setTargetDAGCombine(ISD::BUILD_VECTOR); 715 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 716 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 717 setTargetDAGCombine(ISD::STORE); 718 setTargetDAGCombine(ISD::FP_TO_SINT); 719 setTargetDAGCombine(ISD::FP_TO_UINT); 720 setTargetDAGCombine(ISD::FDIV); 721 setTargetDAGCombine(ISD::LOAD); 722 723 // It is legal to extload from v4i8 to v4i16 or v4i32. 
724 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 725 MVT::v2i32}) { 726 for (MVT VT : MVT::integer_vector_valuetypes()) { 727 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 728 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 729 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 730 } 731 } 732 } 733 734 if (Subtarget->isFPOnlySP()) { 735 // When targeting a floating-point unit with only single-precision 736 // operations, f64 is legal for the few double-precision instructions which 737 // are present However, no double-precision operations other than moves, 738 // loads and stores are provided by the hardware. 739 setOperationAction(ISD::FADD, MVT::f64, Expand); 740 setOperationAction(ISD::FSUB, MVT::f64, Expand); 741 setOperationAction(ISD::FMUL, MVT::f64, Expand); 742 setOperationAction(ISD::FMA, MVT::f64, Expand); 743 setOperationAction(ISD::FDIV, MVT::f64, Expand); 744 setOperationAction(ISD::FREM, MVT::f64, Expand); 745 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 746 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 747 setOperationAction(ISD::FNEG, MVT::f64, Expand); 748 setOperationAction(ISD::FABS, MVT::f64, Expand); 749 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 750 setOperationAction(ISD::FSIN, MVT::f64, Expand); 751 setOperationAction(ISD::FCOS, MVT::f64, Expand); 752 setOperationAction(ISD::FPOW, MVT::f64, Expand); 753 setOperationAction(ISD::FLOG, MVT::f64, Expand); 754 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 755 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 756 setOperationAction(ISD::FEXP, MVT::f64, Expand); 757 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 758 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 759 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 760 setOperationAction(ISD::FRINT, MVT::f64, Expand); 761 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 762 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 763 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 
764 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 765 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 766 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 767 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 768 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 769 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 770 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 771 } 772 773 computeRegisterProperties(Subtarget->getRegisterInfo()); 774 775 // ARM does not have floating-point extending loads. 776 for (MVT VT : MVT::fp_valuetypes()) { 777 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 778 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 779 } 780 781 // ... or truncating stores 782 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 783 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 784 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 785 786 // ARM does not have i1 sign extending load. 787 for (MVT VT : MVT::integer_valuetypes()) 788 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 789 790 // ARM supports all 4 flavors of integer indexed load / store. 791 if (!Subtarget->isThumb1Only()) { 792 for (unsigned im = (unsigned)ISD::PRE_INC; 793 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 794 setIndexedLoadAction(im, MVT::i1, Legal); 795 setIndexedLoadAction(im, MVT::i8, Legal); 796 setIndexedLoadAction(im, MVT::i16, Legal); 797 setIndexedLoadAction(im, MVT::i32, Legal); 798 setIndexedStoreAction(im, MVT::i1, Legal); 799 setIndexedStoreAction(im, MVT::i8, Legal); 800 setIndexedStoreAction(im, MVT::i16, Legal); 801 setIndexedStoreAction(im, MVT::i32, Legal); 802 } 803 } else { 804 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 
805 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 806 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 807 } 808 809 setOperationAction(ISD::SADDO, MVT::i32, Custom); 810 setOperationAction(ISD::UADDO, MVT::i32, Custom); 811 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 812 setOperationAction(ISD::USUBO, MVT::i32, Custom); 813 814 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 815 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 816 817 // i64 operation support. 818 setOperationAction(ISD::MUL, MVT::i64, Expand); 819 setOperationAction(ISD::MULHU, MVT::i32, Expand); 820 if (Subtarget->isThumb1Only()) { 821 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 822 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 823 } 824 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 825 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 826 setOperationAction(ISD::MULHS, MVT::i32, Expand); 827 828 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 829 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 830 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 831 setOperationAction(ISD::SRL, MVT::i64, Custom); 832 setOperationAction(ISD::SRA, MVT::i64, Custom); 833 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 834 835 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 836 if (Subtarget->isThumb1Only()) { 837 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 838 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 839 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 840 } 841 842 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 843 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 844 845 // ARM does not have ROTL. 
846 setOperationAction(ISD::ROTL, MVT::i32, Expand); 847 for (MVT VT : MVT::vector_valuetypes()) { 848 setOperationAction(ISD::ROTL, VT, Expand); 849 setOperationAction(ISD::ROTR, VT, Expand); 850 } 851 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 852 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 853 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 854 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 855 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 856 } 857 858 // @llvm.readcyclecounter requires the Performance Monitors extension. 859 // Default to the 0 expansion on unsupported platforms. 860 // FIXME: Technically there are older ARM CPUs that have 861 // implementation-specific ways of obtaining this information. 862 if (Subtarget->hasPerfMon()) 863 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 864 865 // Only ARMv6 has BSWAP. 866 if (!Subtarget->hasV6Ops()) 867 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 868 869 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 870 : Subtarget->hasDivideInARMMode(); 871 if (!hasDivide) { 872 // These are expanded into libcalls if the cpu doesn't have HW divider. 
873 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 874 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 875 } 876 877 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 878 setOperationAction(ISD::SDIV, MVT::i32, Custom); 879 setOperationAction(ISD::UDIV, MVT::i32, Custom); 880 881 setOperationAction(ISD::SDIV, MVT::i64, Custom); 882 setOperationAction(ISD::UDIV, MVT::i64, Custom); 883 } 884 885 setOperationAction(ISD::SREM, MVT::i32, Expand); 886 setOperationAction(ISD::UREM, MVT::i32, Expand); 887 888 // Register based DivRem for AEABI (RTABI 4.2) 889 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 890 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 891 Subtarget->isTargetWindows()) { 892 setOperationAction(ISD::SREM, MVT::i64, Custom); 893 setOperationAction(ISD::UREM, MVT::i64, Custom); 894 HasStandaloneRem = false; 895 896 if (Subtarget->isTargetWindows()) { 897 const struct { 898 const RTLIB::Libcall Op; 899 const char * const Name; 900 const CallingConv::ID CC; 901 } LibraryCalls[] = { 902 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 903 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 904 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 905 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 906 907 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 908 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 909 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 910 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 911 }; 912 913 for (const auto &LC : LibraryCalls) { 914 setLibcallName(LC.Op, LC.Name); 915 setLibcallCallingConv(LC.Op, LC.CC); 916 } 917 } else { 918 const struct { 919 const RTLIB::Libcall Op; 920 const char * const Name; 921 const CallingConv::ID CC; 922 } LibraryCalls[] = { 923 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 924 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", 
CallingConv::ARM_AAPCS }, 925 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 926 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 927 928 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 929 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 930 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 931 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 932 }; 933 934 for (const auto &LC : LibraryCalls) { 935 setLibcallName(LC.Op, LC.Name); 936 setLibcallCallingConv(LC.Op, LC.CC); 937 } 938 } 939 940 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 941 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 942 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 943 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 944 } else { 945 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 946 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 947 } 948 949 if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT()) 950 for (auto &VT : {MVT::f32, MVT::f64}) 951 setOperationAction(ISD::FPOWI, VT, Custom); 952 953 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 954 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 955 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 956 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 957 958 setOperationAction(ISD::TRAP, MVT::Other, Legal); 959 960 // Use the default implementation. 
961 setOperationAction(ISD::VASTART, MVT::Other, Custom); 962 setOperationAction(ISD::VAARG, MVT::Other, Expand); 963 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 964 setOperationAction(ISD::VAEND, MVT::Other, Expand); 965 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 966 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 967 968 if (Subtarget->isTargetWindows()) 969 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 970 else 971 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 972 973 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 974 // the default expansion. 975 InsertFencesForAtomic = false; 976 if (Subtarget->hasAnyDataBarrier() && 977 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 978 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 979 // to ldrex/strex loops already. 980 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 981 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 982 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 983 984 // On v8, we have particularly efficient implementations of atomic fences 985 // if they can be combined with nearby atomic loads and stores. 986 if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { 987 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 988 InsertFencesForAtomic = true; 989 } 990 } else { 991 // If there's anything we can use as a barrier, go through custom lowering 992 // for ATOMIC_FENCE. 993 // If target has DMB in thumb, Fences can be inserted. 994 if (Subtarget->hasDataBarrier()) 995 InsertFencesForAtomic = true; 996 997 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 998 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 999 1000 // Set them all for expansion, which will force libcalls. 
1001 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1002 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1003 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1004 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1005 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1006 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1007 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1008 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1009 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1010 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1011 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1012 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1013 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1014 // Unordered/Monotonic case. 1015 if (!InsertFencesForAtomic) { 1016 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1017 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1018 } 1019 } 1020 1021 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1022 1023 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1024 if (!Subtarget->hasV6Ops()) { 1025 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1026 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1027 } 1028 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1029 1030 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 1031 !Subtarget->isThumb1Only()) { 1032 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1033 // iff target supports vfp2. 1034 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1035 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1036 } 1037 1038 // We want to custom lower some of our intrinsics. 
1039 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1040 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1041 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1042 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1043 if (Subtarget->useSjLjEH()) 1044 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1045 1046 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1047 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1048 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1049 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1050 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1051 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1052 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1053 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1054 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1055 if (Subtarget->hasFullFP16()) { 1056 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1057 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1058 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1059 } 1060 1061 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1062 1063 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1064 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1065 if (Subtarget->hasFullFP16()) 1066 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1067 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1068 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1069 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1070 1071 // We don't support sin/cos/fmod/copysign/pow 1072 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1073 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1074 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1075 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1076 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1077 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1078 setOperationAction(ISD::FREM, MVT::f64, Expand); 1079 
setOperationAction(ISD::FREM, MVT::f32, Expand); 1080 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && 1081 !Subtarget->isThumb1Only()) { 1082 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1083 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1084 } 1085 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1086 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1087 1088 if (!Subtarget->hasVFP4()) { 1089 setOperationAction(ISD::FMA, MVT::f64, Expand); 1090 setOperationAction(ISD::FMA, MVT::f32, Expand); 1091 } 1092 1093 // Various VFP goodness 1094 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1095 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1096 if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { 1097 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1098 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1099 } 1100 1101 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1102 if (!Subtarget->hasFP16()) { 1103 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1104 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1105 } 1106 } 1107 1108 // Use __sincos_stret if available. 1109 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1110 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1111 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1112 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1113 } 1114 1115 // FP-ARMv8 implements a lot of rounding-like FP operations. 
1116 if (Subtarget->hasFPARMv8()) { 1117 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1118 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1119 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1120 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1121 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1122 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1123 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1124 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1125 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1126 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1127 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1128 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1129 1130 if (!Subtarget->isFPOnlySP()) { 1131 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1132 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1133 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1134 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1135 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1136 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1137 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1138 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1139 } 1140 } 1141 1142 if (Subtarget->hasNEON()) { 1143 // vmin and vmax aren't available in a scalar form, so we use 1144 // a NEON instruction with an undef lane instead. 
1145 setOperationAction(ISD::FMINNAN, MVT::f16, Legal); 1146 setOperationAction(ISD::FMAXNAN, MVT::f16, Legal); 1147 setOperationAction(ISD::FMINNAN, MVT::f32, Legal); 1148 setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); 1149 setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); 1150 setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); 1151 setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); 1152 setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); 1153 1154 if (Subtarget->hasFullFP16()) { 1155 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1156 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1157 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1158 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1159 1160 setOperationAction(ISD::FMINNAN, MVT::v4f16, Legal); 1161 setOperationAction(ISD::FMAXNAN, MVT::v4f16, Legal); 1162 setOperationAction(ISD::FMINNAN, MVT::v8f16, Legal); 1163 setOperationAction(ISD::FMAXNAN, MVT::v8f16, Legal); 1164 } 1165 } 1166 1167 // We have target-specific dag combine patterns for the following nodes: 1168 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1169 setTargetDAGCombine(ISD::ADD); 1170 setTargetDAGCombine(ISD::SUB); 1171 setTargetDAGCombine(ISD::MUL); 1172 setTargetDAGCombine(ISD::AND); 1173 setTargetDAGCombine(ISD::OR); 1174 setTargetDAGCombine(ISD::XOR); 1175 1176 if (Subtarget->hasV6Ops()) 1177 setTargetDAGCombine(ISD::SRL); 1178 1179 setStackPointerRegisterToSaveRestore(ARM::SP); 1180 1181 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1182 !Subtarget->hasVFP2()) 1183 setSchedulingPreference(Sched::RegPressure); 1184 else 1185 setSchedulingPreference(Sched::Hybrid); 1186 1187 //// temporary - rewrite interface to use type 1188 MaxStoresPerMemset = 8; 1189 MaxStoresPerMemsetOptSize = 4; 1190 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1191 MaxStoresPerMemcpyOptSize = 2; 1192 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1193 
MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(4);

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());

  // Alignment is expressed as log2: Thumb code only needs 2-byte alignment,
  // ARM code needs 4-byte.
  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}

/// Whether this function should be compiled with soft-float; simply forwards
/// the subtarget's setting.
bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

/// Return the human-readable name of the given ARM-specific DAG node opcode,
/// or nullptr if the opcode is unknown.  Used for DAG debug dumps.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER:  break;
  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL:          return "ARMISD::CALL";
  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
  case ARMISD::CMP:           return "ARMISD::CMP";
  case ARMISD::CMN:           return "ARMISD::CMN";
  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";

  case ARMISD::CMOV:          return "ARMISD::CMOV";

  case ARMISD::SSAT:          return "ARMISD::SSAT";
  case ARMISD::USAT:          return "ARMISD::USAT";

  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:           return "ARMISD::RRX";

  case ARMISD::ADDC:          return "ARMISD::ADDC";
  case ARMISD::ADDE:          return "ARMISD::ADDE";
  case ARMISD::SUBC:          return "ARMISD::SUBC";
  case ARMISD::SUBE:          return "ARMISD::SUBE";

  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";

  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
  case ARMISD::VCGE:          return "ARMISD::VCGE";
  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
  case ARMISD::VCGT:          return "ARMISD::VCGT";
  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
  case ARMISD::VTST:          return "ARMISD::VTST";

  case ARMISD::VSHL:          return "ARMISD::VSHL";
  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
  case ARMISD::VSLI:          return "ARMISD::VSLI";
  case ARMISD::VSRI:          return "ARMISD::VSRI";
  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP:          return "ARMISD::VDUP";
  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
  case ARMISD::VEXT:          return "ARMISD::VEXT";
  case ARMISD::VREV64:        return "ARMISD::VREV64";
  case ARMISD::VREV32:        return "ARMISD::VREV32";
  case ARMISD::VREV16:        return "ARMISD::VREV16";
  case ARMISD::VZIP:          return "ARMISD::VZIP";
  case ARMISD::VUZP:          return "ARMISD::VUZP";
  case ARMISD::VTRN:          return "ARMISD::VTRN";
  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
  case ARMISD::UMAAL:         return "ARMISD::UMAAL";
  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
  case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
  case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
  case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
  case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
  case ARMISD::SMULWB:        return "ARMISD::SMULWB";
  case ARMISD::SMULWT:        return "ARMISD::SMULWT";
  case ARMISD::SMLALD:        return "ARMISD::SMLALD";
  case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
  case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
  case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
  case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
  case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI:           return "ARMISD::BFI";
  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
  case ARMISD::VBSL:          return "ARMISD::VBSL";
  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
  }
  return nullptr;
}

/// Return the value type to use for ISD::SETCC: pointer-sized integer for
/// scalars, and an all-integer vector of the same shape for vectors.
EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive D registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
1451 FastISel * 1452 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1453 const TargetLibraryInfo *libInfo) const { 1454 return ARM::createFastISel(funcInfo, libInfo); 1455 } 1456 1457 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1458 unsigned NumVals = N->getNumValues(); 1459 if (!NumVals) 1460 return Sched::RegPressure; 1461 1462 for (unsigned i = 0; i != NumVals; ++i) { 1463 EVT VT = N->getValueType(i); 1464 if (VT == MVT::Glue || VT == MVT::Other) 1465 continue; 1466 if (VT.isFloatingPoint() || VT.isVector()) 1467 return Sched::ILP; 1468 } 1469 1470 if (!N->isMachineOpcode()) 1471 return Sched::RegPressure; 1472 1473 // Load are scheduled for latency even if there instruction itinerary 1474 // is not available. 1475 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1476 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1477 1478 if (MCID.getNumDefs() == 0) 1479 return Sched::RegPressure; 1480 if (!Itins->isEmpty() && 1481 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1482 return Sched::ILP; 1483 1484 return Sched::RegPressure; 1485 } 1486 1487 //===----------------------------------------------------------------------===// 1488 // Lowering Code 1489 //===----------------------------------------------------------------------===// 1490 1491 static bool isSRL16(const SDValue &Op) { 1492 if (Op.getOpcode() != ISD::SRL) 1493 return false; 1494 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1495 return Const->getZExtValue() == 16; 1496 return false; 1497 } 1498 1499 static bool isSRA16(const SDValue &Op) { 1500 if (Op.getOpcode() != ISD::SRA) 1501 return false; 1502 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1503 return Const->getZExtValue() == 16; 1504 return false; 1505 } 1506 1507 static bool isSHL16(const SDValue &Op) { 1508 if (Op.getOpcode() != ISD::SHL) 1509 return false; 1510 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1511 return 
Const->getZExtValue() == 16; 1512 return false; 1513 } 1514 1515 // Check for a signed 16-bit value. We special case SRA because it makes it 1516 // more simple when also looking for SRAs that aren't sign extending a 1517 // smaller value. Without the check, we'd need to take extra care with 1518 // checking order for some operations. 1519 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1520 if (isSRA16(Op)) 1521 return isSHL16(Op.getOperand(0)); 1522 return DAG.ComputeNumSignBits(Op) == 17; 1523 } 1524 1525 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1526 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1527 switch (CC) { 1528 default: llvm_unreachable("Unknown condition code!"); 1529 case ISD::SETNE: return ARMCC::NE; 1530 case ISD::SETEQ: return ARMCC::EQ; 1531 case ISD::SETGT: return ARMCC::GT; 1532 case ISD::SETGE: return ARMCC::GE; 1533 case ISD::SETLT: return ARMCC::LT; 1534 case ISD::SETLE: return ARMCC::LE; 1535 case ISD::SETUGT: return ARMCC::HI; 1536 case ISD::SETUGE: return ARMCC::HS; 1537 case ISD::SETULT: return ARMCC::LO; 1538 case ISD::SETULE: return ARMCC::LS; 1539 } 1540 } 1541 1542 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
  // Most FP conditions map to a single ARM condition code. CondCode2 is left
  // as AL unless the condition needs a second check (SETONE, SETUEQ below).
  // InvalidOnQNaN is cleared for the conditions that are explicitly handled
  // as quiet-NaN safe below.
  CondCode2 = ARMCC::AL;
  InvalidOnQNaN = true;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = ARMCC::EQ;
    InvalidOnQNaN = false;
    break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE:
    // Ordered-and-not-equal requires two condition checks.
    CondCode = ARMCC::MI;
    CondCode2 = ARMCC::GT;
    InvalidOnQNaN = false;
    break;
  case ISD::SETO:  CondCode = ARMCC::VC; break;
  case ISD::SETUO: CondCode = ARMCC::VS; break;
  case ISD::SETUEQ:
    // Unordered-or-equal requires two condition checks.
    CondCode = ARMCC::EQ;
    CondCode2 = ARMCC::VS;
    InvalidOnQNaN = false;
    break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = ARMCC::NE;
    InvalidOnQNaN = false;
    break;
  }
}

//===----------------------------------------------------------------------===//
//                      Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "ARMGenCallingConv.inc"

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
1595 CallingConv::ID 1596 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1597 bool isVarArg) const { 1598 switch (CC) { 1599 default: 1600 report_fatal_error("Unsupported calling convention"); 1601 case CallingConv::ARM_AAPCS: 1602 case CallingConv::ARM_APCS: 1603 case CallingConv::GHC: 1604 return CC; 1605 case CallingConv::PreserveMost: 1606 return CallingConv::PreserveMost; 1607 case CallingConv::ARM_AAPCS_VFP: 1608 case CallingConv::Swift: 1609 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1610 case CallingConv::C: 1611 if (!Subtarget->isAAPCS_ABI()) 1612 return CallingConv::ARM_APCS; 1613 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1614 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1615 !isVarArg) 1616 return CallingConv::ARM_AAPCS_VFP; 1617 else 1618 return CallingConv::ARM_AAPCS; 1619 case CallingConv::Fast: 1620 case CallingConv::CXX_FAST_TLS: 1621 if (!Subtarget->isAAPCS_ABI()) { 1622 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1623 return CallingConv::Fast; 1624 return CallingConv::ARM_APCS; 1625 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1626 return CallingConv::ARM_AAPCS_VFP; 1627 else 1628 return CallingConv::ARM_AAPCS; 1629 } 1630 } 1631 1632 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1633 bool isVarArg) const { 1634 return CCAssignFnForNode(CC, false, isVarArg); 1635 } 1636 1637 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1638 bool isVarArg) const { 1639 return CCAssignFnForNode(CC, true, isVarArg); 1640 } 1641 1642 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1643 /// CallingConvention. 
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  // Dispatch on the *effective* convention so the variadic/hard-float
  // adjustments in getEffectiveCallingConv are honored.
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  // Note: VA is a copy (not a reference) because the custom-lowering paths
  // below advance i and re-assign VA to later RVLocs entries.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64.
      // An f64 is returned in two consecutive i32 locations; each
      // CopyFromReg is glued to the previous one via InFlag.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      // Big-endian targets return the halves in the opposite order.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // A v2f64 occupies four i32 locations: build the vector from the
        // first f64 above and a second f64 assembled here.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Plain case: the value is returned in a single register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    // Convert from the location type back to the value type if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// LowerMemOpCallTo - Store the argument to the stack.
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  // Compute the address of the outgoing-argument slot: SP + LocMemOffset.
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}

// Split an f64 argument into two i32 halves (via VMOVRRD) and pass them in
// the register described by VA and the register-or-memory location NextVA.
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &StackPtr,
                                         SmallVectorImpl<SDValue> &MemOpChains,
                                         ISD::ArgFlagsTy Flags) const {
  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
  // Endianness selects which half goes in the first (lower-numbered) reg.
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    // Materialize SP lazily, only the first time a stack store is needed.
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                           dl, DAG, NextVA,
                                           Flags));
  }
}

/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain.
Also add input and output parameter
/// nodes.
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  // Unpack the call lowering info into locals for readability.
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool doesNotRet = CLI.DoesNotReturn;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool isThisReturn = false;
  bool isSibCall = false;
  auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");

  // Disable tail calls if they're not supported.
  if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
    isTailCall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
        isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(),
        Outs, OutVals, Ins, DAG);
    if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    // We don't support GuaranteedTailCallOpt for ARM, only automatically
    // detected sibcalls.
    if (isTailCall) {
      ++NumTailCalls;
      isSibCall = true;
    }
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // For tail calls, memory operands are available in our caller's stack.
  if (isSibCall)
    NumBytes = 0;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!isSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);

  SDValue StackPtr =
      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));

  RegsToPassVector RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  // Note: i indexes ArgLocs (custom-lowered args consume several entries,
  // advanced via ++i below), while realArgIdx indexes the IR-level Outs.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
       i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Split the vector into its two f64 halves and pass each one.
        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(0, dl, MVT::i32));
        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(1, dl, MVT::i32));

        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

        VA = ArgLocs[++i]; // skip ahead to next loc
        if (VA.isRegLoc()) {
          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
        } else {
          assert(VA.isMemLoc());

          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                 dl, DAG, VA, Flags));
        }
      } else {
        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, Flags);
      }
    } else if (VA.isRegLoc()) {
      // Recognize the 'returned' attribute on the first i32 argument so the
      // value can be forwarded through the call (see isThisReturn uses).
      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i32) {
        assert(VA.getLocVT() == MVT::i32 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
               "unexpected use of 'returned'");
        isThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (isByVal) {
      assert(VA.isMemLoc());
      unsigned offset = 0;

      // True if this byval aggregate will be split between registers
      // and memory.
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {

        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        // Load the register-resident portion of the byval aggregate, one
        // word per argument register.
        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     DAG.InferPtrAlignment(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If parameter size outsides register area, "offset" value
        // helps us to calculate stack slot for remained part properly.
        offset = RegEnd - RegBegin;

        CCInfo.nextInRegsParam();
      }

      // Copy whatever part of the byval aggregate did not fit in registers
      // to its stack slot with a struct-copy pseudo.
      if (Flags.getByValSize() > 4*offset) {
        auto PtrVT = getPointerTy(DAG.getDataLayout());
        unsigned LocMemOffset = VA.getLocMemOffset();
        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                           MVT::i32);
        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
                                            MVT::i32);

        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                          Ops));
      }
    } else if (!isSibCall) {
      assert(VA.isMemLoc());

      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.

    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;

  const TargetMachine &TM = getTargetMachine();
  const Module *Mod = MF.getFunction().getParent();
  const GlobalValue *GV = nullptr;
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    GV = G->getGlobal();
  bool isStub =
      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();

  bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
  bool isLocalARMFunc = false;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  auto PtrVt = getPointerTy(DAG.getDataLayout());

  if (Subtarget->genLongCalls()) {
    assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
           "long-calls codegen is not position independent!");
    // Handle a global address or an external symbol. If it's not one of
    // those, the target's already in a register, so we don't need to do
    // anything extra.
    if (isa<GlobalAddressSDNode>(Callee)) {
      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);

      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();

      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 0);
      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
  } else if (isa<GlobalAddressSDNode>(Callee)) {
    // If we're optimizing for minimum size and the function is called three or
    // more times in this block, we can improve codesize by calling indirectly
    // as BLXr has a 16-bit encoding.
    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
    auto *BB = CLI.CS.getParent();
    bool PreferIndirect =
        Subtarget->isThumb() && MF.getFunction().optForMinSize() &&
        count_if(GV->users(), [&BB](const User *U) {
          return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
        }) > 2;

    if (!PreferIndirect) {
      isDirect = true;
      bool isDef = GV->isStrongDefinitionForLinker();

      // ARM call to a local ARM function is predicable.
      isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
      // tBX takes a register source operand.
      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
        Callee = DAG.getNode(
            ARMISD::WrapperPIC, dl, PtrVt,
            DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
        Callee = DAG.getLoad(
            PtrVt, dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
            /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
                                     MachineMemOperand::MOInvariant);
      } else if (Subtarget->isTargetCOFF()) {
        assert(Subtarget->isTargetWindows() &&
               "Windows is the only supported COFF target");
        unsigned TargetFlags = GV->hasDLLImportStorageClass()
                                   ? ARMII::MO_DLLIMPORT
                                   : ARMII::MO_NO_FLAG;
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
                                            TargetFlags);
        // dllimport'd symbols are called through their IAT entry.
        if (GV->hasDLLImportStorageClass())
          Callee =
              DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
                          DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      } else {
        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                        ARMPCLabelIndex, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
    } else {
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
    }
  }

  // FIXME: handle tail calls differently.
  unsigned CallOpc;
  if (Subtarget->isThumb()) {
    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = ARMISD::CALL;
  } else {
    if (!isDirect && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
             // Emit regular call when code size is the priority
             !MF.getFunction().optForMinSize())
      // "mov lr, pc; b _foo" to avoid confusing the RSP
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    // Tail calls are emitted as TC_RETURN and terminate lowering here.
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to insure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  // Burn registers until Reg satisfies the parameter's alignment (measured
  // in 4-byte register units).
  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  // Number of bytes that still fit in argument registers [Reg, R4).
  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and parameter size greater than size of
  // all remained GPR regs. In that case we can't split parameter, we must
  // send it to stack. We also must set NCRN to R4, so waste all
  // remained registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // First register for byval parameter is the first register that wasn't
  // allocated before this method call, so it would be "reg".
  // If parameter is small enough to be saved in range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs,
  // else parameter would be splitted between registers and stack,
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, first register is allocated in the beginning of function already,
  // allocate remained amount of registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  // FI is only valid if one of the branches below assigns it.
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The argument must be a vreg defined by a load from a stack slot.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  // The slot must be a fixed object (caller-owned incoming argument area)
  // at the same offset and of the same size.
  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Tail calls to function pointers cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      !isa<GlobalAddressSDNode>(Callee.getNode()))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForReturn(CalleeCC, isVarArg),
                                  CCAssignFnForReturn(CallerCC, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      // i walks ArgLocs (which may hold several locations per argument for
      // custom-lowered types); realArgIdx walks Outs/OutVals in lock-step.
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations. The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          // f64 occupies two locations; v2f64 occupies four.
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}

/// CanLowerReturn - Check whether the return values described by Outs can
/// all be lowered for this calling convention / vararg-ness.
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}

/// LowerInterruptReturn - Build the exception-return node for a function with
/// the "interrupt" attribute, inserting the LR adjustment the hardware's
/// "preferred return address" offset requires.
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //   IRQ/FIQ: +4 "subs pc, lr, #4"
  //   SWI:     0  "subs pc, lr, #0"
  //   ABORT:   +4 "subs pc, lr, #4"
  //   UNDEF:   +4/+2 "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  // Operand #1 of INTRET_FLAG carries the LR adjustment.
  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
}

/// LowerReturn - Lower an IR 'ret' into copies to the physical return
/// registers plus the target return node (RET_FLAG / INTRET_FLAG).
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  // i walks RVLocs (which may hold several locations per value for custom
  // f64/v2f64 lowering); realRVLocIdx walks OutVals in lock-step.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // Skip the bitcast when returning a bare f16 (see above).
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
  }
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}

/// isUsedByReturnOnly - Return true if the only user of N (through at most a
/// register copy / bitcast / VMOVRRD pair) is a return node, and set Chain to
/// the chain the tail-call return should use.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // Every user of the copy must be a (normal or interrupt) return.
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

/// mayBeEmittedAsTailCall - Return true if this call may be turned into a
/// tail call: the subtarget supports it, the call site is marked 'tail', and
/// the function does not disable tail calls via attribute.
bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  return true;
}

// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes.
// It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    // Create a uniquely-named internal global holding the constant and lower
    // it as a plain global address instead of a constant-pool reference.
    auto GV = new GlobalVariable(
                    *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                  );
    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

/// ARM always lowers jump tables inline (no out-of-line jump-table section).
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}

/// LowerBlockAddress - Lower a blockaddress reference through a constant-pool
/// entry, adding a PIC label + PIC_ADD fixup when position independence (or
/// ROPI) is in effect.
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  } else {
    // PC-relative materialization: the pc reads 4 (Thumb) or 8 (ARM) bytes
    // ahead of the instruction, so the constant-pool value is pre-adjusted.
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actua global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}

/// LowerGlobalTLSAddressWindows - Windows TLS lowering: read the TEB via MRC,
/// index the TLS array with _tls_index, then add the variable's SECREL offset.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  // via "mrc p15, #0, <reg>, c13, c0, #2" (the TPIDRURW user read/write
  // thread-ID register encoding below).
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getConstant(15, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(13, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                                    DAG.getTargetConstantPool(CPV, PtrVT, 4)),
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // pc reads 4 (Thumb) or 8 (ARM) bytes ahead; pre-adjust the CP value.
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    // pc reads 4 (Thumb) or 8 (ARM) bytes ahead; pre-adjust the CP value.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: dereference the GOT slot to get the tp-relative offset.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

/// LowerGlobalTLSAddress - Dispatch TLS lowering to the scheme the target
/// OS / TLS model requires (emulated, Darwin, Windows, or ELF GD/IE/LE).
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());

  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}

/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
  SmallVector<const User*,4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      // ConstantExprs are transparent: check their users instead.
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getParent()->getParent() != F)
      return false;
  }
  return true;
}

/// promoteToConstantPool - Try to emit a small constant global directly into
/// the constant pool instead of referencing it indirectly.  Returns the
/// wrapped constant-pool address on success, or an empty SDValue to fall back
/// to the normal global-address lowering.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idemopotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  // Only local, constant, unnamed_addr globals with an initializer qualify.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr =
    DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    // First promotion of this global: record it and account for the pool
    // growth (only the bytes beyond the baseline 4 count).
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}

bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    GV = GA->getBaseObject();
  return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
         isa<Function>(GV);
If we haven't promoted this global yet (it may have 3132 // multiple uses), and promoting it would increase the constant pool size (Sz 3133 // > 4), ensure we have space to do so up to MaxTotal. 3134 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3135 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3136 ConstpoolPromotionMaxTotal) 3137 return SDValue(); 3138 3139 // This is only valid if all users are in a single function; we can't clone 3140 // the constant in general. The LLVM IR unnamed_addr allows merging 3141 // constants, but not cloning them. 3142 // 3143 // We could potentially allow cloning if we could prove all uses of the 3144 // constant in the current function don't care about the address, like 3145 // printf format strings. But that isn't implemented for now. 3146 if (!allUsersAreInFunction(GVar, &F)) 3147 return SDValue(); 3148 3149 // We're going to inline this global. Pad it out if needed. 3150 if (RequiredPadding != 4) { 3151 StringRef S = CDAInit->getAsString(); 3152 3153 SmallVector<uint8_t,16> V(S.size()); 3154 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3155 while (RequiredPadding--) 3156 V.push_back(0); 3157 Init = ConstantDataArray::get(*DAG.getContext(), V); 3158 } 3159 3160 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3161 SDValue CPAddr = 3162 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3163 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3164 AFI->markGlobalAsPromotedToConstantPool(GVar); 3165 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3166 PaddedSize - 4); 3167 } 3168 ++NumConstpoolPromoted; 3169 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3170 } 3171 3172 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3173 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3174 GV = GA->getBaseObject(); 3175 return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) || 3176 isa<Function>(GV); 
3177 } 3178 3179 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3180 SelectionDAG &DAG) const { 3181 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3182 default: llvm_unreachable("unknown object format"); 3183 case Triple::COFF: 3184 return LowerGlobalAddressWindows(Op, DAG); 3185 case Triple::ELF: 3186 return LowerGlobalAddressELF(Op, DAG); 3187 case Triple::MachO: 3188 return LowerGlobalAddressDarwin(Op, DAG); 3189 } 3190 } 3191 3192 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3193 SelectionDAG &DAG) const { 3194 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3195 SDLoc dl(Op); 3196 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3197 const TargetMachine &TM = getTargetMachine(); 3198 bool IsRO = isReadOnly(GV); 3199 3200 // promoteToConstantPool only if not generating XO text section 3201 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3202 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3203 return V; 3204 3205 if (isPositionIndependent()) { 3206 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3207 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3208 UseGOT_PREL ? ARMII::MO_GOT : 0); 3209 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3210 if (UseGOT_PREL) 3211 Result = 3212 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3213 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3214 return Result; 3215 } else if (Subtarget->isROPI() && IsRO) { 3216 // PC-relative. 3217 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3218 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3219 return Result; 3220 } else if (Subtarget->isRWPI() && !IsRO) { 3221 // SB-relative. 
3222 SDValue RelAddr; 3223 if (Subtarget->useMovt(DAG.getMachineFunction())) { 3224 ++NumMovwMovt; 3225 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3226 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3227 } else { // use literal pool for address constant 3228 ARMConstantPoolValue *CPV = 3229 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3230 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3231 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3232 RelAddr = DAG.getLoad( 3233 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3234 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3235 } 3236 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3237 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3238 return Result; 3239 } 3240 3241 // If we have T2 ops, we can materialize the address directly via movt/movw 3242 // pair. This is always cheaper. 3243 if (Subtarget->useMovt(DAG.getMachineFunction())) { 3244 ++NumMovwMovt; 3245 // FIXME: Once remat is capable of dealing with instructions with register 3246 // operands, expand this into two nodes. 
3247 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3248 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3249 } else { 3250 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3251 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3252 return DAG.getLoad( 3253 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3254 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3255 } 3256 } 3257 3258 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3259 SelectionDAG &DAG) const { 3260 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3261 "ROPI/RWPI not currently supported for Darwin"); 3262 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3263 SDLoc dl(Op); 3264 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3265 3266 if (Subtarget->useMovt(DAG.getMachineFunction())) 3267 ++NumMovwMovt; 3268 3269 // FIXME: Once remat is capable of dealing with instructions with register 3270 // operands, expand this into multiple nodes 3271 unsigned Wrapper = 3272 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3273 3274 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3275 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3276 3277 if (Subtarget->isGVIndirectSymbol(GV)) 3278 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3279 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3280 return Result; 3281 } 3282 3283 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3284 SelectionDAG &DAG) const { 3285 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3286 assert(Subtarget->useMovt(DAG.getMachineFunction()) && 3287 "Windows on ARM expects to use movw/movt"); 3288 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3289 "ROPI/RWPI not currently supported for Windows"); 3290 3291 const TargetMachine &TM = getTargetMachine(); 3292 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3293 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3294 if (GV->hasDLLImportStorageClass()) 3295 TargetFlags = ARMII::MO_DLLIMPORT; 3296 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3297 TargetFlags = ARMII::MO_COFFSTUB; 3298 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3299 SDValue Result; 3300 SDLoc DL(Op); 3301 3302 ++NumMovwMovt; 3303 3304 // FIXME: Once remat is capable of dealing with instructions with register 3305 // operands, expand this into two nodes. 
3306 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3307 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 3308 TargetFlags)); 3309 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3310 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3311 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3312 return Result; 3313 } 3314 3315 SDValue 3316 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3317 SDLoc dl(Op); 3318 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3319 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3320 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3321 Op.getOperand(1), Val); 3322 } 3323 3324 SDValue 3325 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3326 SDLoc dl(Op); 3327 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3328 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3329 } 3330 3331 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3332 SelectionDAG &DAG) const { 3333 SDLoc dl(Op); 3334 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3335 Op.getOperand(0)); 3336 } 3337 3338 SDValue 3339 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3340 const ARMSubtarget *Subtarget) const { 3341 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3342 SDLoc dl(Op); 3343 switch (IntNo) { 3344 default: return SDValue(); // Don't custom lower most intrinsics. 
3345 case Intrinsic::thread_pointer: { 3346 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3347 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3348 } 3349 case Intrinsic::eh_sjlj_lsda: { 3350 MachineFunction &MF = DAG.getMachineFunction(); 3351 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3352 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3353 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3354 SDValue CPAddr; 3355 bool IsPositionIndependent = isPositionIndependent(); 3356 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3357 ARMConstantPoolValue *CPV = 3358 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3359 ARMCP::CPLSDA, PCAdj); 3360 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3361 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3362 SDValue Result = DAG.getLoad( 3363 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3364 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3365 3366 if (IsPositionIndependent) { 3367 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3368 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3369 } 3370 return Result; 3371 } 3372 case Intrinsic::arm_neon_vabs: 3373 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3374 Op.getOperand(1)); 3375 case Intrinsic::arm_neon_vmulls: 3376 case Intrinsic::arm_neon_vmullu: { 3377 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3378 ? ARMISD::VMULLs : ARMISD::VMULLu; 3379 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3380 Op.getOperand(1), Op.getOperand(2)); 3381 } 3382 case Intrinsic::arm_neon_vminnm: 3383 case Intrinsic::arm_neon_vmaxnm: { 3384 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3385 ? 
ISD::FMINNUM : ISD::FMAXNUM; 3386 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3387 Op.getOperand(1), Op.getOperand(2)); 3388 } 3389 case Intrinsic::arm_neon_vminu: 3390 case Intrinsic::arm_neon_vmaxu: { 3391 if (Op.getValueType().isFloatingPoint()) 3392 return SDValue(); 3393 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3394 ? ISD::UMIN : ISD::UMAX; 3395 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3396 Op.getOperand(1), Op.getOperand(2)); 3397 } 3398 case Intrinsic::arm_neon_vmins: 3399 case Intrinsic::arm_neon_vmaxs: { 3400 // v{min,max}s is overloaded between signed integers and floats. 3401 if (!Op.getValueType().isFloatingPoint()) { 3402 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3403 ? ISD::SMIN : ISD::SMAX; 3404 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3405 Op.getOperand(1), Op.getOperand(2)); 3406 } 3407 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3408 ? ISD::FMINNAN : ISD::FMAXNAN; 3409 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3410 Op.getOperand(1), Op.getOperand(2)); 3411 } 3412 case Intrinsic::arm_neon_vtbl1: 3413 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3414 Op.getOperand(1), Op.getOperand(2)); 3415 case Intrinsic::arm_neon_vtbl2: 3416 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3417 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3418 } 3419 } 3420 3421 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3422 const ARMSubtarget *Subtarget) { 3423 SDLoc dl(Op); 3424 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3425 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3426 if (SSID == SyncScope::SingleThread) 3427 return Op; 3428 3429 if (!Subtarget->hasDataBarrier()) { 3430 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3431 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3432 // here. 
3433 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3434 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3435 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3436 DAG.getConstant(0, dl, MVT::i32)); 3437 } 3438 3439 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3440 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3441 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3442 if (Subtarget->isMClass()) { 3443 // Only a full system barrier exists in the M-class architectures. 3444 Domain = ARM_MB::SY; 3445 } else if (Subtarget->preferISHSTBarriers() && 3446 Ord == AtomicOrdering::Release) { 3447 // Swift happens to implement ISHST barriers in a way that's compatible with 3448 // Release semantics but weaker than ISH so we'd be fools not to use 3449 // it. Beware: other processors probably don't! 3450 Domain = ARM_MB::ISHST; 3451 } 3452 3453 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3454 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3455 DAG.getConstant(Domain, dl, MVT::i32)); 3456 } 3457 3458 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3459 const ARMSubtarget *Subtarget) { 3460 // ARM pre v5TE and Thumb1 does not have preload instructions. 3461 if (!(Subtarget->isThumb2() || 3462 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3463 // Just preserve the chain. 3464 return Op.getOperand(0); 3465 3466 SDLoc dl(Op); 3467 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3468 if (!isRead && 3469 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3470 // ARMv7 with MP extension has PLDW. 3471 return Op.getOperand(0); 3472 3473 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3474 if (Subtarget->isThumb()) { 3475 // Invert the bits. 
3476 isRead = ~isRead & 1; 3477 isData = ~isData & 1; 3478 } 3479 3480 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3481 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3482 DAG.getConstant(isData, dl, MVT::i32)); 3483 } 3484 3485 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3486 MachineFunction &MF = DAG.getMachineFunction(); 3487 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3488 3489 // vastart just stores the address of the VarArgsFrameIndex slot into the 3490 // memory location argument. 3491 SDLoc dl(Op); 3492 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3493 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3494 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3495 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3496 MachinePointerInfo(SV)); 3497 } 3498 3499 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3500 CCValAssign &NextVA, 3501 SDValue &Root, 3502 SelectionDAG &DAG, 3503 const SDLoc &dl) const { 3504 MachineFunction &MF = DAG.getMachineFunction(); 3505 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3506 3507 const TargetRegisterClass *RC; 3508 if (AFI->isThumb1OnlyFunction()) 3509 RC = &ARM::tGPRRegClass; 3510 else 3511 RC = &ARM::GPRRegClass; 3512 3513 // Transform the arguments stored in physical registers into virtual ones. 3514 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3515 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3516 3517 SDValue ArgValue2; 3518 if (NextVA.isMemLoc()) { 3519 MachineFrameInfo &MFI = MF.getFrameInfo(); 3520 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3521 3522 // Create load node to retrieve arguments from the stack. 
3523 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3524 ArgValue2 = DAG.getLoad( 3525 MVT::i32, dl, Root, FIN, 3526 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3527 } else { 3528 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3529 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3530 } 3531 if (!Subtarget->isLittle()) 3532 std::swap (ArgValue, ArgValue2); 3533 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3534 } 3535 3536 // The remaining GPRs hold either the beginning of variable-argument 3537 // data, or the beginning of an aggregate passed by value (usually 3538 // byval). Either way, we allocate stack slots adjacent to the data 3539 // provided by our caller, and store the unallocated registers there. 3540 // If this is a variadic function, the va_list pointer will begin with 3541 // these values; otherwise, this reassembles a (byval) structure that 3542 // was split between registers and memory. 3543 // Return: The frame index registers were stored into. 3544 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3545 const SDLoc &dl, SDValue &Chain, 3546 const Value *OrigArg, 3547 unsigned InRegsParamRecordIdx, 3548 int ArgOffset, unsigned ArgSize) const { 3549 // Currently, two use-cases possible: 3550 // Case #1. Non-var-args function, and we meet first byval parameter. 3551 // Setup first unallocated register as first byval register; 3552 // eat all remained registers 3553 // (these two actions are performed by HandleByVal method). 3554 // Then, here, we initialize stack frame with 3555 // "store-reg" instructions. 3556 // Case #2. Var-args function, that doesn't contain byval parameters. 3557 // The same: eat all remained unallocated registers, 3558 // initialize stack frame. 
3559 3560 MachineFunction &MF = DAG.getMachineFunction(); 3561 MachineFrameInfo &MFI = MF.getFrameInfo(); 3562 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3563 unsigned RBegin, REnd; 3564 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3565 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3566 } else { 3567 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3568 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3569 REnd = ARM::R4; 3570 } 3571 3572 if (REnd != RBegin) 3573 ArgOffset = -4 * (ARM::R4 - RBegin); 3574 3575 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3576 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3577 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3578 3579 SmallVector<SDValue, 4> MemOps; 3580 const TargetRegisterClass *RC = 3581 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3582 3583 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3584 unsigned VReg = MF.addLiveIn(Reg, RC); 3585 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3586 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3587 MachinePointerInfo(OrigArg, 4 * i)); 3588 MemOps.push_back(Store); 3589 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3590 } 3591 3592 if (!MemOps.empty()) 3593 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3594 return FrameIndex; 3595 } 3596 3597 // Setup stack frame, the va_list pointer will start from. 
3598 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3599 const SDLoc &dl, SDValue &Chain, 3600 unsigned ArgOffset, 3601 unsigned TotalArgRegsSaveSize, 3602 bool ForceMutable) const { 3603 MachineFunction &MF = DAG.getMachineFunction(); 3604 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3605 3606 // Try to store any remaining integer argument regs 3607 // to their spots on the stack so that they may be loaded by dereferencing 3608 // the result of va_next. 3609 // If there is no regs to be stored, just point address after last 3610 // argument passed via stack. 3611 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3612 CCInfo.getInRegsParamsCount(), 3613 CCInfo.getNextStackOffset(), 4); 3614 AFI->setVarArgsFrameIndex(FrameIndex); 3615 } 3616 3617 SDValue ARMTargetLowering::LowerFormalArguments( 3618 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3619 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3620 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3621 MachineFunction &MF = DAG.getMachineFunction(); 3622 MachineFrameInfo &MFI = MF.getFrameInfo(); 3623 3624 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3625 3626 // Assign locations to all of the incoming arguments. 3627 SmallVector<CCValAssign, 16> ArgLocs; 3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3629 *DAG.getContext()); 3630 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 3631 3632 SmallVector<SDValue, 16> ArgValues; 3633 SDValue ArgValue; 3634 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3635 unsigned CurArgIdx = 0; 3636 3637 // Initially ArgRegsSaveSize is zero. 3638 // Then we increase this value each time we meet byval parameter. 3639 // We also increase this value in case of varargs function. 
3640 AFI->setArgRegsSaveSize(0); 3641 3642 // Calculate the amount of stack space that we need to allocate to store 3643 // byval and variadic arguments that are passed in registers. 3644 // We need to know this before we allocate the first byval or variadic 3645 // argument, as they will be allocated a stack slot below the CFA (Canonical 3646 // Frame Address, the stack pointer at entry to the function). 3647 unsigned ArgRegBegin = ARM::R4; 3648 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3649 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3650 break; 3651 3652 CCValAssign &VA = ArgLocs[i]; 3653 unsigned Index = VA.getValNo(); 3654 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3655 if (!Flags.isByVal()) 3656 continue; 3657 3658 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3659 unsigned RBegin, REnd; 3660 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3661 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3662 3663 CCInfo.nextInRegsParam(); 3664 } 3665 CCInfo.rewindByValRegsInfo(); 3666 3667 int lastInsIndex = -1; 3668 if (isVarArg && MFI.hasVAStart()) { 3669 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3670 if (RegIdx != array_lengthof(GPRArgRegs)) 3671 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3672 } 3673 3674 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3675 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3676 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3677 3678 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3679 CCValAssign &VA = ArgLocs[i]; 3680 if (Ins[VA.getValNo()].isOrigArg()) { 3681 std::advance(CurOrigArg, 3682 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3683 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3684 } 3685 // Arguments stored in registers. 
3686 if (VA.isRegLoc()) { 3687 EVT RegVT = VA.getLocVT(); 3688 3689 if (VA.needsCustom()) { 3690 // f64 and vector types are split up into multiple registers or 3691 // combinations of registers and stack slots. 3692 if (VA.getLocVT() == MVT::v2f64) { 3693 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3694 Chain, DAG, dl); 3695 VA = ArgLocs[++i]; // skip ahead to next loc 3696 SDValue ArgValue2; 3697 if (VA.isMemLoc()) { 3698 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 3699 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3700 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 3701 MachinePointerInfo::getFixedStack( 3702 DAG.getMachineFunction(), FI)); 3703 } else { 3704 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3705 Chain, DAG, dl); 3706 } 3707 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3708 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3709 ArgValue, ArgValue1, 3710 DAG.getIntPtrConstant(0, dl)); 3711 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3712 ArgValue, ArgValue2, 3713 DAG.getIntPtrConstant(1, dl)); 3714 } else 3715 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3716 } else { 3717 const TargetRegisterClass *RC; 3718 3719 3720 if (RegVT == MVT::f16) 3721 RC = &ARM::HPRRegClass; 3722 else if (RegVT == MVT::f32) 3723 RC = &ARM::SPRRegClass; 3724 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 3725 RC = &ARM::DPRRegClass; 3726 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 3727 RC = &ARM::QPRRegClass; 3728 else if (RegVT == MVT::i32) 3729 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 3730 : &ARM::GPRRegClass; 3731 else 3732 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3733 3734 // Transform the arguments in physical registers into virtual ones. 
3735 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3736 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3737 } 3738 3739 // If this is an 8 or 16-bit value, it is really passed promoted 3740 // to 32 bits. Insert an assert[sz]ext to capture this, then 3741 // truncate to the right size. 3742 switch (VA.getLocInfo()) { 3743 default: llvm_unreachable("Unknown loc info!"); 3744 case CCValAssign::Full: break; 3745 case CCValAssign::BCvt: 3746 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3747 break; 3748 case CCValAssign::SExt: 3749 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3750 DAG.getValueType(VA.getValVT())); 3751 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3752 break; 3753 case CCValAssign::ZExt: 3754 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3755 DAG.getValueType(VA.getValVT())); 3756 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3757 break; 3758 } 3759 3760 InVals.push_back(ArgValue); 3761 } else { // VA.isRegLoc() 3762 // sanity check 3763 assert(VA.isMemLoc()); 3764 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3765 3766 int index = VA.getValNo(); 3767 3768 // Some Ins[] entries become multiple ArgLoc[] entries. 3769 // Process them only once. 3770 if (index != lastInsIndex) 3771 { 3772 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3773 // FIXME: For now, all byval parameter objects are marked mutable. 3774 // This can be changed with more analysis. 3775 // In case of tail call optimization mark all arguments mutable. 3776 // Since they could be overwritten by lowering of arguments in case of 3777 // a tail call. 
          if (Flags.isByVal()) {
            assert(Ins[index].isOrigArg() &&
                   "Byval arguments cannot be implicit");
            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

            // Spill the GPR-passed portion of the byval next to its
            // stack-passed portion so the argument is contiguous in memory.
            int FrameIndex = StoreByValRegs(
                CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
                VA.getLocMemOffset(), Flags.getByValSize());
            InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
            CCInfo.nextInRegsParam();
          } else {
            unsigned FIOffset = VA.getLocMemOffset();
            int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                           FIOffset, true);

            // Create load nodes to retrieve arguments from the stack.
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                         MachinePointerInfo::getFixedStack(
                                             DAG.getMachineFunction(), FI)));
          }
          lastInsIndex = index;
        }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart())
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                         CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);

  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.  If the RHS constant cannot be encoded as an
/// immediate, the comparison may be rewritten into an equivalent one whose
/// constant can (e.g. x < C becomes x <= C-1); the (possibly adjusted)
/// condition code is returned through \p ARMcc.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    // The legality check interprets the immediate as signed i32.
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <=>  x <= C-1 ; guard against wrapping past INT_MIN.
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        // Unsigned variant of the above; C == 0 would underflow.
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        // x <= C  <=>  x < C+1 ; guard against wrapping past INT_MAX.
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        // Unsigned variant; C == UINT_MAX would overflow.
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
/// A compare against +0.0 is matched to CMPFPw0 so no constant needs to be
/// materialized into a register.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool InvalidOnQNaN) const {
  assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
  else
    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  // Integer compares can be re-emitted directly.
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));

  // Floating-point compares are always FMSTAT wrapping CMPFP/CMPFPw0, so
  // rebuild the inner compare first and then re-wrap it.
  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                      Cmp.getOperand(1), Cmp.getOperand(2));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                      Cmp.getOperand(1));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    // No signed overflow iff V clear (VC).
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::UADDO:
    // No unsigned overflow iff the sum is >= either operand (HS).
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
    // We do not use it in the USUBO case as Value may not be used.
    Value = DAG.getNode(ARMISD::ADDC, dl,
                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
                .getValue(0);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::SSUBO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::USUBO:
    // No borrow iff LHS >= RHS (HS).
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::UMULO:
    // We generate a UMUL_LOHI and then check if the high word is 0.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::UMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getConstant(0, dl, MVT::i32));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  case ISD::SMULO:
    // We generate a SMUL_LOHI and then check if all the bits of the high word
    // are the same as the sign bit of the low word.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::SMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                                          Value.getValue(0),
                                          DAG.getConstant(31, dl, MVT::i32)));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  } // switch (...)

  return std::make_pair(Value, OverflowCmp);
}

/// Lower ISD::SADDO / ISD::SSUBO: produce the arithmetic result plus a 0/1
/// overflow flag, materialized with a CMOV on the no-overflow condition.
SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue Value, OverflowCmp;
  SDValue ARMcc;
  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  // ARMcc is the *no overflow* condition, so select FVal (1) when it fails.
  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  EVT CarryVT = BoolCarry.getValueType();

  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
                              DAG.getVTList(CarryVT, MVT::i32),
                              BoolCarry, DAG.getConstant(1, DL, CarryVT));
  return Carry.getValue(1);
}

static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);

  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD:ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

/// Lower ISD::UADDO / ISD::USUBO using the carry flag directly (ADDC/SUBC)
/// rather than a compare+CMOV sequence.
SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  // If the condition is the overflow result (ResNo 1) of an arithmetic-with-
  // overflow node, select directly on the overflow flags instead of
  // materializing a boolean first.
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
      dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        // The glued compare has a single use; duplicate it for this CMOV.
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, dl, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, dl, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}

/// Map an FP setcc condition onto the restricted set of condition codes VSEL
/// supports (GE, GT, VS, EQ), reporting through \p swpCmpOps / \p swpVselOps
/// whether the compare operands and/or the VSEL operands must be swapped to
/// preserve the original semantics.
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands.
  if (CC == ISD::SETUNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}

/// Build an ARMISD::CMOV. On FP-only-SP subtargets an f64 CMOV is split into
/// two i32 CMOVs over the VMOVRRD halves (the second needs a duplicated
/// compare because glue values are single-use).
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);

    SDValue TrueLow = TrueVal.getValue(0);
    SDValue TrueHigh = TrueVal.getValue(1);
    SDValue FalseLow = FalseVal.getValue(0);
    SDValue FalseHigh = FalseVal.getValue(1);

    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
                              ARMcc, CCR, Cmp);
    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
                               ARMcc, CCR, duplicateCmp(Cmp, DAG));

    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  } else {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                       Cmp);
  }
}

static bool isGTorGE(ISD::CondCode CC) {
  return CC == ISD::SETGT || CC == ISD::SETGE;
}

static bool isLTorLE(ISD::CondCode CC) {
  return CC == ISD::SETLT || CC == ISD::SETLE;
}

// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
// All of these conditions (and their <= and >= counterparts) will do:
//          x < k ? k : x
//          x > k ? x : k
//          k < x ? x : k
//          k > x ? k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}

// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1 is
// a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K and
// usat is set to true if the conditional represents an unsigned saturation
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                    uint64_t &K, bool &usat) {
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The second select must be the non-constant arm of the first one.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                        ? &RHS1
                                                        : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                        ? &RHS2
                                                        : nullptr;
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
  // must work with sign-extended values but the select operations return
  // the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;

  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
    return false;

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
      isPowerOf2_64(PosVal + 1)) {

    // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
    if (Val1 == ~Val2)
      usat = false;
    else if (NegVal == 0)
      usat = true;
    else
      return false;

    V = V2;
    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

    return true;
  }

  return false;
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
                                                     ? &RHS
                                                     : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  // (or USAT); requires ARM-mode v6+ or Thumb-2, which provide SSAT/USAT.
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV is the sign of SatValue smeared across all 32 bits.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0)  ->  x & ~(x >> 31)
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1)  ->  x | (x >> 31)
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
                                    TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        // Invert to the opposite code (GE/GT/VS/EQ) and swap the select arms.
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  bool InvalidOnQNaN;
  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

  // Normalize the fp compare. If RHS is zero we keep it there so we match
  // CMPFPw0 instead of CMPFP.
  if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  // A normal load can be re-issued as an integer load of the same bits.
  return ISD::isNormalLoad(N);
}

static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

  // Reload the same memory as i32 instead of moving between register banks.
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlignment(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// Split an f64 compare operand into two i32 loads (low word in RetVal1,
/// high word in RetVal2), or two zero constants for +0.0.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask off the IEEE sign bit so +0.0 == -0.0 is preserved.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // f64: compare both halves; the sign-bit mask is only applied to the
    // high word, which holds the IEEE-754 sign bit.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}

SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code: getARMXALUOOp returns the *no overflow*
    // condition, but we branch when overflow *does* happen.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}

SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMCC::CondCodes CondCode =
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  bool InvalidOnQNaN;
  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  if (CondCode2 != ARMCC::AL) {
    // Some FP conditions need a second conditional branch; chain it off the
    // glue result of the first one.
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}

SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Each jump-table entry is 4 bytes; scale the index into a byte offset.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI: table entries are offsets relative to the table base, so load
    // the entry and add it back to the table address.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // Non-PIC: table entries are absolute addresses; load and jump directly.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}

/// Custom-lower a vector FP_TO_SINT/FP_TO_UINT. f32 sources converting to
/// i32 elements are natively supported; narrower integer results go through
/// a wider conversion followed by a truncate; anything else is unrolled to
/// scalar operations.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  // Pick the same-width integer vector type to convert into; f16 sources are
  // only handled when the FullFP16 extension is available.
  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  // Convert at the wider width, then truncate to the requested element size.
  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}

/// Lower FP_TO_SINT/FP_TO_UINT. Vectors are dispatched to the helper above;
/// f64 sources on an SP-only FPU become a runtime libcall; everything else is
/// already legal and returned unchanged.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);
  if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                       /*isSigned*/ false, SDLoc(Op)).first;
  }

  return Op;
}

/// Custom-lower a vector SINT_TO_FP/UINT_TO_FP. i32 sources to f32 are
/// native; v4i16/v8i16 sources are widened (sign/zero-extended) first.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  // Integer vector type matching the destination FP type; f16 destinations
  // require the FullFP16 extension, otherwise unroll.
  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  unsigned
CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    // Signed: widen with sign-extension before converting.
    CastOpc = ISD::SIGN_EXTEND;
    Opc = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    // Unsigned: widen with zero-extension before converting.
    CastOpc = ISD::ZERO_EXTEND;
    Opc = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}

/// Lower SINT_TO_FP/UINT_TO_FP. Vectors go through the helper above; f64
/// results on an SP-only FPU become a runtime libcall; otherwise legal as-is.
SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);
  if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::SINT_TO_FP)
      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                       /*isSigned*/ false, SDLoc(Op)).first;
  }

  return Op;
}

/// Lower ISD::FCOPYSIGN. If the magnitude operand already lives in GPRs
/// (it came from a bitcast or VMOVDRR), the sign bit is merged with plain
/// integer AND/OR; otherwise, with NEON available, the merge is done in the
/// vector pipeline with a bit-select-style mask so no GPR round-trip occurs.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // NEON modified-immediate 0x6/0x80 produces 0x80000000 per i32 lane,
    // i.e. a mask selecting only the sign bit of an f32.
    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // Shift the f32 sign-bit mask up to bit 63, the f64 sign position.
      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        // Move the f32 sign bit into the f64 sign-bit position.
        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      // f64 sign source for an f32 result: shift bit 63 down to bit 31.
      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // Build ~Mask by XOR-ing with an all-ones vector (modimm 0xe/0xff).
    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Bit-select: sign bit from Tmp1, all remaining bits from Tmp0.
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Take only the high half of the f64 — that word holds the sign bit.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}

/// Lower ISD::RETURNADDR. Depth 0 reads LR; deeper frames load the saved
/// return address at frame pointer + 4. Continues below.
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    // Saved LR lives 4 bytes above the saved frame pointer.
    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}

/// Lower ISD::FRAMEADDR: copy the frame register out, then chase one saved
/// frame pointer per requested depth level.
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
    *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = ARI.getFrameRegister(MF);
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each level of depth dereferences the saved frame pointer of the caller.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.

/// Map a named-register string (for llvm.read/write_register) to a physical
/// register; only "sp" is supported, anything else is a fatal error.
unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
                       .Case("sp", ARM::SP)
                       .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error(Twine("Invalid register name \""
                              + StringRef(RegName)  + "\"."));
}

// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
          && "ExpandREAD_REGISTER called for non-i64 type result.");

  // Re-emit the read as a two-result (i32,i32) node plus a chain.
  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0),
                             N->getOperand(1));

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                    Read.getValue(1)));
  // NOTE(review): this pushes the incoming chain (operand 0), not the new
  // node's chain result Read.getValue(2) — confirm this is intentional.
  Results.push_back(Read.getOperand(0));
}

/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \p return The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}

/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i64 types, either as the
  // source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);
  const bool HasFullFP16 = Subtarget->hasFullFP16();

  if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
    // FullFP16: half values are passed in S-registers, and we don't
    // need any of the bitcast and moves:
    //
    // t2: f32,ch = CopyFromReg t0, Register:f32 %0
    // t5: i32 = bitcast t2
    // t18: f16 = ARMISD::VMOVhr t5
    if (Op.getOpcode() != ISD::CopyFromReg ||
        Op.getValueType() != MVT::f32)
      return SDValue();

    // NOTE(review): only the first use is inspected here — assumes the
    // bitcast's (sole relevant) user is the VMOVhr; confirm multi-use cases.
    auto Move = N->use_begin();
    if (Move->getOpcode() != ARMISD::VMOVhr)
      return SDValue();

    // Re-emit the CopyFromReg directly as f16 and route all VMOVhr users to it.
    SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
    SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
    DAG.ReplaceAllUsesWith(*Move, &Copy);
    return Copy;
  }

  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
    if (!HasFullFP16)
      return SDValue();
    // SoftFP: read half-precision arguments:
    //
    // t2: i32,ch = ...
    // t7: i16 = truncate t2 <~~~~ Op
    // t8: f16 = bitcast t7 <~~~~ N
    //
    if (Op.getOperand(0).getValueType() == MVT::i32)
      return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
                         MVT::f16, Op.getOperand(0));

    return SDValue();
  }

  // Half-precision return values
  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
    if (!HasFullFP16)
      return SDValue();
    //
    // t11: f16 = fadd t8, t10
    // t12: i16 = bitcast t11       <~~~ SDNode N
    // t13: i32 = zero_extend t12
    // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
    // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
    //
    // transform this into:
    //
    // t20: i32 = ARMISD::VMOVrh t11
    // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
    //
    auto ZeroExtend = N->use_begin();
    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
        ZeroExtend->getValueType(0) != MVT::i32)
      return SDValue();

    // Only fire when the extended value feeds a return-value CopyToReg.
    auto Copy = ZeroExtend->use_begin();
    if (Copy->getOpcode() == ISD::CopyToReg &&
        Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
      SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
      DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
      return Cvt;
    }
    return SDValue();
  }

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
      return Val;

    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
    SDValue Cvt;
    // Big-endian multi-element vectors need a VREV64 so the two extracted
    // words come out in the expected order.
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
/// Zero vectors are used to represent vector negation and in those cases
/// will be implemented with the NEON VNEG instruction.  However, VNEG does
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed.  Regardless, use a canonical VMOV to create the
/// zero vector.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(VT.isVector() && "Expected a vector type");
  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  EVT VmovVT = VT.is128BitVector() ?
MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}

/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // "Small" shift (< VTBits): Lo = (Lo >> Amt) | (Hi << (VTBits - Amt));
  // "big" shift (>= VTBits):  Lo = Hi >> (Amt - VTBits).
  // Both are computed, then CMOV selects on the sign of Amt - VTBits.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  // Hi for a big shift is 0 (SRL) or the sign-fill Hi >> (VTBits-1) (SRA).
  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // "Small" shift: Hi = (Hi << Amt) | (Lo >> (VTBits - Amt));
  // "big" shift:   Hi = Lo << (Amt - VTBits), Lo = 0.
  // CMOV selects on the sign of Amt - VTBits, as in LowerShiftRightParts.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// Lower ISD::FLT_ROUNDS_ by reading the FPSCR rounding-mode field.
SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);
  SDValue Ops[] = { DAG.getEntryNode(),
                    DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };

  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                     DAG.getConstant(3, dl, MVT::i32));
}

/// Lower CTTZ / CTTZ_ZERO_UNDEF. Vector forms are built on NEON from the
/// isolated least-significant set bit; the scalar form uses RBIT + CLZ.
/// Continues below.
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector()) {
    assert(ST->hasNEON());

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() ==
ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // 0x1eff is the NEON modified-immediate encoding of all-ones, so the
      // ADD below is effectively LSB + (-1) = LSB - 1.
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }
    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  }

  if (!ST->hasV6T2Ops())
    return SDValue();

  // Scalar: cttz(x) = ctlz(bitreverse(x)), i.e. RBIT + CLZ.
  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}

/// Custom-lower vector CTPOP on NEON: count bits per byte with the native
/// v8i8/v16i8 CTPOP, then widen back to the requested element size with
/// pairwise-add-long (vpaddlu) steps.
static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    // Each vpaddlu doubles the element width and halves the element count.
    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}

/// Custom-lower vector SHL/SRA/SRL on NEON via the vshift intrinsics, which
/// take the shift amount as a (possibly negative) vector operand.
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  if (!VT.isVector())
    return SDValue();

  // Lower vector shifts on NEON to use VSHL.
  assert(ST->hasNEON() && "unexpected vector shift");

  // Left shifts translate directly to the vshiftu intrinsic.
  if (N->getOpcode() == ISD::SHL)
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
                                       MVT::i32),
                       N->getOperand(0), N->getOperand(1));

  assert((N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");

  // NEON uses the same intrinsics for both left and right shifts.  For
  // right shifts, the shift amounts are negative, so negate the vector of
  // shift amounts.
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
                                     getZeroVector(ShiftVT, DAG, dl),
                                     N->getOperand(1));
  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
                             Intrinsic::arm_neon_vshifts :
                             Intrinsic::arm_neon_vshiftu);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(vshiftInt, dl, MVT::i32),
                     N->getOperand(0), NegatedCount);
}

/// Lower an i64 SRA/SRL by exactly 1 to a flag-setting shift of the high word
/// plus an RRX of the low word (rotate-right-with-extend through carry).
/// All other 64-bit shifts use generic expansion. Continues below.
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // We can get here for a node like i32 = ISD::SHL i32, i64
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
         "Unknown shift to lower!");

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isOneConstant(N->getOperand(1)))
    return SDValue();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb1Only()) return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(1, dl, MVT::i32));

  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  // captures the result into a carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
5537 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5538 } 5539 5540 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5541 SDValue TmpOp0, TmpOp1; 5542 bool Invert = false; 5543 bool Swap = false; 5544 unsigned Opc = 0; 5545 5546 SDValue Op0 = Op.getOperand(0); 5547 SDValue Op1 = Op.getOperand(1); 5548 SDValue CC = Op.getOperand(2); 5549 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 5550 EVT VT = Op.getValueType(); 5551 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5552 SDLoc dl(Op); 5553 5554 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 5555 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 5556 // Special-case integer 64-bit equality comparisons. They aren't legal, 5557 // but they can be lowered with a few vector instructions. 5558 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 5559 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 5560 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 5561 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 5562 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 5563 DAG.getCondCode(ISD::SETEQ)); 5564 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 5565 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 5566 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 5567 if (SetCCOpcode == ISD::SETNE) 5568 Merged = DAG.getNOT(dl, Merged, CmpVT); 5569 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 5570 return Merged; 5571 } 5572 5573 if (CmpVT.getVectorElementType() == MVT::i64) 5574 // 64-bit comparisons are not legal in general. 
5575 return SDValue(); 5576 5577 if (Op1.getValueType().isFloatingPoint()) { 5578 switch (SetCCOpcode) { 5579 default: llvm_unreachable("Illegal FP comparison"); 5580 case ISD::SETUNE: 5581 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5582 case ISD::SETOEQ: 5583 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5584 case ISD::SETOLT: 5585 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5586 case ISD::SETOGT: 5587 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5588 case ISD::SETOLE: 5589 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5590 case ISD::SETOGE: 5591 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5592 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 5593 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 5594 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 5595 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 5596 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 5597 case ISD::SETONE: 5598 // Expand this to (OLT | OGT). 5599 TmpOp0 = Op0; 5600 TmpOp1 = Op1; 5601 Opc = ISD::OR; 5602 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5603 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 5604 break; 5605 case ISD::SETUO: 5606 Invert = true; 5607 LLVM_FALLTHROUGH; 5608 case ISD::SETO: 5609 // Expand this to (OLT | OGE). 5610 TmpOp0 = Op0; 5611 TmpOp1 = Op1; 5612 Opc = ISD::OR; 5613 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5614 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 5615 break; 5616 } 5617 } else { 5618 // Integer comparisons. 
5619 switch (SetCCOpcode) { 5620 default: llvm_unreachable("Illegal integer comparison"); 5621 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5622 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5623 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5624 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5625 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5626 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5627 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 5628 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 5629 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 5630 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 5631 } 5632 5633 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 5634 if (Opc == ARMISD::VCEQ) { 5635 SDValue AndOp; 5636 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5637 AndOp = Op0; 5638 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 5639 AndOp = Op1; 5640 5641 // Ignore bitconvert. 5642 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 5643 AndOp = AndOp.getOperand(0); 5644 5645 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 5646 Opc = ARMISD::VTST; 5647 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 5648 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 5649 Invert = !Invert; 5650 } 5651 } 5652 } 5653 5654 if (Swap) 5655 std::swap(Op0, Op1); 5656 5657 // If one of the operands is a constant vector zero, attempt to fold the 5658 // comparison to a specialized compare-against-zero form. 
5659 SDValue SingleOp; 5660 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5661 SingleOp = Op0; 5662 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 5663 if (Opc == ARMISD::VCGE) 5664 Opc = ARMISD::VCLEZ; 5665 else if (Opc == ARMISD::VCGT) 5666 Opc = ARMISD::VCLTZ; 5667 SingleOp = Op1; 5668 } 5669 5670 SDValue Result; 5671 if (SingleOp.getNode()) { 5672 switch (Opc) { 5673 case ARMISD::VCEQ: 5674 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 5675 case ARMISD::VCGE: 5676 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 5677 case ARMISD::VCLEZ: 5678 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 5679 case ARMISD::VCGT: 5680 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 5681 case ARMISD::VCLTZ: 5682 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 5683 default: 5684 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5685 } 5686 } else { 5687 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5688 } 5689 5690 Result = DAG.getSExtOrTrunc(Result, dl, VT); 5691 5692 if (Invert) 5693 Result = DAG.getNOT(dl, Result, VT); 5694 5695 return Result; 5696 } 5697 5698 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 5699 SDValue LHS = Op.getOperand(0); 5700 SDValue RHS = Op.getOperand(1); 5701 SDValue Carry = Op.getOperand(2); 5702 SDValue Cond = Op.getOperand(3); 5703 SDLoc DL(Op); 5704 5705 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 5706 5707 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 5708 // have to invert the carry first. 5709 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 5710 DAG.getConstant(1, DL, MVT::i32), Carry); 5711 // This converts the boolean value carry into the carry flag. 
5712 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 5713 5714 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 5715 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 5716 5717 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 5718 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 5719 SDValue ARMcc = DAG.getConstant( 5720 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 5721 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5722 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 5723 Cmp.getValue(1), SDValue()); 5724 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 5725 CCR, Chain.getValue(1)); 5726 } 5727 5728 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 5729 /// valid vector constant for a NEON instruction with a "modified immediate" 5730 /// operand (e.g., VMOV). If so, return the encoded value. 5731 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 5732 unsigned SplatBitSize, SelectionDAG &DAG, 5733 const SDLoc &dl, EVT &VT, bool is128Bits, 5734 NEONModImmType type) { 5735 unsigned OpCmode, Imm; 5736 5737 // SplatBitSize is set to the smallest size that splats the vector, so a 5738 // zero vector will always have SplatBitSize == 8. However, NEON modified 5739 // immediate instructions others than VMOV do not support the 8-bit encoding 5740 // of a zero vector, and the default encoding of zero is supposed to be the 5741 // 32-bit version. 5742 if (SplatBits == 0) 5743 SplatBitSize = 32; 5744 5745 switch (SplatBitSize) { 5746 case 8: 5747 if (type != VMOVModImm) 5748 return SDValue(); 5749 // Any 1-byte value is OK. Op=0, Cmode=1110. 5750 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 5751 OpCmode = 0xe; 5752 Imm = SplatBits; 5753 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 5754 break; 5755 5756 case 16: 5757 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 
5758 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 5759 if ((SplatBits & ~0xff) == 0) { 5760 // Value = 0x00nn: Op=x, Cmode=100x. 5761 OpCmode = 0x8; 5762 Imm = SplatBits; 5763 break; 5764 } 5765 if ((SplatBits & ~0xff00) == 0) { 5766 // Value = 0xnn00: Op=x, Cmode=101x. 5767 OpCmode = 0xa; 5768 Imm = SplatBits >> 8; 5769 break; 5770 } 5771 return SDValue(); 5772 5773 case 32: 5774 // NEON's 32-bit VMOV supports splat values where: 5775 // * only one byte is nonzero, or 5776 // * the least significant byte is 0xff and the second byte is nonzero, or 5777 // * the least significant 2 bytes are 0xff and the third is nonzero. 5778 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 5779 if ((SplatBits & ~0xff) == 0) { 5780 // Value = 0x000000nn: Op=x, Cmode=000x. 5781 OpCmode = 0; 5782 Imm = SplatBits; 5783 break; 5784 } 5785 if ((SplatBits & ~0xff00) == 0) { 5786 // Value = 0x0000nn00: Op=x, Cmode=001x. 5787 OpCmode = 0x2; 5788 Imm = SplatBits >> 8; 5789 break; 5790 } 5791 if ((SplatBits & ~0xff0000) == 0) { 5792 // Value = 0x00nn0000: Op=x, Cmode=010x. 5793 OpCmode = 0x4; 5794 Imm = SplatBits >> 16; 5795 break; 5796 } 5797 if ((SplatBits & ~0xff000000) == 0) { 5798 // Value = 0xnn000000: Op=x, Cmode=011x. 5799 OpCmode = 0x6; 5800 Imm = SplatBits >> 24; 5801 break; 5802 } 5803 5804 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 5805 if (type == OtherModImm) return SDValue(); 5806 5807 if ((SplatBits & ~0xffff) == 0 && 5808 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 5809 // Value = 0x0000nnff: Op=x, Cmode=1100. 5810 OpCmode = 0xc; 5811 Imm = SplatBits >> 8; 5812 break; 5813 } 5814 5815 if ((SplatBits & ~0xffffff) == 0 && 5816 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 5817 // Value = 0x00nnffff: Op=x, Cmode=1101. 
5818 OpCmode = 0xd; 5819 Imm = SplatBits >> 16; 5820 break; 5821 } 5822 5823 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 5824 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 5825 // VMOV.I32. A (very) minor optimization would be to replicate the value 5826 // and fall through here to test for a valid 64-bit splat. But, then the 5827 // caller would also need to check and handle the change in size. 5828 return SDValue(); 5829 5830 case 64: { 5831 if (type != VMOVModImm) 5832 return SDValue(); 5833 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 5834 uint64_t BitMask = 0xff; 5835 uint64_t Val = 0; 5836 unsigned ImmMask = 1; 5837 Imm = 0; 5838 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 5839 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 5840 Val |= BitMask; 5841 Imm |= ImmMask; 5842 } else if ((SplatBits & BitMask) != 0) { 5843 return SDValue(); 5844 } 5845 BitMask <<= 8; 5846 ImmMask <<= 1; 5847 } 5848 5849 if (DAG.getDataLayout().isBigEndian()) 5850 // swap higher and lower 32 bit word 5851 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 5852 5853 // Op=1, Cmode=1110. 5854 OpCmode = 0x1e; 5855 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 5856 break; 5857 } 5858 5859 default: 5860 llvm_unreachable("unexpected size for isNEONModifiedImm"); 5861 } 5862 5863 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 5864 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 5865 } 5866 5867 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 5868 const ARMSubtarget *ST) const { 5869 EVT VT = Op.getValueType(); 5870 bool IsDouble = (VT == MVT::f64); 5871 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 5872 const APFloat &FPVal = CFP->getValueAPF(); 5873 5874 // Prevent floating-point constants from using literal loads 5875 // when execute-only is enabled. 
5876 if (ST->genExecuteOnly()) { 5877 // If we can represent the constant as an immediate, don't lower it 5878 if (isFPImmLegal(FPVal, VT)) 5879 return Op; 5880 // Otherwise, construct as integer, and move to float register 5881 APInt INTVal = FPVal.bitcastToAPInt(); 5882 SDLoc DL(CFP); 5883 switch (VT.getSimpleVT().SimpleTy) { 5884 default: 5885 llvm_unreachable("Unknown floating point type!"); 5886 break; 5887 case MVT::f64: { 5888 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 5889 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 5890 if (!ST->isLittle()) 5891 std::swap(Lo, Hi); 5892 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 5893 } 5894 case MVT::f32: 5895 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 5896 DAG.getConstant(INTVal, DL, MVT::i32)); 5897 } 5898 } 5899 5900 if (!ST->hasVFP3()) 5901 return SDValue(); 5902 5903 // Use the default (constant pool) lowering for double constants when we have 5904 // an SP-only FPU 5905 if (IsDouble && Subtarget->isFPOnlySP()) 5906 return SDValue(); 5907 5908 // Try splatting with a VMOV.f32... 5909 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 5910 5911 if (ImmVal != -1) { 5912 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 5913 // We have code in place to select a valid ConstantFP already, no need to 5914 // do any mangling. 5915 return Op; 5916 } 5917 5918 // It's a float and we are trying to use NEON operations where 5919 // possible. Lower it to a splat followed by an extract. 5920 SDLoc DL(Op); 5921 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 5922 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 5923 NewVal); 5924 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 5925 DAG.getConstant(0, DL, MVT::i32)); 5926 } 5927 5928 // The rest of our options are NEON only, make sure that's allowed before 5929 // proceeding.. 
5930 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 5931 return SDValue(); 5932 5933 EVT VMovVT; 5934 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 5935 5936 // It wouldn't really be worth bothering for doubles except for one very 5937 // important value, which does happen to match: 0.0. So make sure we don't do 5938 // anything stupid. 5939 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 5940 return SDValue(); 5941 5942 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 5943 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 5944 VMovVT, false, VMOVModImm); 5945 if (NewVal != SDValue()) { 5946 SDLoc DL(Op); 5947 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 5948 NewVal); 5949 if (IsDouble) 5950 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5951 5952 // It's a float: cast and extract a vector element. 5953 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5954 VecConstant); 5955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5956 DAG.getConstant(0, DL, MVT::i32)); 5957 } 5958 5959 // Finally, try a VMVN.i32 5960 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 5961 false, VMVNModImm); 5962 if (NewVal != SDValue()) { 5963 SDLoc DL(Op); 5964 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 5965 5966 if (IsDouble) 5967 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5968 5969 // It's a float: cast and extract a vector element. 5970 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5971 VecConstant); 5972 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5973 DAG.getConstant(0, DL, MVT::i32)); 5974 } 5975 5976 return SDValue(); 5977 } 5978 5979 // check if an VEXT instruction can handle the shuffle mask when the 5980 // vector sources of the shuffle are the same. 
/// Return true if mask M is a rotation of a single source vector by M[0]
/// lanes (a one-input VEXT); on success \p Imm is set to that rotation.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

/// Two-input VEXT check: indices must be consecutive modulo 2*NumElts.
/// \p ReverseVEXT is set when the wrap-around means the sources must be
/// swapped; \p Imm receives the (source-adjusted) starting index.
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();
  ReverseVEXT = false;

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, it may still be
    // a VEXT but the source vectors must be swapped.
    ExpectedElt += 1;
    if (ExpectedElt == NumElts * 2) {
      ExpectedElt = 0;
      ReverseVEXT = true;
    }

    if (M[i] < 0) continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  // Adjust the index value if the source operands will be swapped.
  if (ReverseVEXT)
    Imm -= NumElts;

  return true;
}

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0) continue; // ignore UNDEF indices
    // Each element must map to the mirrored position within its block.
    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
      return false;
  }

  return true;
}

static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  // range, then 0 is placed into the resulting vector. So pretty much any mask
  // of 8 elements can work here.
  return VT == MVT::v8i8 && M.size() == 8;
}

/// For the two-result shuffle predicates below: decide which of the two
/// results the half of the mask starting at \p Index corresponds to.
static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                               unsigned Index) {
  if (Mask.size() == Elements * 2)
    return Index / Elements;
  return Mask[Index] == 0 ? 0 : 1;
}

// Checks whether the shuffle mask represents a vector transpose (VTRN) by
// checking that pairs of elements in the shuffle mask represent the same index
// in each vector, incrementing the expected index by 2 at each step.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
//  v2={e,f,g,h}
// WhichResult gives the offset for each element in the mask based on which
// of the two results it belongs to.
//
// The transpose can be represented either as:
// result1 = shufflevector v1, v2, result1_shuffle_mask
// result2 = shufflevector v1, v2, result2_shuffle_mask
// where v1/v2 and the shuffle masks have the same number of elements
// (here WhichResult (see below) indicates which result is being checked)
//
// or as:
// results = shufflevector v1, v2, shuffle_mask
// where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
// want to check the low half and high half of the shuffle mask as if it were
// the other case
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  // If the mask is twice as long as the input vector then we need to check the
  // upper and lower parts of the mask with a matching value for WhichResult
  // FIXME: A mask with only even values will be rejected in case the first
  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  // M[0] is used to determine WhichResult
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
// that the mask elements are either all even and in steps of size 2 or all odd
// and in steps of size 2.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
//  v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with
// respect the how results are returned.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; ++j) {
      // Accept strides of 2 starting at WhichResult (0 = evens, 1 = odds).
      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  unsigned Half = NumElts / 2;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += Half) {
      // Both halves repeat the same stride-2 sequence from the single source.
      unsigned Idx = WhichResult;
      for (unsigned k = 0; k < Half; ++k) {
        int MIdx = M[i + j + k];
        if (MIdx >= 0 && (unsigned) MIdx != Idx)
          return false;
        Idx += 2;
      }
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
// that pairs of elements of the shufflemask represent the same index in each
// vector incrementing sequentially through the vectors.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
//  v2={e,f,g,h}
// Requires similar checks to that of isVTRNMask with respect the how results
// are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      // Pairs interleave the same lane index from both sources.
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      // Both halves of each pair come from the same (single) source lane.
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  // Try again, treating the mask as a single-source ("v, undef") shuffle.
  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on an vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction).  Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    // Thumb1 MOV/MVN handle 8-bit immediates (value or its complement).
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    // ARM/Thumb2: any shifter-operand immediate, directly or via MVN.
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
// NOTE(review): LowerBUILD_VECTOR continues beyond this excerpt; only its
// opening is visible here.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if (SplatBitSize <= 64) {
      // Check if an immediate VMOV works.
6383 EVT VmovVT; 6384 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 6385 SplatUndef.getZExtValue(), SplatBitSize, 6386 DAG, dl, VmovVT, VT.is128BitVector(), 6387 VMOVModImm); 6388 if (Val.getNode()) { 6389 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 6390 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6391 } 6392 6393 // Try an immediate VMVN. 6394 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 6395 Val = isNEONModifiedImm(NegatedImm, 6396 SplatUndef.getZExtValue(), SplatBitSize, 6397 DAG, dl, VmovVT, VT.is128BitVector(), 6398 VMVNModImm); 6399 if (Val.getNode()) { 6400 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 6401 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6402 } 6403 6404 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 6405 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 6406 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 6407 if (ImmVal != -1) { 6408 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 6409 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 6410 } 6411 } 6412 } 6413 } 6414 6415 // Scan through the operands to see if only one value is used. 6416 // 6417 // As an optimisation, even if more than one value is used it may be more 6418 // profitable to splat with one value then change some lanes. 6419 // 6420 // Heuristically we decide to do this if the vector has a "dominant" value, 6421 // defined as splatted to more than half of the lanes. 6422 unsigned NumElts = VT.getVectorNumElements(); 6423 bool isOnlyLowElement = true; 6424 bool usesOnlyOneValue = true; 6425 bool hasDominantValue = false; 6426 bool isConstant = true; 6427 6428 // Map of the number of times a particular SDValue appears in the 6429 // element list. 
6430 DenseMap<SDValue, unsigned> ValueCounts; 6431 SDValue Value; 6432 for (unsigned i = 0; i < NumElts; ++i) { 6433 SDValue V = Op.getOperand(i); 6434 if (V.isUndef()) 6435 continue; 6436 if (i > 0) 6437 isOnlyLowElement = false; 6438 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6439 isConstant = false; 6440 6441 ValueCounts.insert(std::make_pair(V, 0)); 6442 unsigned &Count = ValueCounts[V]; 6443 6444 // Is this value dominant? (takes up more than half of the lanes) 6445 if (++Count > (NumElts / 2)) { 6446 hasDominantValue = true; 6447 Value = V; 6448 } 6449 } 6450 if (ValueCounts.size() != 1) 6451 usesOnlyOneValue = false; 6452 if (!Value.getNode() && !ValueCounts.empty()) 6453 Value = ValueCounts.begin()->first; 6454 6455 if (ValueCounts.empty()) 6456 return DAG.getUNDEF(VT); 6457 6458 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 6459 // Keep going if we are hitting this case. 6460 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 6461 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6462 6463 unsigned EltSize = VT.getScalarSizeInBits(); 6464 6465 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 6466 // i32 and try again. 6467 if (hasDominantValue && EltSize <= 32) { 6468 if (!isConstant) { 6469 SDValue N; 6470 6471 // If we are VDUPing a value that comes directly from a vector, that will 6472 // cause an unnecessary move to and from a GPR, where instead we could 6473 // just use VDUPLANE. We can only do this if the lane being extracted 6474 // is at a constant index, as the VDUP from lane instructions only have 6475 // constant-index forms. 
6476 ConstantSDNode *constIndex; 6477 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6478 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 6479 // We need to create a new undef vector to use for the VDUPLANE if the 6480 // size of the vector from which we get the value is different than the 6481 // size of the vector that we need to create. We will insert the element 6482 // such that the register coalescer will remove unnecessary copies. 6483 if (VT != Value->getOperand(0).getValueType()) { 6484 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 6485 VT.getVectorNumElements(); 6486 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6487 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 6488 Value, DAG.getConstant(index, dl, MVT::i32)), 6489 DAG.getConstant(index, dl, MVT::i32)); 6490 } else 6491 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6492 Value->getOperand(0), Value->getOperand(1)); 6493 } else 6494 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 6495 6496 if (!usesOnlyOneValue) { 6497 // The dominant value was splatted as 'N', but we now have to insert 6498 // all differing elements. 
6499 for (unsigned I = 0; I < NumElts; ++I) { 6500 if (Op.getOperand(I) == Value) 6501 continue; 6502 SmallVector<SDValue, 3> Ops; 6503 Ops.push_back(N); 6504 Ops.push_back(Op.getOperand(I)); 6505 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 6506 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 6507 } 6508 } 6509 return N; 6510 } 6511 if (VT.getVectorElementType().isFloatingPoint()) { 6512 SmallVector<SDValue, 8> Ops; 6513 for (unsigned i = 0; i < NumElts; ++i) 6514 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 6515 Op.getOperand(i))); 6516 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 6517 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6518 Val = LowerBUILD_VECTOR(Val, DAG, ST); 6519 if (Val.getNode()) 6520 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6521 } 6522 if (usesOnlyOneValue) { 6523 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 6524 if (isConstant && Val.getNode()) 6525 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 6526 } 6527 } 6528 6529 // If all elements are constants and the case above didn't get hit, fall back 6530 // to the default expansion, which will generate a load from the constant 6531 // pool. 6532 if (isConstant) 6533 return SDValue(); 6534 6535 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6536 if (NumElts >= 4) { 6537 SDValue shuffle = ReconstructShuffle(Op, DAG); 6538 if (shuffle != SDValue()) 6539 return shuffle; 6540 } 6541 6542 if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 6543 // If we haven't found an efficient lowering, try splitting a 128-bit vector 6544 // into two 64-bit vectors; we might discover a better way to lower it. 
6545 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 6546 EVT ExtVT = VT.getVectorElementType(); 6547 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 6548 SDValue Lower = 6549 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 6550 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 6551 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 6552 SDValue Upper = DAG.getBuildVector( 6553 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 6554 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 6555 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 6556 if (Lower && Upper) 6557 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 6558 } 6559 6560 // Vectors with 32- or 64-bit elements can be built by directly assigning 6561 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 6562 // will be legalized. 6563 if (EltSize >= 32) { 6564 // Do the expansion with floating-point types, since that is what the VFP 6565 // registers are defined to use, and since i64 is not legal. 6566 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6567 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6568 SmallVector<SDValue, 8> Ops; 6569 for (unsigned i = 0; i < NumElts; ++i) 6570 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 6571 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6572 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6573 } 6574 6575 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6576 // know the default expansion would otherwise fall back on something even 6577 // worse. For a vector with one or two non-undef values, that's 6578 // scalar_to_vector for the elements followed by a shuffle (provided the 6579 // shuffle is valid for the target) and materialization element by element 6580 // on the stack followed by a load for everything else. 
6581 if (!isConstant && !usesOnlyOneValue) { 6582 SDValue Vec = DAG.getUNDEF(VT); 6583 for (unsigned i = 0 ; i < NumElts; ++i) { 6584 SDValue V = Op.getOperand(i); 6585 if (V.isUndef()) 6586 continue; 6587 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 6588 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6589 } 6590 return Vec; 6591 } 6592 6593 return SDValue(); 6594 } 6595 6596 // Gather data to see if the operation can be modelled as a 6597 // shuffle in combination with VEXTs. 6598 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 6599 SelectionDAG &DAG) const { 6600 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6601 SDLoc dl(Op); 6602 EVT VT = Op.getValueType(); 6603 unsigned NumElts = VT.getVectorNumElements(); 6604 6605 struct ShuffleSourceInfo { 6606 SDValue Vec; 6607 unsigned MinElt = std::numeric_limits<unsigned>::max(); 6608 unsigned MaxElt = 0; 6609 6610 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 6611 // be compatible with the shuffle we intend to construct. As a result 6612 // ShuffleVec will be some sliding window into the original Vec. 6613 SDValue ShuffleVec; 6614 6615 // Code should guarantee that element i in Vec starts at element "WindowBase 6616 // + i * WindowScale in ShuffleVec". 6617 int WindowBase = 0; 6618 int WindowScale = 1; 6619 6620 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 6621 6622 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6623 }; 6624 6625 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6626 // node. 6627 SmallVector<ShuffleSourceInfo, 2> Sources; 6628 for (unsigned i = 0; i < NumElts; ++i) { 6629 SDValue V = Op.getOperand(i); 6630 if (V.isUndef()) 6631 continue; 6632 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 6633 // A shuffle can only come from building a vector from various 6634 // elements of other vectors. 
6635 return SDValue(); 6636 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 6637 // Furthermore, shuffles require a constant mask, whereas extractelts 6638 // accept variable indices. 6639 return SDValue(); 6640 } 6641 6642 // Add this element source to the list if it's not already there. 6643 SDValue SourceVec = V.getOperand(0); 6644 auto Source = llvm::find(Sources, SourceVec); 6645 if (Source == Sources.end()) 6646 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 6647 6648 // Update the minimum and maximum lane number seen. 6649 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 6650 Source->MinElt = std::min(Source->MinElt, EltNo); 6651 Source->MaxElt = std::max(Source->MaxElt, EltNo); 6652 } 6653 6654 // Currently only do something sane when at most two source vectors 6655 // are involved. 6656 if (Sources.size() > 2) 6657 return SDValue(); 6658 6659 // Find out the smallest element size among result and two sources, and use 6660 // it as element size to build the shuffle_vector. 6661 EVT SmallestEltTy = VT.getVectorElementType(); 6662 for (auto &Source : Sources) { 6663 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 6664 if (SrcEltTy.bitsLT(SmallestEltTy)) 6665 SmallestEltTy = SrcEltTy; 6666 } 6667 unsigned ResMultiplier = 6668 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 6669 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6670 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 6671 6672 // If the source vector is too wide or too narrow, we may nevertheless be able 6673 // to construct a compatible shuffle either by concatenating it with UNDEF or 6674 // extracting a suitable range of elements. 
6675 for (auto &Src : Sources) { 6676 EVT SrcVT = Src.ShuffleVec.getValueType(); 6677 6678 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 6679 continue; 6680 6681 // This stage of the search produces a source with the same element type as 6682 // the original, but with a total width matching the BUILD_VECTOR output. 6683 EVT EltVT = SrcVT.getVectorElementType(); 6684 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 6685 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 6686 6687 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 6688 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 6689 return SDValue(); 6690 // We can pad out the smaller vector for free, so if it's part of a 6691 // shuffle... 6692 Src.ShuffleVec = 6693 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 6694 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 6695 continue; 6696 } 6697 6698 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 6699 return SDValue(); 6700 6701 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 6702 // Span too large for a VEXT to cope 6703 return SDValue(); 6704 } 6705 6706 if (Src.MinElt >= NumSrcElts) { 6707 // The extraction can just take the second half 6708 Src.ShuffleVec = 6709 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6710 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6711 Src.WindowBase = -NumSrcElts; 6712 } else if (Src.MaxElt < NumSrcElts) { 6713 // The extraction can just take the first half 6714 Src.ShuffleVec = 6715 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6716 DAG.getConstant(0, dl, MVT::i32)); 6717 } else { 6718 // An actual VEXT is needed 6719 SDValue VEXTSrc1 = 6720 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6721 DAG.getConstant(0, dl, MVT::i32)); 6722 SDValue VEXTSrc2 = 6723 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6724 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6725 6726 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, 
DestVT, VEXTSrc1, 6727 VEXTSrc2, 6728 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 6729 Src.WindowBase = -Src.MinElt; 6730 } 6731 } 6732 6733 // Another possible incompatibility occurs from the vector element types. We 6734 // can fix this by bitcasting the source vectors to the same type we intend 6735 // for the shuffle. 6736 for (auto &Src : Sources) { 6737 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 6738 if (SrcEltTy == SmallestEltTy) 6739 continue; 6740 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6741 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6742 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6743 Src.WindowBase *= Src.WindowScale; 6744 } 6745 6746 // Final sanity check before we try to actually produce a shuffle. 6747 LLVM_DEBUG(for (auto Src 6748 : Sources) 6749 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 6750 6751 // The stars all align, our next step is to produce the mask for the shuffle. 6752 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6753 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 6754 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6755 SDValue Entry = Op.getOperand(i); 6756 if (Entry.isUndef()) 6757 continue; 6758 6759 auto Src = llvm::find(Sources, Entry.getOperand(0)); 6760 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6761 6762 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6763 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6764 // segment. 6765 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6766 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 6767 VT.getScalarSizeInBits()); 6768 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6769 6770 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6771 // starting at the appropriate offset. 
6772 int *LaneMask = &Mask[i * ResMultiplier]; 6773 6774 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6775 ExtractBase += NumElts * (Src - Sources.begin()); 6776 for (int j = 0; j < LanesDefined; ++j) 6777 LaneMask[j] = ExtractBase + j; 6778 } 6779 6780 // Final check before we try to produce nonsense... 6781 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 6782 return SDValue(); 6783 6784 // We can't handle more than two sources. This should have already 6785 // been checked before this point. 6786 assert(Sources.size() <= 2 && "Too many sources!"); 6787 6788 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6789 for (unsigned i = 0; i < Sources.size(); ++i) 6790 ShuffleOps[i] = Sources[i].ShuffleVec; 6791 6792 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6793 ShuffleOps[1], Mask); 6794 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6795 } 6796 6797 /// isShuffleMaskLegal - Targets can use this to indicate that they only 6798 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6799 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6800 /// are assumed to be legal. 6801 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 6802 if (VT.getVectorNumElements() == 4 && 6803 (VT.is128BitVector() || VT.is64BitVector())) { 6804 unsigned PFIndexes[4]; 6805 for (unsigned i = 0; i != 4; ++i) { 6806 if (M[i] < 0) 6807 PFIndexes[i] = 8; 6808 else 6809 PFIndexes[i] = M[i]; 6810 } 6811 6812 // Compute the index in the perfect shuffle table. 
6813 unsigned PFTableIndex = 6814 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6815 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6816 unsigned Cost = (PFEntry >> 30); 6817 6818 if (Cost <= 4) 6819 return true; 6820 } 6821 6822 bool ReverseVEXT, isV_UNDEF; 6823 unsigned Imm, WhichResult; 6824 6825 unsigned EltSize = VT.getScalarSizeInBits(); 6826 return (EltSize >= 32 || 6827 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6828 isVREVMask(M, VT, 64) || 6829 isVREVMask(M, VT, 32) || 6830 isVREVMask(M, VT, 16) || 6831 isVEXTMask(M, VT, ReverseVEXT, Imm) || 6832 isVTBLMask(M, VT) || 6833 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 6834 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 6835 } 6836 6837 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6838 /// the specified operations to build the shuffle. 6839 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6840 SDValue RHS, SelectionDAG &DAG, 6841 const SDLoc &dl) { 6842 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6843 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 6844 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 6845 6846 enum { 6847 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6848 OP_VREV, 6849 OP_VDUP0, 6850 OP_VDUP1, 6851 OP_VDUP2, 6852 OP_VDUP3, 6853 OP_VEXT1, 6854 OP_VEXT2, 6855 OP_VEXT3, 6856 OP_VUZPL, // VUZP, left result 6857 OP_VUZPR, // VUZP, right result 6858 OP_VZIPL, // VZIP, left result 6859 OP_VZIPR, // VZIP, right result 6860 OP_VTRNL, // VTRN, left result 6861 OP_VTRNR // VTRN, right result 6862 }; 6863 6864 if (OpNum == OP_COPY) { 6865 if (LHSID == (1*9+2)*9+3) return LHS; 6866 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 6867 return RHS; 6868 } 6869 6870 SDValue OpLHS, OpRHS; 6871 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6872 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, 
dl); 6873 EVT VT = OpLHS.getValueType(); 6874 6875 switch (OpNum) { 6876 default: llvm_unreachable("Unknown shuffle opcode!"); 6877 case OP_VREV: 6878 // VREV divides the vector in half and swaps within the half. 6879 if (VT.getVectorElementType() == MVT::i32 || 6880 VT.getVectorElementType() == MVT::f32) 6881 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 6882 // vrev <4 x i16> -> VREV32 6883 if (VT.getVectorElementType() == MVT::i16) 6884 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 6885 // vrev <4 x i8> -> VREV16 6886 assert(VT.getVectorElementType() == MVT::i8); 6887 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 6888 case OP_VDUP0: 6889 case OP_VDUP1: 6890 case OP_VDUP2: 6891 case OP_VDUP3: 6892 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6893 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 6894 case OP_VEXT1: 6895 case OP_VEXT2: 6896 case OP_VEXT3: 6897 return DAG.getNode(ARMISD::VEXT, dl, VT, 6898 OpLHS, OpRHS, 6899 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 6900 case OP_VUZPL: 6901 case OP_VUZPR: 6902 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 6903 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 6904 case OP_VZIPL: 6905 case OP_VZIPR: 6906 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 6907 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 6908 case OP_VTRNL: 6909 case OP_VTRNR: 6910 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 6911 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 6912 } 6913 } 6914 6915 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 6916 ArrayRef<int> ShuffleMask, 6917 SelectionDAG &DAG) { 6918 // Check to see if we can use the VTBL instruction. 
6919 SDValue V1 = Op.getOperand(0); 6920 SDValue V2 = Op.getOperand(1); 6921 SDLoc DL(Op); 6922 6923 SmallVector<SDValue, 8> VTBLMask; 6924 for (ArrayRef<int>::iterator 6925 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 6926 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 6927 6928 if (V2.getNode()->isUndef()) 6929 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 6930 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6931 6932 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 6933 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6934 } 6935 6936 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 6937 SelectionDAG &DAG) { 6938 SDLoc DL(Op); 6939 SDValue OpLHS = Op.getOperand(0); 6940 EVT VT = OpLHS.getValueType(); 6941 6942 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 6943 "Expect an v8i16/v16i8 type"); 6944 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 6945 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 6946 // extract the first 8 bytes into the top double word and the last 8 bytes 6947 // into the bottom double word. The v8i16 case is similar. 6948 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 6949 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 6950 DAG.getConstant(ExtractNum, DL, MVT::i32)); 6951 } 6952 6953 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 6954 SDValue V1 = Op.getOperand(0); 6955 SDValue V2 = Op.getOperand(1); 6956 SDLoc dl(Op); 6957 EVT VT = Op.getValueType(); 6958 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 6959 6960 // Convert shuffles that are directly supported on NEON to target-specific 6961 // DAG nodes, instead of keeping them as shuffles and matching them again 6962 // during code selection. This is more efficient and avoids the possibility 6963 // of inconsistencies between legalization and selection. 
6964 // FIXME: floating-point vectors should be canonicalized to integer vectors 6965 // of the same time so that they get CSEd properly. 6966 ArrayRef<int> ShuffleMask = SVN->getMask(); 6967 6968 unsigned EltSize = VT.getScalarSizeInBits(); 6969 if (EltSize <= 32) { 6970 if (SVN->isSplat()) { 6971 int Lane = SVN->getSplatIndex(); 6972 // If this is undef splat, generate it via "just" vdup, if possible. 6973 if (Lane == -1) Lane = 0; 6974 6975 // Test if V1 is a SCALAR_TO_VECTOR. 6976 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 6977 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6978 } 6979 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 6980 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 6981 // reaches it). 6982 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 6983 !isa<ConstantSDNode>(V1.getOperand(0))) { 6984 bool IsScalarToVector = true; 6985 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 6986 if (!V1.getOperand(i).isUndef()) { 6987 IsScalarToVector = false; 6988 break; 6989 } 6990 if (IsScalarToVector) 6991 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6992 } 6993 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 6994 DAG.getConstant(Lane, dl, MVT::i32)); 6995 } 6996 6997 bool ReverseVEXT; 6998 unsigned Imm; 6999 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7000 if (ReverseVEXT) 7001 std::swap(V1, V2); 7002 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7003 DAG.getConstant(Imm, dl, MVT::i32)); 7004 } 7005 7006 if (isVREVMask(ShuffleMask, VT, 64)) 7007 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7008 if (isVREVMask(ShuffleMask, VT, 32)) 7009 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 7010 if (isVREVMask(ShuffleMask, VT, 16)) 7011 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 7012 7013 if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 7014 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 7015 DAG.getConstant(Imm, dl, 
MVT::i32)); 7016 } 7017 7018 // Check for Neon shuffles that modify both input vectors in place. 7019 // If both results are used, i.e., if there are two shuffles with the same 7020 // source operands and with masks corresponding to both results of one of 7021 // these operations, DAG memoization will ensure that a single node is 7022 // used for both shuffles. 7023 unsigned WhichResult; 7024 bool isV_UNDEF; 7025 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7026 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 7027 if (isV_UNDEF) 7028 V2 = V1; 7029 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 7030 .getValue(WhichResult); 7031 } 7032 7033 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 7034 // shuffles that produce a result larger than their operands with: 7035 // shuffle(concat(v1, undef), concat(v2, undef)) 7036 // -> 7037 // shuffle(concat(v1, v2), undef) 7038 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 7039 // 7040 // This is useful in the general case, but there are special cases where 7041 // native shuffles produce larger results: the two-result ops. 7042 // 7043 // Look through the concat when lowering them: 7044 // shuffle(concat(v1, v2), undef) 7045 // -> 7046 // concat(VZIP(v1, v2):0, :1) 7047 // 7048 if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 7049 SDValue SubV1 = V1->getOperand(0); 7050 SDValue SubV2 = V1->getOperand(1); 7051 EVT SubVT = SubV1.getValueType(); 7052 7053 // We expect these to have been canonicalized to -1. 
7054 assert(llvm::all_of(ShuffleMask, [&](int i) { 7055 return i < (int)VT.getVectorNumElements(); 7056 }) && "Unexpected shuffle index into UNDEF operand!"); 7057 7058 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7059 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 7060 if (isV_UNDEF) 7061 SubV2 = SubV1; 7062 assert((WhichResult == 0) && 7063 "In-place shuffle of concat can only have one result!"); 7064 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 7065 SubV1, SubV2); 7066 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 7067 Res.getValue(1)); 7068 } 7069 } 7070 } 7071 7072 // If the shuffle is not directly supported and it has 4 elements, use 7073 // the PerfectShuffle-generated table to synthesize it from other shuffles. 7074 unsigned NumElts = VT.getVectorNumElements(); 7075 if (NumElts == 4) { 7076 unsigned PFIndexes[4]; 7077 for (unsigned i = 0; i != 4; ++i) { 7078 if (ShuffleMask[i] < 0) 7079 PFIndexes[i] = 8; 7080 else 7081 PFIndexes[i] = ShuffleMask[i]; 7082 } 7083 7084 // Compute the index in the perfect shuffle table. 7085 unsigned PFTableIndex = 7086 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7087 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7088 unsigned Cost = (PFEntry >> 30); 7089 7090 if (Cost <= 4) 7091 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 7092 } 7093 7094 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 7095 if (EltSize >= 32) { 7096 // Do the expansion with floating-point types, since that is what the VFP 7097 // registers are defined to use, and since i64 is not legal. 
7098 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7099 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7100 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 7101 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 7102 SmallVector<SDValue, 8> Ops; 7103 for (unsigned i = 0; i < NumElts; ++i) { 7104 if (ShuffleMask[i] < 0) 7105 Ops.push_back(DAG.getUNDEF(EltVT)); 7106 else 7107 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 7108 ShuffleMask[i] < (int)NumElts ? V1 : V2, 7109 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 7110 dl, MVT::i32))); 7111 } 7112 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7113 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7114 } 7115 7116 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 7117 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 7118 7119 if (VT == MVT::v8i8) 7120 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 7121 return NewOp; 7122 7123 return SDValue(); 7124 } 7125 7126 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7127 // INSERT_VECTOR_ELT is legal only for immediate indexes. 7128 SDValue Lane = Op.getOperand(2); 7129 if (!isa<ConstantSDNode>(Lane)) 7130 return SDValue(); 7131 7132 return Op; 7133 } 7134 7135 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7136 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 7137 SDValue Lane = Op.getOperand(1); 7138 if (!isa<ConstantSDNode>(Lane)) 7139 return SDValue(); 7140 7141 SDValue Vec = Op.getOperand(0); 7142 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 7143 SDLoc dl(Op); 7144 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 7145 } 7146 7147 return Op; 7148 } 7149 7150 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 7151 // The only time a CONCAT_VECTORS operation can have legal types is when 7152 // two 64-bit vectors are concatenated to a 128-bit vector. 
7153 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 7154 "unexpected CONCAT_VECTORS"); 7155 SDLoc dl(Op); 7156 SDValue Val = DAG.getUNDEF(MVT::v2f64); 7157 SDValue Op0 = Op.getOperand(0); 7158 SDValue Op1 = Op.getOperand(1); 7159 if (!Op0.isUndef()) 7160 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7161 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 7162 DAG.getIntPtrConstant(0, dl)); 7163 if (!Op1.isUndef()) 7164 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7165 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 7166 DAG.getIntPtrConstant(1, dl)); 7167 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 7168 } 7169 7170 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 7171 /// element has been zero/sign-extended, depending on the isSigned parameter, 7172 /// from an integer type half its size. 7173 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 7174 bool isSigned) { 7175 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 7176 EVT VT = N->getValueType(0); 7177 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 7178 SDNode *BVN = N->getOperand(0).getNode(); 7179 if (BVN->getValueType(0) != MVT::v4i32 || 7180 BVN->getOpcode() != ISD::BUILD_VECTOR) 7181 return false; 7182 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0; 7183 unsigned HiElt = 1 - LoElt; 7184 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 7185 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 7186 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 7187 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 7188 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 7189 return false; 7190 if (isSigned) { 7191 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 7192 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 7193 return true; 7194 } else { 7195 if (Hi0->isNullValue() && Hi1->isNullValue()) 7196 return true; 7197 } 7198 return false; 7199 } 7200 7201 if (N->getOpcode() != ISD::BUILD_VECTOR) 7202 return false; 7203 7204 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 7205 SDNode *Elt = N->getOperand(i).getNode(); 7206 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 7207 unsigned EltSize = VT.getScalarSizeInBits(); 7208 unsigned HalfSize = EltSize / 2; 7209 if (isSigned) { 7210 if (!isIntN(HalfSize, C->getSExtValue())) 7211 return false; 7212 } else { 7213 if (!isUIntN(HalfSize, C->getZExtValue())) 7214 return false; 7215 } 7216 continue; 7217 } 7218 return false; 7219 } 7220 7221 return true; 7222 } 7223 7224 /// isSignExtended - Check if a node is a vector value that is sign-extended 7225 /// or a constant BUILD_VECTOR with sign-extended elements. 7226 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 7227 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 7228 return true; 7229 if (isExtendedBUILD_VECTOR(N, DAG, true)) 7230 return true; 7231 return false; 7232 } 7233 7234 /// isZeroExtended - Check if a node is a vector value that is zero-extended 7235 /// or a constant BUILD_VECTOR with zero-extended elements. 
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}

// Return the 64-bit vector type that OrigVT widens to (identity if OrigVT is
// already at least 64 bits wide).  Used to size VMULL operands.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
     return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}

/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                            const EVT &OrigTy,
                                            const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy.  It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total.  If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}

/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  // Explicit extension node: drop it and, if the unextended operand is
  // narrower than 64 bits, re-extend it only up to 64 bits.
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    // Rewire all users of the old extending load: the chain users get the
    // new load's chain, and the value users get an explicit extend of the
    // new load (VMULL itself will consume the unextended newLoad we return).
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    // Keep only the low half of each i64 element (endian-dependent slot).
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

/// isAddSubSExt - Check whether an ADD or SUB has both operands sign-extended
/// (each with a single use), so LowerMUL can distribute a multiply over it.
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    // Single uses required so the add/sub itself can be folded away.
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

/// isAddSubZExt - Like isAddSubSExt, but for zero-extended operands.
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  // Both operands sign-extended -> signed VMULL.
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    // Both operands zero-extended -> unsigned VMULL.
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize so the add/sub ends up in N0.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this.  It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Rebuild the original add/sub (N0->getOpcode()) over the two VMULLs.
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}

/// LowerSDIV_v4i8 - Lower a signed v4i16 division whose operands originally
/// came from v4i8 values, using a NEON reciprocal estimate in float.
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps.  This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}

/// LowerSDIV_v4i16 - Lower a signed v4i16 division via float reciprocal
/// estimate with one Newton refinement step.
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step.  This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

/// LowerSDIV - Custom lowering for signed v8i8/v4i16 division.  v8i8 is
/// widened to v8i16, split into two v4i16 halves, divided, and narrowed
/// back; v4i16 goes straight to LowerSDIV_v4i16.
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    // Split each widened operand into its high (index 4) and low (index 0)
    // v4i16 halves.
    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}

/// LowerUDIV - Custom lowering for unsigned v8i8/v4i16 division.
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Zero-extended u8 values fit in i16, so the signed v4i16 divide helper
    // can be reused on each half.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG);

    // Narrow the v8i16 quotients back to v8i8 (vqmovun-style saturation).
    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 sdiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}

/// LowerADDSUBCARRY - Lower ISD::ADDCARRY/ISD::SUBCARRY to ARMISD::ADDE/SUBE,
/// converting between the ISD boolean carry/borrow convention and the ARM
/// flags-based carry.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::SUBCARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}

/// LowerFSINCOS - Lower ISD::FSINCOS on Darwin to a __sincos_stret libcall;
/// on APCS targets the results come back via an sret stack slot, otherwise
/// directly in registers.
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(ArgTy, ArgTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    // The sret pointer is passed as an implicit first argument.
    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Entry.IsSRet = true;
    Args.push_back(Entry);
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args))
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  if (!ShouldUseSRet)
    return CallResult.first;

  // Reload the two results from the sret slot: sin at offset 0, cos just
  // after it.
  SDValue LoadSin =
      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos =
      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}

/// LowerWindowsDIVLibCall - Emit a call to the Windows RT division helper
/// (__rt_sdiv/__rt_udiv or their 64-bit variants) for the given operation.
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  const char *Name = nullptr;
  if (Signed)
    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  else
    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  ARMTargetLowering::ArgListTy Args;

  // Arguments are passed in reverse order (divisor first) — NOTE(review):
  // this matches the __rt_div helper argument order; confirm against the
  // Windows runtime ABI if changing.
  for (auto AI : {1, 0}) {
    ArgListEntry Arg;
    Arg.Node = Op.getOperand(AI);
    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Arg);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
                 ES, std::move(Args));

  return LowerCallTo(CLI).first;
}

/// LowerDIV_Windows - Lower an i32 division on Windows: insert a
/// divide-by-zero check on the denominator, then call the RT helper.
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                               DAG.getEntryNode(), Op.getOperand(1));

  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
}

static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
SDValue InChain) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  // i32 denominator: check it directly.
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  // i64 denominator: it is zero iff (lo | hi) is zero, so check the OR of
  // its two 32-bit halves.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
                           DAG.getConstant(1, DL, MVT::i32));
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}

/// ExpandDIV_Windows - Expand an i64 division on Windows into a checked RT
/// helper call, pushing the low and high 32-bit halves of the result into
/// Results.
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  // Split the i64 result into its two i32 halves.
  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(Lower);
  Results.push_back(Upper);
}

/// LowerAtomicLoadStore - Atomic loads/stores are legal only when monotonic
/// (or weaker); stronger orderings are left to be expanded.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();

  // Monotonic load/store is legal for all targets.
  return Op;
}

static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //   mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getConstant(15, DL, MVT::i32),
                    DAG.getConstant(0, DL, MVT::i32),
                    DAG.getConstant(9, DL, MVT::i32),
                    DAG.getConstant(13, DL, MVT::i32),
                    DAG.getConstant(0, DL, MVT::i32)
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  // READCYCLECOUNTER returns i64; zero-extend the 32-bit counter by pairing
  // it with a zero high half.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}

/// createGPRPairNode - Build a REG_SEQUENCE pairing the low and high 32-bit
/// halves of an i64 value into a GPRPair (swapped on big-endian).
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
      dl, MVT::i32);
  bool isBigEndian = DAG.getDataLayout().isBigEndian();
  if (isBigEndian)
    std::swap (VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}

/// ReplaceCMP_SWAP_64Results - Replace an i64 ATOMIC_CMP_SWAP with the
/// CMP_SWAP_64 pseudo, extracting the two i32 result halves.
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> & Results,
                                      SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  SDValue Ops[] = {N->getOperand(1),
createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  // Carry the original memory operand over to the pseudo instruction.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  bool isBigEndian = DAG.getDataLayout().isBigEndian();

  // Extract the low then high halves of the GPRPair result (endian-aware).
  Results.push_back(
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
  Results.push_back(
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
  Results.push_back(SDValue(CmpSwap, 2));
}

/// LowerFPOWI - On MSVCRT targets, lower ISD::FPOWI to a pow/powf libcall
/// with the integer exponent converted to floating point.
static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
                          SelectionDAG &DAG) {
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
         "Custom lowering is MSVCRT specific!");

  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  MVT Ty = Val->getSimpleValueType(0);
  // pow/powf take a floating-point exponent; convert the i32 power.
  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
  SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
                                         TLI.getPointerTy(DAG.getDataLayout()));

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Val;
  Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Entry.Node = Exponent;
  Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());

  // The in-chain to the call is the entry node.  If we are emitting a
  // tailcall, the chain will be mutated if the node has a non-entry input
  // chain.
  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;

  const Function &F = DAG.getMachineFunction().getFunction();
  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
              F.getReturnType() == LCRTy;
  if (IsTC)
    InChain = TCChain;

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
      .setTailCall(IsTC);
  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);

  // Return the chain (the DAG root) if it is a tail call
  return !CI.second.getNode() ? DAG.getRoot() : CI.first;
}

/// LowerOperation - Dispatch a custom-lowered node to its per-opcode
/// lowering routine.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  case ISD::VASTART: return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  case
ISD::UREM: return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC: return LowerVSETCC(Op, DAG);
  case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, DAG);
  case ISD::SDIV:
    // Scalar DIV on Windows goes through the checked RT helper call.
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    if (Subtarget->isTargetWindows())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}

/// ReplaceLongIntrinsic - Replace an SMLALD/SMLALDX/SMLSLD/SMLSLDX intrinsic
/// (which returns i64) with the corresponding ARMISD node producing two i32
/// results; other intrinsics are left untouched.
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  unsigned Opc = 0;
  if (IntNo == Intrinsic::arm_smlald)
    Opc = ARMISD::SMLALD;
  else if (IntNo == Intrinsic::arm_smlaldx)
    Opc = ARMISD::SMLALDX;
  else if (IntNo == Intrinsic::arm_smlsld)
    Opc = ARMISD::SMLSLD;
  else if (IntNo == Intrinsic::arm_smlsldx)
    Opc = ARMISD::SMLSLDX;
  else
    return;

  SDLoc dl(N);
  // Split the i64 accumulator (operand 3) into its two i32 halves.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                           N->getOperand(3),
                           DAG.getConstant(1, dl, MVT::i32));

  SDValue LongMul = DAG.getNode(Opc, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                N->getOperand(1), N->getOperand(2),
                                Lo, Hi);
  Results.push_back(LongMul.getValue(0));
  Results.push_back(LongMul.getValue(1));
}

/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  // Cases that produce a single merged value leave it in Res, which is pushed
  // onto Results after the switch.  Cases that populate Results themselves
  // (or delegate to a helper that does) return directly instead.
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    // LowerDivRem returns a node carrying both the quotient and remainder.
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    // Division is only custom-expanded for the Windows runtime helpers.
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    return ReplaceLongIntrinsic(N, Results, DAG);
  }
  // Push the merged value produced by the fall-through cases, if any.
  if (Res.getNode())
    Results.push_back(Res);
}

//===----------------------------------------------------------------------===//
//                           ARM Scheduler Hooks
//===----------------------------------------------------------------------===//

/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported with SjLj");
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function &F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  // The dispatch block's address is materialized via a PC-relative constant
  // pool entry; PCAdj accounts for the implicit PC offset of the add (4 in
  // Thumb modes, 8 in ARM mode).
  unsigned PCLabelId = AFI->createPICLabelUId();
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, 4);

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, 4);

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    // Set the low bit because of thumb mode.
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
        .addReg(NewVReg2, RegState::Kill)
        .addImm(PCLabelId);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
        .addReg(NewVReg3, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36)  // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
        .addReg(ARM::CPSR, RegState::Define)
        .addImm(1)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3, RegState::Kill)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
        .addFrameIndex(FI)
        .addImm(36); // &jbuf[1] :: pc
    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg5, RegState::Kill)
        .addImm(0)
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else {
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addImm(0)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId)
        .add(predOps(ARMCC::AL));
    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
        .addReg(NewVReg2, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36)  // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  }
}

/// EmitSjLjDispatchBlock - Build the SjLj exception-handling dispatch block:
/// it loads the call-site index from the function context, bounds-checks it
/// (trapping when out of range), and jumps through an inline jump table to
/// the matching landing pad.  All former landing pads are rewired to be
/// reached only via this dispatch block.
void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int FI = MFI.getFunctionContextIndex();

  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                        : &ARM::GPRnopcRegClass;

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
       ++BB) {
    if (!BB->isEHPad()) continue;

    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
    // pad.
    for (MachineBasicBlock::iterator
           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
      if (!II->isEHLabel()) continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MF->hasCallSiteLandingPad(Sym)) continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator
             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
      // Only the first EH_LABEL of each landing pad is considered.
      break;
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  // Call-site numbers start at 1; index 0 is not used.
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  // TrapBB is reached when the call-site index is out of range.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the MBBs.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers.  This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));

  bool IsPositionIndependent = isPositionIndependent();
  unsigned NumLPads = LPadList.size();
  if (Subtarget->isThumb2()) {
    // Load the call-site index stored at jbuf offset 4 in the context.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
          .addReg(NewVReg1)
          .addImm(LPadList.size())
          .add(predOps(ARMCC::AL));
    } else {
      // NumLPads doesn't fit an 8-bit immediate: materialize it with
      // movw (+ movt when the high half is non-zero) and compare registers.
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    }

    // Out-of-range index => trap.
    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    // Scale the index by 4 (entry size) while adding the table base.
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg1)
        .addJumpTableIndex(MJTI);
  } else if (Subtarget->isThumb()) {
    // Thumb1: tLDRspi scales its immediate by 4, so offset 1 == byte 4.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
        .addFrameIndex(FI)
        .addImm(1)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else {
      // Thumb1 has no movw/movt; load the bound from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
          .addReg(NewVReg1)
          .addReg(VReg1)
          .add(predOps(ARMCC::AL));
    }

    // Out-of-range index => trap.
    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    // Compute the jump-table slot address: base + index * 4.
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg1)
        .addImm(2)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);

    // Load the destination address out of the inline jump table.
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
        .addReg(NewVReg4, RegState::Kill)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg6 = NewVReg5;
    if (IsPositionIndependent) {
      // PIC jump tables hold offsets relative to the table base.
      NewVReg6 = MRI->createVirtualRegister(TRC);
      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
          .addReg(ARM::CPSR, RegState::Define)
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg3)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
        .addJumpTableIndex(MJTI);
  } else {
    // ARM mode.  Load the call-site index from the function context.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    } else {
      // No movw/movt available: load the bound from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg1, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    // Out-of-range index => trap.
    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
        .addMBB(TrapBB)
        .addImm(ARMCC::HI)
        .addReg(ARM::CPSR);

    // Scale the index by 4 to get the table byte offset.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg4)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    if (IsPositionIndependent) {
      // PIC jump tables hold offsets; add the table base before branching.
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg4)
          .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
          .addReg(NewVReg5, RegState::Kill)
          .addJumpTableIndex(MJTI);
    }
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  for (std::vector<MachineBasicBlock*>::iterator
         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock*, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
                                                  BB->succ_end());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      // Record the registers the call already mentions so we don't add them
      // twice.
      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        // Restrict the set to registers the current ISA can actually spill.
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      // Only the last call in the block (first found scanning backwards)
      // needs the markers.
      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

/// OtherSucc - Return the successor of \p MBB that is not \p Succ.  Asserts
/// that such a successor exists.
static
MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
                                        E = MBB->succ_end(); I != E; ++I)
    if (*I != Succ)
      return *I;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned.  Returns 0 when no opcode matches the size.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    // Thumb1 has no post-increment forms; callers add a separate increment.
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned.  Returns 0 when no opcode matches the size.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    // Thumb1 has no post-increment forms; callers add a separate increment.
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
/// Emit a post-increment load of \p LdSize bytes from \p AddrIn into \p Data,
/// leaving the advanced address in \p AddrOut.  The instructions are inserted
/// into \p BB at \p Pos.  Operand layouts differ per ISA, hence the branches.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    // NEON VLD1 with fixed writeback: AddrOut is the updated base.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // Thumb1 has no post-increment load: load, then update AddrIn separately.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    // Thumb2 post-indexed load: immediate offset is the increment.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed load: extra zero register operand for the (unused)
    // register-offset form.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}

/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
/// Emit a post-increment store of \p StSize bytes of \p Data to \p AddrIn,
/// leaving the advanced address in \p AddrOut.  The instructions are inserted
/// into \p BB at \p Pos.  Operand layouts differ per ISA, hence the branches.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    // NEON VST1 with fixed writeback: AddrOut is the updated base.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // Thumb1 has no post-increment store: store, then update AddrIn
    // separately.
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    // Thumb2 post-indexed store: immediate offset is the increment.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed store: extra zero register operand for the (unused)
    // register-offset form.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  }
}

MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  unsigned dest = MI.getOperand(0).getReg();
  unsigned src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Align = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  // Pick the widest copy unit the alignment permits.
  if (Align & 1) {
    UnitSize = 1;
  } else if (Align & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Align % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Align % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Small copy: emit a fully unrolled sequence of post-increment
    // load/store pairs.
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI.eraseFromParent();   // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  // ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd.
  unsigned varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt(*MF)) {
    // movw (+ movt when the high half of LoopSize is non-zero).
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
        .addImm(LoopSize & 0xFFFF)
        .add(predOps(ARMCC::AL));

    if ((LoopSize & 0xFFFF0000) != 0)
      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
          .addReg(Vtmp)
          .addImm(LoopSize >> 16)
          .add(predOps(ARMCC::AL));
  } else {
    // No movw/movt: load LoopSize from the constant pool instead.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
    if (Align == 0)
      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  unsigned varLoop = MRI.createVirtualRegister(TRC);
  unsigned varPhi = MRI.createVirtualRegister(TRC);
  unsigned srcLoop = MRI.createVirtualRegister(TRC);
  unsigned srcPhi = MRI.createVirtualRegister(TRC);
  unsigned destLoop = MRI.createVirtualRegister(TRC);
  unsigned destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
    .addReg(varLoop).addMBB(loopMBB)
    .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
    .addReg(srcLoop).addMBB(loopMBB)
    .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
    .addReg(destLoop).addMBB(loopMBB)
    .addReg(dest).addMBB(entryBB);

  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
       .addImm(UnitSize)
       .add(predOps(ARMCC::AL))
       .add(condCodeOp());
    // Force the S-bit: operand 5 is the optional CPSR def, which the
    // conditional branch below relies on.
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    unsigned srcOut = MRI.createVirtualRegister(TRC);
    unsigned destOut = MRI.createVirtualRegister(TRC);
    unsigned scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent();   // The instruction is gone now.
  return BB;
}

/// Lower the WIN__CHKSTK pseudo: emit a call to the Windows stack-probe
/// helper __chkstk and then subtract the probed byte count from SP.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Direct BL to __chkstk. R4 carries the word count in and the byte
    // adjustment out; R12 and CPSR are conservatively marked as dead defs.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__chkstk")
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large: {
    // Materialize the address of __chkstk and call through a register so the
    // call cannot go out of branch range.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
      .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
        .add(predOps(ARMCC::AL))
        .addReg(Reg, RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  // SP -= R4, i.e. carve the byte adjustment returned by __chkstk out of the
  // stack pointer.
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}

/// Lower the WIN__DBZCHK pseudo: compare the operand against zero and branch
/// to a trap block containing __brkdiv0 when it is zero, otherwise fall
/// through to the continuation block.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(),
             ContBB);
  // Everything after the check belongs in the continuation block.
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);

  // Out-of-line trap block: reached only when the divisor is zero.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  // if (Op == 0) goto TrapBB;
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}

/// Expand pseudo instructions that were marked for custom insertion during
/// isel.  Each case rewrites MI into real machine instructions, possibly
/// creating new basic blocks, and returns the block where insertion should
/// continue.
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();
  switch (MI.getOpcode()) {
  default: {
    MI.print(errs());
    llvm_unreachable("Unexpected instr type to insert");
  }

  // Thumb1 post-indexed loads are really just single-register LDMs.
  case ARM::tLDR_postidx: {
    MachineOperand Def(MI.getOperand(1));
    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
        .add(Def)               // Rn_wb
        .add(MI.getOperand(2))  // Rn
        .add(MI.getOperand(3))  // PredImm
        .add(MI.getOperand(4))  // PredReg
        .add(MI.getOperand(0)); // Rt
    MI.eraseFromParent();
    return BB;
  }

  // The Thumb2 pre-indexed stores have the same MI operands, they just
  // define them differently in the .td files from the isel patterns, so
  // they need pseudos.
  case ARM::t2STR_preidx:
    MI.setDesc(TII->get(ARM::t2STR_PRE));
    return BB;
  case ARM::t2STRB_preidx:
    MI.setDesc(TII->get(ARM::t2STRB_PRE));
    return BB;
  case ARM::t2STRH_preidx:
    MI.setDesc(TII->get(ARM::t2STRH_PRE));
    return BB;

  case ARM::STRi_preidx:
  case ARM::STRBi_preidx: {
    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
                                                         : ARM::STRB_PRE_IMM;
    // Decode the offset: the AM2 encoding packs the add/sub flag in with the
    // immediate, while the _PRE_IMM form takes a plain signed offset.
    unsigned Offset = MI.getOperand(4).getImm();
    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
    Offset = ARM_AM::getAM2Offset(Offset);
    if (isSub)
      Offset = -Offset;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    BuildMI(*BB, MI, dl, TII->get(NewOpc))
        .add(MI.getOperand(0)) // Rn_wb
        .add(MI.getOperand(1)) // Rt
        .add(MI.getOperand(2)) // Rn
        .addImm(Offset)        // offset (skip GPR==zero_reg)
        .add(MI.getOperand(5)) // pred
        .add(MI.getOperand(6))
        .addMemOperand(MMO);
    MI.eraseFromParent();
    return BB;
  }
  case ARM::STRr_preidx:
  case ARM::STRBr_preidx:
  case ARM::STRH_preidx: {
    unsigned NewOpc;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("unexpected opcode!");
    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
    }
    // The register forms have identical operand lists; copy them across.
    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
    for (unsigned i = 0; i < MI.getNumOperands(); ++i)
      MIB.add(MI.getOperand(i));
    MI.eraseFromParent();
    return BB;
  }

  case ARM::tMOVCCr_pseudo: {
    // To "insert" a SELECT_CC instruction, we actually have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // destination vreg to set, the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = ++BB->getIterator();

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineFunction *F = BB->getParent();
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    BuildMI(BB, dl, TII->get(ARM::tBcc))
        .addMBB(sinkMBB)
        .addImm(MI.getOperand(3).getImm())
        .addReg(MI.getOperand(4).getReg());

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
        .addReg(MI.getOperand(1).getReg())
        .addMBB(copy0MBB)
        .addReg(MI.getOperand(2).getReg())
        .addMBB(thisMBB);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  case ARM::BCCi64:
  case ARM::BCCZi64: {
    // If there is an unconditional branch to the other successor, remove it.
    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());

    // Compare both parts that make up the double comparison separately for
    // equality.  The second compare is predicated on the first being EQ, so
    // CPSR ends up EQ only when both halves match.
    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;

    unsigned LHS1 = MI.getOperand(1).getReg();
    unsigned LHS2 = MI.getOperand(2).getReg();
    if (RHSisZero) {
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
          .addReg(LHS1)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(LHS2).addImm(0)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    } else {
      unsigned RHS1 = MI.getOperand(3).getReg();
      unsigned RHS2 = MI.getOperand(4).getReg();
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
          .addReg(LHS1)
          .addReg(RHS1)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
        .addReg(LHS2).addReg(RHS2)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    }

    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
    // The compare sequence establishes EQ; for a NE branch, swap the targets.
    if (MI.getOperand(0).getImm() == ARMCC::NE)
      std::swap(destMBB, exitMBB);

    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
    if (isThumb2)
      BuildMI(BB, dl, TII->get(ARM::t2B))
          .addMBB(exitMBB)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  // The setjmp pseudos expand elsewhere; nothing to insert here.
  case ARM::Int_eh_sjlj_setjmp:
  case ARM::Int_eh_sjlj_setjmp_nofp:
  case ARM::tInt_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp_nofp:
    return BB;

  case ARM::Int_eh_sjlj_setup_dispatch:
    EmitSjLjDispatchBlock(MI, BB);
    return BB;

  case ARM::ABS:
  case ARM::t2ABS: {
    // To insert an ABS instruction, we have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // source vreg to test against 0, the destination vreg to set,
    // the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    // It transforms
    //     V1 = ABS V0
    // into
    //     V2 = MOVS V0
    //     BCC                      (branch to SinkBB if V0 >= 0)
    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
    //     SinkBB: V1 = PHI(V2, V3)
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator BBI = ++BB->getIterator();
    MachineFunction *Fn = BB->getParent();
    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    Fn->insert(BBI, RSBBB);
    Fn->insert(BBI, SinkBB);

    unsigned int ABSSrcReg = MI.getOperand(1).getReg();
    unsigned int ABSDstReg = MI.getOperand(0).getReg();
    bool ABSSrcKIll = MI.getOperand(1).isKill();
    bool isThumb2 = Subtarget->isThumb2();
    MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if source register is the SP or
    // PC and if destination register is the SP, so restrict register class
    unsigned NewRsbDstReg =
      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    SinkBB->splice(SinkBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // fall through to SinkMBB
    RSBBB->addSuccessor(SinkBB);

    // insert a cmp at the end of BB
    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(ABSSrcReg)
        .addImm(0)
        .add(predOps(ARMCC::AL));

    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
    BuildMI(BB, dl,
      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // insert rsbri in RSBBB
    // Note: BCC and rsbri will be converted into predicated rsbmi
    // by if-conversion pass
    BuildMI(*RSBBB, RSBBB->begin(), dl,
            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
        .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
        .addImm(0)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    // insert PHI in SinkBB,
    // reuse ABSDstReg to not change uses of ABS instruction
    BuildMI(*SinkBB, SinkBB->begin(), dl,
            TII->get(ARM::PHI), ABSDstReg)
        .addReg(NewRsbDstReg).addMBB(RSBBB)
        .addReg(ABSSrcReg).addMBB(BB);

    // remove ABS instruction
    MI.eraseFromParent();

    // return last added BB
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    ++NumLoopByVals;
    return EmitStructByval(MI, BB);
  case ARM::WIN__CHKSTK:
    return EmitLowered__chkstk(MI, BB);
  case ARM::WIN__DBZCHK:
    return EmitLowered__dbzchk(MI, BB);
  }
}

/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
                                    MachineInstr &MI, const SDNode *Node) {
  bool isThumb1 = Subtarget->isThumb1Only();

  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstrBuilder MIB(*MF, MI);

  // If the new dst/src is unused mark it as dead.
  if (!Node->hasAnyUseOfValue(0)) {
    MI.getOperand(0).setIsDead(true);
  }
  if (!Node->hasAnyUseOfValue(1)) {
    MI.getOperand(1).setIsDead(true);
  }

  // The MEMCPY both defines and kills the scratch registers.
  // Operand 4 carries the scratch-register count.
  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
                                                         : &ARM::GPRRegClass);
    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
  }
}

/// Post-isel hook: attach scratch registers to MEMCPY, and normalize the
/// optional cc_out operand on flag-setting add/sub style instructions.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                      SDNode *Node) const {
  if (MI.getOpcode() == ARM::MEMCPY) {
    attachMEMCPYScratchRegs(Subtarget, MI, Node);
    return;
  }

  const MCInstrDesc *MCID = &MI.getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  unsigned ccOutIdx;
  if (NewOpc) {
    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
    MCID = &TII->get(NewOpc);

    assert(MCID->getNumOperands() ==
           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
           && "converted opcode should be the same except for cc_out"
           " (and, on Thumb1, pred)");

    MI.setDesc(*MCID);

    // Add the optional cc_out operand
    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));

    // On Thumb1, move all input operands to the end, then add the predicate
    if (Subtarget->isThumb1Only()) {
      // Rotate the explicit inputs past the newly added cc_out by repeatedly
      // re-appending operand 1 and removing it from the front.
      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
        MI.addOperand(MI.getOperand(1));
        MI.RemoveOperand(1);
      }

      // Restore the ties: re-appending operands above dropped any TIED_TO
      // constraints, so re-establish them from the new descriptor.
      for (unsigned i = MI.getNumOperands(); i--;) {
        const MachineOperand& op = MI.getOperand(i);
        if (op.isReg() && op.isUse()) {
          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
          if (DefIdx != -1)
            MI.tieOperands(DefIdx, i);
        }
      }

      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
      ccOutIdx = 1;
    } else
      ccOutIdx = MCID->getNumOperands() - 1;
  } else
    ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
       ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      MI.RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    assert(!MI.getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    // Thumb1 instructions must have the S bit even if the CPSR is dead.
    if (!Subtarget->isThumb1Only())
      return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI.getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}

//===----------------------------------------------------------------------===//
//                           ARM Optimization Hooks
//===----------------------------------------------------------------------===//

// Helper function that checks if N is a null or all ones constant.
static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
}

// Return true if N is conditionally 0 or all ones.
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
9587 // OtherOp is set to the alternative value of N. 9588 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 9589 SDValue &CC, bool &Invert, 9590 SDValue &OtherOp, 9591 SelectionDAG &DAG) { 9592 switch (N->getOpcode()) { 9593 default: return false; 9594 case ISD::SELECT: { 9595 CC = N->getOperand(0); 9596 SDValue N1 = N->getOperand(1); 9597 SDValue N2 = N->getOperand(2); 9598 if (isZeroOrAllOnes(N1, AllOnes)) { 9599 Invert = false; 9600 OtherOp = N2; 9601 return true; 9602 } 9603 if (isZeroOrAllOnes(N2, AllOnes)) { 9604 Invert = true; 9605 OtherOp = N1; 9606 return true; 9607 } 9608 return false; 9609 } 9610 case ISD::ZERO_EXTEND: 9611 // (zext cc) can never be the all ones value. 9612 if (AllOnes) 9613 return false; 9614 LLVM_FALLTHROUGH; 9615 case ISD::SIGN_EXTEND: { 9616 SDLoc dl(N); 9617 EVT VT = N->getValueType(0); 9618 CC = N->getOperand(0); 9619 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 9620 return false; 9621 Invert = !AllOnes; 9622 if (AllOnes) 9623 // When looking for an AllOnes constant, N is an sext, and the 'other' 9624 // value is 0. 9625 OtherOp = DAG.getConstant(0, dl, VT); 9626 else if (N->getOpcode() == ISD::ZERO_EXTEND) 9627 // When looking for a 0 constant, N can be zext or sext. 9628 OtherOp = DAG.getConstant(1, dl, VT); 9629 else 9630 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 9631 VT); 9632 return true; 9633 } 9634 } 9635 } 9636 9637 // Combine a constant select operand into its use: 9638 // 9639 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9640 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9641 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 9642 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9643 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9644 // 9645 // The transform is rejected if the select doesn't have a constant operand that 9646 // is null, or all ones when AllOnes is set. 
9647 // 9648 // Also recognize sext/zext from i1: 9649 // 9650 // (add (zext cc), x) -> (select cc (add x, 1), x) 9651 // (add (sext cc), x) -> (select cc (add x, -1), x) 9652 // 9653 // These transformations eventually create predicated instructions. 9654 // 9655 // @param N The node to transform. 9656 // @param Slct The N operand that is a select. 9657 // @param OtherOp The other N operand (x above). 9658 // @param DCI Context. 9659 // @param AllOnes Require the select constant to be all ones instead of null. 9660 // @returns The new node, or SDValue() on failure. 9661 static 9662 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 9663 TargetLowering::DAGCombinerInfo &DCI, 9664 bool AllOnes = false) { 9665 SelectionDAG &DAG = DCI.DAG; 9666 EVT VT = N->getValueType(0); 9667 SDValue NonConstantVal; 9668 SDValue CCOp; 9669 bool SwapSelectOps; 9670 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 9671 NonConstantVal, DAG)) 9672 return SDValue(); 9673 9674 // Slct is now know to be the desired identity constant when CC is true. 9675 SDValue TrueVal = OtherOp; 9676 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 9677 OtherOp, NonConstantVal); 9678 // Unless SwapSelectOps says CC should be false. 9679 if (SwapSelectOps) 9680 std::swap(TrueVal, FalseVal); 9681 9682 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 9683 CCOp, TrueVal, FalseVal); 9684 } 9685 9686 // Attempt combineSelectAndUse on each operand of a commutative operator N. 
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  // Only fold a select with a single use, otherwise it stays live anyway.
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}

// Return true if N is a node that the VPADD/VPADDL combines below treat as a
// VUZP shuffle.
static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}

// Fold ADD of the two results of one VUZP into a single VPADD intrinsic.
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1): the operands must be the two distinct
  // result values of the same unzip node.
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// Fold ADD of two identically extended results of one VUZP into a VPADDL
// (pairwise add long) intrinsic.
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.  Both must be sign extends or both
  // zero extends; that choice picks the signed/unsigned intrinsic below.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  // VPADDL consumes the concatenation of the unzip's two inputs.
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
// Fold an ADD of two BUILD_VECTORs that gather the even and odd lanes of one
// vector into a single VPADDL (pairwise add long) intrinsic.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand to the ADD which is a BUILD_VECTOR,
  // check to see if each of their operands is an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  // VPADDL produces the widened type; extend or truncate back to VT.
  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

// Return V if it is a [SU]MUL_LOHI node, otherwise a null SDValue.
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

// Try to fold a 64-bit multiply-accumulate of 16-bit halves into one of the
// SMLALBB/SMLALBT/SMLALTB/SMLALTT nodes.
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  // These instructions require DSP on Thumb targets, or ARMv5TE otherwise.
  if (Subtarget->isThumb()) {
    if (!Subtarget->hasDSP())
      return SDValue();
  } else if (!Subtarget->hasV5TEOps())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulates the product into a 64-bit value.
The 16-bit values will 9909 // be sign extended somehow or SRA'd into 32-bit values 9910 // (addc (adde (mul 16bit, 16bit), lo), hi) 9911 SDValue Mul = AddcNode->getOperand(0); 9912 SDValue Lo = AddcNode->getOperand(1); 9913 if (Mul.getOpcode() != ISD::MUL) { 9914 Lo = AddcNode->getOperand(0); 9915 Mul = AddcNode->getOperand(1); 9916 if (Mul.getOpcode() != ISD::MUL) 9917 return SDValue(); 9918 } 9919 9920 SDValue SRA = AddeNode->getOperand(0); 9921 SDValue Hi = AddeNode->getOperand(1); 9922 if (SRA.getOpcode() != ISD::SRA) { 9923 SRA = AddeNode->getOperand(1); 9924 Hi = AddeNode->getOperand(0); 9925 if (SRA.getOpcode() != ISD::SRA) 9926 return SDValue(); 9927 } 9928 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 9929 if (Const->getZExtValue() != 31) 9930 return SDValue(); 9931 } else 9932 return SDValue(); 9933 9934 if (SRA.getOperand(0) != Mul) 9935 return SDValue(); 9936 9937 SelectionDAG &DAG = DCI.DAG; 9938 SDLoc dl(AddcNode); 9939 unsigned Opcode = 0; 9940 SDValue Op0; 9941 SDValue Op1; 9942 9943 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 9944 Opcode = ARMISD::SMLALBB; 9945 Op0 = Mul.getOperand(0); 9946 Op1 = Mul.getOperand(1); 9947 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 9948 Opcode = ARMISD::SMLALBT; 9949 Op0 = Mul.getOperand(0); 9950 Op1 = Mul.getOperand(1).getOperand(0); 9951 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 9952 Opcode = ARMISD::SMLALTB; 9953 Op0 = Mul.getOperand(0).getOperand(0); 9954 Op1 = Mul.getOperand(1); 9955 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 9956 Opcode = ARMISD::SMLALTT; 9957 Op0 = Mul->getOperand(0).getOperand(0); 9958 Op1 = Mul->getOperand(1).getOperand(0); 9959 } 9960 9961 if (!Op0 || !Op1) 9962 return SDValue(); 9963 9964 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 9965 Op0, Op1, Lo, Hi); 9966 // Replace the ADDs' nodes uses by the MLA node's values. 
9967 SDValue HiMLALResult(SMLAL.getNode(), 1); 9968 SDValue LoMLALResult(SMLAL.getNode(), 0); 9969 9970 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 9971 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 9972 9973 // Return original node to notify the driver to stop replacing. 9974 SDValue resNode(AddcNode, 0); 9975 return resNode; 9976 } 9977 9978 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 9979 TargetLowering::DAGCombinerInfo &DCI, 9980 const ARMSubtarget *Subtarget) { 9981 // Look for multiply add opportunities. 9982 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 9983 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 9984 // a glue link from the first add to the second add. 9985 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 9986 // a S/UMLAL instruction. 9987 // UMUL_LOHI 9988 // / :lo \ :hi 9989 // V \ [no multiline comment] 9990 // loAdd -> ADDC | 9991 // \ :carry / 9992 // V V 9993 // ADDE <- hiAdd 9994 // 9995 // In the special case where only the higher part of a signed result is used 9996 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 9997 // a constant with the exact value of 0x80000000, we recognize we are dealing 9998 // with a "rounded multiply and add" (or subtract) and transform it into 9999 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 10000 10001 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 10002 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 10003 "Expect an ADDE or SUBE"); 10004 10005 assert(AddeSubeNode->getNumOperands() == 3 && 10006 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 10007 "ADDE node has the wrong inputs"); 10008 10009 // Check that we are chained to the right ADDC or SUBC node. 
10010 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 10011 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 10012 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 10013 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 10014 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 10015 return SDValue(); 10016 10017 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 10018 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 10019 10020 // Check if the two operands are from the same mul_lohi node. 10021 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 10022 return SDValue(); 10023 10024 assert(AddcSubcNode->getNumValues() == 2 && 10025 AddcSubcNode->getValueType(0) == MVT::i32 && 10026 "Expect ADDC with two result values. First: i32"); 10027 10028 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 10029 // maybe a SMLAL which multiplies two 16-bit values. 10030 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 10031 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 10032 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 10033 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 10034 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 10035 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 10036 10037 // Check for the triangle shape. 10038 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 10039 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 10040 10041 // Make sure that the ADDE/SUBE operands are not coming from the same node. 10042 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 10043 return SDValue(); 10044 10045 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 10046 bool IsLeftOperandMUL = false; 10047 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 10048 if (MULOp == SDValue()) 10049 MULOp = findMUL_LOHI(AddeSubeOp1); 10050 else 10051 IsLeftOperandMUL = true; 10052 if (MULOp == SDValue()) 10053 return SDValue(); 10054 10055 // Figure out the right opcode. 
10056 unsigned Opc = MULOp->getOpcode(); 10057 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 10058 10059 // Figure out the high and low input values to the MLAL node. 10060 SDValue *HiAddSub = nullptr; 10061 SDValue *LoMul = nullptr; 10062 SDValue *LowAddSub = nullptr; 10063 10064 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 10065 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 10066 return SDValue(); 10067 10068 if (IsLeftOperandMUL) 10069 HiAddSub = &AddeSubeOp1; 10070 else 10071 HiAddSub = &AddeSubeOp0; 10072 10073 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 10074 // whose low result is fed to the ADDC/SUBC we are checking. 10075 10076 if (AddcSubcOp0 == MULOp.getValue(0)) { 10077 LoMul = &AddcSubcOp0; 10078 LowAddSub = &AddcSubcOp1; 10079 } 10080 if (AddcSubcOp1 == MULOp.getValue(0)) { 10081 LoMul = &AddcSubcOp1; 10082 LowAddSub = &AddcSubcOp0; 10083 } 10084 10085 if (!LoMul) 10086 return SDValue(); 10087 10088 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 10089 // the replacement below will create a cycle. 10090 if (AddcSubcNode == HiAddSub->getNode() || 10091 AddcSubcNode->isPredecessorOf(HiAddSub->getNode())) 10092 return SDValue(); 10093 10094 // Create the merged node. 10095 SelectionDAG &DAG = DCI.DAG; 10096 10097 // Start building operand list. 10098 SmallVector<SDValue, 8> Ops; 10099 Ops.push_back(LoMul->getOperand(0)); 10100 Ops.push_back(LoMul->getOperand(1)); 10101 10102 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be 10103 // the case, we must be doing signed multiplication and only use the higher 10104 // part of the result of the MLAL, furthermore the LowAddSub must be a constant 10105 // addition or subtraction with the value of 0x800000. 
10106 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() && 10107 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) && 10108 LowAddSub->getNode()->getOpcode() == ISD::Constant && 10109 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() == 10110 0x80000000) { 10111 Ops.push_back(*HiAddSub); 10112 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) { 10113 FinalOpc = ARMISD::SMMLSR; 10114 } else { 10115 FinalOpc = ARMISD::SMMLAR; 10116 } 10117 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops); 10118 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode); 10119 10120 return SDValue(AddeSubeNode, 0); 10121 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC) 10122 // SMMLS is generated during instruction selection and the rest of this 10123 // function can not handle the case where AddcSubcNode is a SUBC. 10124 return SDValue(); 10125 10126 // Finish building the operand list for {U/S}MLAL 10127 Ops.push_back(*LowAddSub); 10128 Ops.push_back(*HiAddSub); 10129 10130 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), 10131 DAG.getVTList(MVT::i32, MVT::i32), Ops); 10132 10133 // Replace the ADDs' nodes uses by the MLA node's values. 10134 SDValue HiMLALResult(MLALNode.getNode(), 1); 10135 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult); 10136 10137 SDValue LoMLALResult(MLALNode.getNode(), 0); 10138 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult); 10139 10140 // Return original node to notify the driver to stop replacing. 10141 return SDValue(AddeSubeNode, 0); 10142 } 10143 10144 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, 10145 TargetLowering::DAGCombinerInfo &DCI, 10146 const ARMSubtarget *Subtarget) { 10147 // UMAAL is similar to UMLAL except that it adds two unsigned values. 10148 // While trying to combine for the other MLAL nodes, first search for the 10149 // chance to use UMAAL. 
Check if Addc uses a node which has already 10150 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde 10151 // as the addend, and it's handled in PerformUMLALCombine. 10152 10153 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 10154 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 10155 10156 // Check that we have a glued ADDC node. 10157 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 10158 if (AddcNode->getOpcode() != ARMISD::ADDC) 10159 return SDValue(); 10160 10161 // Find the converted UMAAL or quit if it doesn't exist. 10162 SDNode *UmlalNode = nullptr; 10163 SDValue AddHi; 10164 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 10165 UmlalNode = AddcNode->getOperand(0).getNode(); 10166 AddHi = AddcNode->getOperand(1); 10167 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 10168 UmlalNode = AddcNode->getOperand(1).getNode(); 10169 AddHi = AddcNode->getOperand(0); 10170 } else { 10171 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 10172 } 10173 10174 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 10175 // the ADDC as well as Zero. 10176 if (!isNullConstant(UmlalNode->getOperand(3))) 10177 return SDValue(); 10178 10179 if ((isNullConstant(AddeNode->getOperand(0)) && 10180 AddeNode->getOperand(1).getNode() == UmlalNode) || 10181 (AddeNode->getOperand(0).getNode() == UmlalNode && 10182 isNullConstant(AddeNode->getOperand(1)))) { 10183 SelectionDAG &DAG = DCI.DAG; 10184 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 10185 UmlalNode->getOperand(2), AddHi }; 10186 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 10187 DAG.getVTList(MVT::i32, MVT::i32), Ops); 10188 10189 // Replace the ADDs' nodes uses by the UMAAL node's values. 
10190 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 10191 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 10192 10193 // Return original node to notify the driver to stop replacing. 10194 return SDValue(AddeNode, 0); 10195 } 10196 return SDValue(); 10197 } 10198 10199 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 10200 const ARMSubtarget *Subtarget) { 10201 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 10202 return SDValue(); 10203 10204 // Check that we have a pair of ADDC and ADDE as operands. 10205 // Both addends of the ADDE must be zero. 10206 SDNode* AddcNode = N->getOperand(2).getNode(); 10207 SDNode* AddeNode = N->getOperand(3).getNode(); 10208 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 10209 (AddeNode->getOpcode() == ARMISD::ADDE) && 10210 isNullConstant(AddeNode->getOperand(0)) && 10211 isNullConstant(AddeNode->getOperand(1)) && 10212 (AddeNode->getOperand(2).getNode() == AddcNode)) 10213 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 10214 DAG.getVTList(MVT::i32, MVT::i32), 10215 {N->getOperand(0), N->getOperand(1), 10216 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 10217 else 10218 return SDValue(); 10219 } 10220 10221 static SDValue PerformAddcSubcCombine(SDNode *N, 10222 TargetLowering::DAGCombinerInfo &DCI, 10223 const ARMSubtarget *Subtarget) { 10224 SelectionDAG &DAG(DCI.DAG); 10225 10226 if (N->getOpcode() == ARMISD::SUBC) { 10227 // (SUBC (ADDE 0, 0, C), 1) -> C 10228 SDValue LHS = N->getOperand(0); 10229 SDValue RHS = N->getOperand(1); 10230 if (LHS->getOpcode() == ARMISD::ADDE && 10231 isNullConstant(LHS->getOperand(0)) && 10232 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 10233 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 10234 } 10235 } 10236 10237 if (Subtarget->isThumb1Only()) { 10238 SDValue RHS = N->getOperand(1); 10239 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10240 int32_t imm = 
C->getSExtValue(); 10241 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 10242 SDLoc DL(N); 10243 RHS = DAG.getConstant(-imm, DL, MVT::i32); 10244 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 10245 : ARMISD::ADDC; 10246 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 10247 } 10248 } 10249 } 10250 10251 return SDValue(); 10252 } 10253 10254 static SDValue PerformAddeSubeCombine(SDNode *N, 10255 TargetLowering::DAGCombinerInfo &DCI, 10256 const ARMSubtarget *Subtarget) { 10257 if (Subtarget->isThumb1Only()) { 10258 SelectionDAG &DAG = DCI.DAG; 10259 SDValue RHS = N->getOperand(1); 10260 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10261 int64_t imm = C->getSExtValue(); 10262 if (imm < 0) { 10263 SDLoc DL(N); 10264 10265 // The with-carry-in form matches bitwise not instead of the negation. 10266 // Effectively, the inverse interpretation of the carry flag already 10267 // accounts for part of the negation. 10268 RHS = DAG.getConstant(~imm, DL, MVT::i32); 10269 10270 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 10271 : ARMISD::ADDE; 10272 return DAG.getNode(Opcode, DL, N->getVTList(), 10273 N->getOperand(0), RHS, N->getOperand(2)); 10274 } 10275 } 10276 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 10277 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 10278 } 10279 return SDValue(); 10280 } 10281 10282 /// PerformADDECombine - Target-specific dag combine transform from 10283 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 10284 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 10285 static SDValue PerformADDECombine(SDNode *N, 10286 TargetLowering::DAGCombinerInfo &DCI, 10287 const ARMSubtarget *Subtarget) { 10288 // Only ARM and Thumb2 support UMLAL/SMLAL. 10289 if (Subtarget->isThumb1Only()) 10290 return PerformAddeSubeCombine(N, DCI, Subtarget); 10291 10292 // Only perform the checks after legalize when the pattern is available. 
10293 if (DCI.isBeforeLegalize()) return SDValue(); 10294 10295 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 10296 } 10297 10298 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 10299 /// operands N0 and N1. This is a helper for PerformADDCombine that is 10300 /// called with the default operands, and if that fails, with commuted 10301 /// operands. 10302 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 10303 TargetLowering::DAGCombinerInfo &DCI, 10304 const ARMSubtarget *Subtarget){ 10305 // Attempt to create vpadd for this add. 10306 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 10307 return Result; 10308 10309 // Attempt to create vpaddl for this add. 10310 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 10311 return Result; 10312 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 10313 Subtarget)) 10314 return Result; 10315 10316 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 10317 if (N0.getNode()->hasOneUse()) 10318 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 10319 return Result; 10320 return SDValue(); 10321 } 10322 10323 bool 10324 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 10325 CombineLevel Level) const { 10326 if (Level == BeforeLegalizeTypes) 10327 return true; 10328 10329 if (Subtarget->isThumb() && Subtarget->isThumb1Only()) 10330 return true; 10331 10332 if (N->getOpcode() != ISD::SHL) 10333 return true; 10334 10335 // Turn off commute-with-shift transform after legalization, so it doesn't 10336 // conflict with PerformSHLSimplify. (We could try to detect when 10337 // PerformSHLSimplify would trigger more precisely, but it isn't 10338 // really necessary.) 
10339 return false; 10340 } 10341 10342 static SDValue PerformSHLSimplify(SDNode *N, 10343 TargetLowering::DAGCombinerInfo &DCI, 10344 const ARMSubtarget *ST) { 10345 // Allow the generic combiner to identify potential bswaps. 10346 if (DCI.isBeforeLegalize()) 10347 return SDValue(); 10348 10349 // DAG combiner will fold: 10350 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 10351 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 10352 // Other code patterns that can be also be modified have the following form: 10353 // b + ((a << 1) | 510) 10354 // b + ((a << 1) & 510) 10355 // b + ((a << 1) ^ 510) 10356 // b + ((a << 1) + 510) 10357 10358 // Many instructions can perform the shift for free, but it requires both 10359 // the operands to be registers. If c1 << c2 is too large, a mov immediate 10360 // instruction will needed. So, unfold back to the original pattern if: 10361 // - if c1 and c2 are small enough that they don't require mov imms. 10362 // - the user(s) of the node can perform an shl 10363 10364 // No shifted operands for 16-bit instructions. 10365 if (ST->isThumb() && ST->isThumb1Only()) 10366 return SDValue(); 10367 10368 // Check that all the users could perform the shl themselves. 10369 for (auto U : N->uses()) { 10370 switch(U->getOpcode()) { 10371 default: 10372 return SDValue(); 10373 case ISD::SUB: 10374 case ISD::ADD: 10375 case ISD::AND: 10376 case ISD::OR: 10377 case ISD::XOR: 10378 case ISD::SETCC: 10379 case ARMISD::CMP: 10380 // Check that the user isn't already using a constant because there 10381 // aren't any instructions that support an immediate operand and a 10382 // shifted operand. 10383 if (isa<ConstantSDNode>(U->getOperand(0)) || 10384 isa<ConstantSDNode>(U->getOperand(1))) 10385 return SDValue(); 10386 10387 // Check that it's not already using a shift. 
10388 if (U->getOperand(0).getOpcode() == ISD::SHL || 10389 U->getOperand(1).getOpcode() == ISD::SHL) 10390 return SDValue(); 10391 break; 10392 } 10393 } 10394 10395 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 10396 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 10397 return SDValue(); 10398 10399 if (N->getOperand(0).getOpcode() != ISD::SHL) 10400 return SDValue(); 10401 10402 SDValue SHL = N->getOperand(0); 10403 10404 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10405 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 10406 if (!C1ShlC2 || !C2) 10407 return SDValue(); 10408 10409 APInt C2Int = C2->getAPIntValue(); 10410 APInt C1Int = C1ShlC2->getAPIntValue(); 10411 10412 // Check that performing a lshr will not lose any information. 10413 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 10414 C2Int.getBitWidth() - C2->getZExtValue()); 10415 if ((C1Int & Mask) != C1Int) 10416 return SDValue(); 10417 10418 // Shift the first constant. 10419 C1Int.lshrInPlace(C2Int); 10420 10421 // The immediates are encoded as an 8-bit value that can be rotated. 10422 auto LargeImm = [](const APInt &Imm) { 10423 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 10424 return Imm.getBitWidth() - Zeros > 8; 10425 }; 10426 10427 if (LargeImm(C1Int) || LargeImm(C2Int)) 10428 return SDValue(); 10429 10430 SelectionDAG &DAG = DCI.DAG; 10431 SDLoc dl(N); 10432 SDValue X = SHL.getOperand(0); 10433 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 10434 DAG.getConstant(C1Int, dl, MVT::i32)); 10435 // Shift left to compensate for the lshr of C1Int. 
10436 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 10437 10438 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 10439 SHL.dump(); N->dump()); 10440 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 10441 return Res; 10442 } 10443 10444 10445 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 10446 /// 10447 static SDValue PerformADDCombine(SDNode *N, 10448 TargetLowering::DAGCombinerInfo &DCI, 10449 const ARMSubtarget *Subtarget) { 10450 SDValue N0 = N->getOperand(0); 10451 SDValue N1 = N->getOperand(1); 10452 10453 // Only works one way, because it needs an immediate operand. 10454 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 10455 return Result; 10456 10457 // First try with the default operand order. 10458 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 10459 return Result; 10460 10461 // If that didn't work, try again with the operands commuted. 10462 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 10463 } 10464 10465 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 10466 /// 10467 static SDValue PerformSUBCombine(SDNode *N, 10468 TargetLowering::DAGCombinerInfo &DCI) { 10469 SDValue N0 = N->getOperand(0); 10470 SDValue N1 = N->getOperand(1); 10471 10472 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 10473 if (N1.getNode()->hasOneUse()) 10474 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 10475 return Result; 10476 10477 return SDValue(); 10478 } 10479 10480 /// PerformVMULCombine 10481 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 10482 /// special multiplier accumulator forwarding. 
10483 /// vmul d3, d0, d2 10484 /// vmla d3, d1, d2 10485 /// is faster than 10486 /// vadd d3, d0, d1 10487 /// vmul d3, d3, d2 10488 // However, for (A + B) * (A + B), 10489 // vadd d2, d0, d1 10490 // vmul d3, d0, d2 10491 // vmla d3, d1, d2 10492 // is slower than 10493 // vadd d2, d0, d1 10494 // vmul d3, d2, d2 10495 static SDValue PerformVMULCombine(SDNode *N, 10496 TargetLowering::DAGCombinerInfo &DCI, 10497 const ARMSubtarget *Subtarget) { 10498 if (!Subtarget->hasVMLxForwarding()) 10499 return SDValue(); 10500 10501 SelectionDAG &DAG = DCI.DAG; 10502 SDValue N0 = N->getOperand(0); 10503 SDValue N1 = N->getOperand(1); 10504 unsigned Opcode = N0.getOpcode(); 10505 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10506 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 10507 Opcode = N1.getOpcode(); 10508 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10509 Opcode != ISD::FADD && Opcode != ISD::FSUB) 10510 return SDValue(); 10511 std::swap(N0, N1); 10512 } 10513 10514 if (N0 == N1) 10515 return SDValue(); 10516 10517 EVT VT = N->getValueType(0); 10518 SDLoc DL(N); 10519 SDValue N00 = N0->getOperand(0); 10520 SDValue N01 = N0->getOperand(1); 10521 return DAG.getNode(Opcode, DL, VT, 10522 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 10523 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 10524 } 10525 10526 static SDValue PerformMULCombine(SDNode *N, 10527 TargetLowering::DAGCombinerInfo &DCI, 10528 const ARMSubtarget *Subtarget) { 10529 SelectionDAG &DAG = DCI.DAG; 10530 10531 if (Subtarget->isThumb1Only()) 10532 return SDValue(); 10533 10534 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10535 return SDValue(); 10536 10537 EVT VT = N->getValueType(0); 10538 if (VT.is64BitVector() || VT.is128BitVector()) 10539 return PerformVMULCombine(N, DCI, Subtarget); 10540 if (VT != MVT::i32) 10541 return SDValue(); 10542 10543 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10544 if (!C) 10545 return SDValue(); 10546 10547 int64_t MulAmt = C->getSExtValue(); 
10548 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 10549 10550 ShiftAmt = ShiftAmt & (32 - 1); 10551 SDValue V = N->getOperand(0); 10552 SDLoc DL(N); 10553 10554 SDValue Res; 10555 MulAmt >>= ShiftAmt; 10556 10557 if (MulAmt >= 0) { 10558 if (isPowerOf2_32(MulAmt - 1)) { 10559 // (mul x, 2^N + 1) => (add (shl x, N), x) 10560 Res = DAG.getNode(ISD::ADD, DL, VT, 10561 V, 10562 DAG.getNode(ISD::SHL, DL, VT, 10563 V, 10564 DAG.getConstant(Log2_32(MulAmt - 1), DL, 10565 MVT::i32))); 10566 } else if (isPowerOf2_32(MulAmt + 1)) { 10567 // (mul x, 2^N - 1) => (sub (shl x, N), x) 10568 Res = DAG.getNode(ISD::SUB, DL, VT, 10569 DAG.getNode(ISD::SHL, DL, VT, 10570 V, 10571 DAG.getConstant(Log2_32(MulAmt + 1), DL, 10572 MVT::i32)), 10573 V); 10574 } else 10575 return SDValue(); 10576 } else { 10577 uint64_t MulAmtAbs = -MulAmt; 10578 if (isPowerOf2_32(MulAmtAbs + 1)) { 10579 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 10580 Res = DAG.getNode(ISD::SUB, DL, VT, 10581 V, 10582 DAG.getNode(ISD::SHL, DL, VT, 10583 V, 10584 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 10585 MVT::i32))); 10586 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 10587 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 10588 Res = DAG.getNode(ISD::ADD, DL, VT, 10589 V, 10590 DAG.getNode(ISD::SHL, DL, VT, 10591 V, 10592 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 10593 MVT::i32))); 10594 Res = DAG.getNode(ISD::SUB, DL, VT, 10595 DAG.getConstant(0, DL, MVT::i32), Res); 10596 } else 10597 return SDValue(); 10598 } 10599 10600 if (ShiftAmt != 0) 10601 Res = DAG.getNode(ISD::SHL, DL, VT, 10602 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 10603 10604 // Do not add new nodes to DAG combiner worklist. 10605 DCI.CombineTo(N, Res, false); 10606 return SDValue(); 10607 } 10608 10609 static SDValue CombineANDShift(SDNode *N, 10610 TargetLowering::DAGCombinerInfo &DCI, 10611 const ARMSubtarget *Subtarget) { 10612 // Allow DAGCombine to pattern-match before we touch the canonical form. 
10613 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10614 return SDValue(); 10615 10616 if (N->getValueType(0) != MVT::i32) 10617 return SDValue(); 10618 10619 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10620 if (!N1C) 10621 return SDValue(); 10622 10623 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 10624 // Don't transform uxtb/uxth. 10625 if (C1 == 255 || C1 == 65535) 10626 return SDValue(); 10627 10628 SDNode *N0 = N->getOperand(0).getNode(); 10629 if (!N0->hasOneUse()) 10630 return SDValue(); 10631 10632 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 10633 return SDValue(); 10634 10635 bool LeftShift = N0->getOpcode() == ISD::SHL; 10636 10637 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 10638 if (!N01C) 10639 return SDValue(); 10640 10641 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 10642 if (!C2 || C2 >= 32) 10643 return SDValue(); 10644 10645 SelectionDAG &DAG = DCI.DAG; 10646 SDLoc DL(N); 10647 10648 // We have a pattern of the form "(and (shl x, c2) c1)" or 10649 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 10650 // transform to a pair of shifts, to save materializing c1. 10651 10652 // First pattern: right shift, and c1+1 is a power of two. 10653 // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power 10654 // of two). 10655 // FIXME: Use demanded bits? 10656 if (!LeftShift && isMask_32(C1)) { 10657 uint32_t C3 = countLeadingZeros(C1); 10658 if (C2 < C3) { 10659 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 10660 DAG.getConstant(C3 - C2, DL, MVT::i32)); 10661 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 10662 DAG.getConstant(C3, DL, MVT::i32)); 10663 } 10664 } 10665 10666 // Second pattern: left shift, and (c1>>c2)+1 is a power of two. 10667 // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1 10668 // is a power of two). 10669 // FIXME: Use demanded bits? 
10670 if (LeftShift && isShiftedMask_32(C1)) { 10671 uint32_t C3 = countLeadingZeros(C1); 10672 if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) { 10673 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 10674 DAG.getConstant(C2 + C3, DL, MVT::i32)); 10675 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 10676 DAG.getConstant(C3, DL, MVT::i32)); 10677 } 10678 } 10679 10680 // FIXME: Transform "(and (shl x, c2) c1)" -> 10681 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 10682 // c1. 10683 return SDValue(); 10684 } 10685 10686 static SDValue PerformANDCombine(SDNode *N, 10687 TargetLowering::DAGCombinerInfo &DCI, 10688 const ARMSubtarget *Subtarget) { 10689 // Attempt to use immediate-form VBIC 10690 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 10691 SDLoc dl(N); 10692 EVT VT = N->getValueType(0); 10693 SelectionDAG &DAG = DCI.DAG; 10694 10695 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10696 return SDValue(); 10697 10698 APInt SplatBits, SplatUndef; 10699 unsigned SplatBitSize; 10700 bool HasAnyUndefs; 10701 if (BVN && 10702 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 10703 if (SplatBitSize <= 64) { 10704 EVT VbicVT; 10705 SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), 10706 SplatUndef.getZExtValue(), SplatBitSize, 10707 DAG, dl, VbicVT, VT.is128BitVector(), 10708 OtherModImm); 10709 if (Val.getNode()) { 10710 SDValue Input = 10711 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 10712 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 10713 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 10714 } 10715 } 10716 } 10717 10718 if (!Subtarget->isThumb1Only()) { 10719 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 10720 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 10721 return Result; 10722 10723 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 10724 return Result; 
10725 } 10726 10727 if (Subtarget->isThumb1Only()) 10728 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 10729 return Result; 10730 10731 return SDValue(); 10732 } 10733 10734 // Try combining OR nodes to SMULWB, SMULWT. 10735 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 10736 TargetLowering::DAGCombinerInfo &DCI, 10737 const ARMSubtarget *Subtarget) { 10738 if (!Subtarget->hasV6Ops() || 10739 (Subtarget->isThumb() && 10740 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 10741 return SDValue(); 10742 10743 SDValue SRL = OR->getOperand(0); 10744 SDValue SHL = OR->getOperand(1); 10745 10746 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 10747 SRL = OR->getOperand(1); 10748 SHL = OR->getOperand(0); 10749 } 10750 if (!isSRL16(SRL) || !isSHL16(SHL)) 10751 return SDValue(); 10752 10753 // The first operands to the shifts need to be the two results from the 10754 // same smul_lohi node. 10755 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 10756 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 10757 return SDValue(); 10758 10759 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 10760 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 10761 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 10762 return SDValue(); 10763 10764 // Now we have: 10765 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) 10766 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. 10767 // For SMUWB the 16-bit value will signed extended somehow. 10768 // For SMULWT only the SRA is required. 
10769 // Check both sides of SMUL_LOHI 10770 SDValue OpS16 = SMULLOHI->getOperand(0); 10771 SDValue OpS32 = SMULLOHI->getOperand(1); 10772 10773 SelectionDAG &DAG = DCI.DAG; 10774 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { 10775 OpS16 = OpS32; 10776 OpS32 = SMULLOHI->getOperand(0); 10777 } 10778 10779 SDLoc dl(OR); 10780 unsigned Opcode = 0; 10781 if (isS16(OpS16, DAG)) 10782 Opcode = ARMISD::SMULWB; 10783 else if (isSRA16(OpS16)) { 10784 Opcode = ARMISD::SMULWT; 10785 OpS16 = OpS16->getOperand(0); 10786 } 10787 else 10788 return SDValue(); 10789 10790 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); 10791 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); 10792 return SDValue(OR, 0); 10793 } 10794 10795 static SDValue PerformORCombineToBFI(SDNode *N, 10796 TargetLowering::DAGCombinerInfo &DCI, 10797 const ARMSubtarget *Subtarget) { 10798 // BFI is only available on V6T2+ 10799 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 10800 return SDValue(); 10801 10802 EVT VT = N->getValueType(0); 10803 SDValue N0 = N->getOperand(0); 10804 SDValue N1 = N->getOperand(1); 10805 SelectionDAG &DAG = DCI.DAG; 10806 SDLoc DL(N); 10807 // 1) or (and A, mask), val => ARMbfi A, val, mask 10808 // iff (val & mask) == val 10809 // 10810 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 10811 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 10812 // && mask == ~mask2 10813 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 10814 // && ~mask == mask2 10815 // (i.e., copy a bitfield value into another bitfield of the same width) 10816 10817 if (VT != MVT::i32) 10818 return SDValue(); 10819 10820 SDValue N00 = N0.getOperand(0); 10821 10822 // The value and the mask need to be constants so we can verify this is 10823 // actually a bitfield set. If the mask is 0xffff, we can do better 10824 // via a movt instruction, so don't use BFI in that case. 
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The inserted value must lie entirely outside the preserved mask.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down to the bitfield's LSB position.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) && !HasAnyUndefs) {
        // Ensure that the bit width of the constants are the same and that
        // the splat arguments are logical inverses as per the pattern we
        // are trying to simplify.
        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
            SplatBits0 == ~SplatBits1) {
          // Canonicalize the vector type to make instruction selection
          // simpler.
          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                       N0->getOperand(1),
                                       N0->getOperand(0),
                                       N1->getOperand(0));
          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
        }
      }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}

static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  return SDValue();
}

// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
// their position in "to" (Rd).
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  SDValue From = N->getOperand(1);
  // Operand 2 is the inverted insertion mask; invert it back to get ToMask.
  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  // If the Base came from a SHR #C, we can deduce that it is really testing bit
  // #C in the base of the SHR.
  if (From->getOpcode() == ISD::SRL &&
      isa<ConstantSDNode>(From->getOperand(1))) {
    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
    FromMask <<= Shift.getLimitedValue(31);
    From = From->getOperand(0);
  }

  return From;
}

// If A and B contain one contiguous set of bits, does A | B == A . B?
//
// Neither A nor B must be zero.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  unsigned LastActiveBitInA =  A.countTrailingZeros();
  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
  // if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}

static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    // Recover the LSB position and width of the bitfield being inserted.
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    // The AND may be dropped if it clears no bit the BFI actually reads.
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the merged source field does not start at bit 0, shift it down first.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
          ISD::SRL, dl, VT, From1,
          DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
11189 LoadSDNode *LD = cast<LoadSDNode>(InNode); 11190 11191 SelectionDAG &DAG = DCI.DAG; 11192 SDLoc DL(LD); 11193 SDValue BasePtr = LD->getBasePtr(); 11194 SDValue NewLD1 = 11195 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 11196 LD->getAlignment(), LD->getMemOperand()->getFlags()); 11197 11198 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 11199 DAG.getConstant(4, DL, MVT::i32)); 11200 SDValue NewLD2 = DAG.getLoad( 11201 MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), 11202 std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); 11203 11204 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 11205 if (DCI.DAG.getDataLayout().isBigEndian()) 11206 std::swap (NewLD1, NewLD2); 11207 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 11208 return Result; 11209 } 11210 11211 return SDValue(); 11212 } 11213 11214 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 11215 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 11216 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 11217 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 11218 SDValue Op0 = N->getOperand(0); 11219 SDValue Op1 = N->getOperand(1); 11220 if (Op0.getOpcode() == ISD::BITCAST) 11221 Op0 = Op0.getOperand(0); 11222 if (Op1.getOpcode() == ISD::BITCAST) 11223 Op1 = Op1.getOperand(0); 11224 if (Op0.getOpcode() == ARMISD::VMOVRRD && 11225 Op0.getNode() == Op1.getNode() && 11226 Op0.getResNo() == 0 && Op1.getResNo() == 1) 11227 return DAG.getNode(ISD::BITCAST, SDLoc(N), 11228 N->getValueType(0), Op0.getOperand(0)); 11229 return SDValue(); 11230 } 11231 11232 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 11233 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 11234 /// i64 vector to have f64 elements, since the value can then be loaded 11235 /// directly into a VFP register. 
11236 static bool hasNormalLoadOperand(SDNode *N) { 11237 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 11238 for (unsigned i = 0; i < NumElts; ++i) { 11239 SDNode *Elt = N->getOperand(i).getNode(); 11240 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 11241 return true; 11242 } 11243 return false; 11244 } 11245 11246 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 11247 /// ISD::BUILD_VECTOR. 11248 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 11249 TargetLowering::DAGCombinerInfo &DCI, 11250 const ARMSubtarget *Subtarget) { 11251 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 11252 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 11253 // into a pair of GPRs, which is fine when the value is used as a scalar, 11254 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 11255 SelectionDAG &DAG = DCI.DAG; 11256 if (N->getNumOperands() == 2) 11257 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 11258 return RV; 11259 11260 // Load i64 elements as f64 values so that type legalization does not split 11261 // them up into i32 values. 11262 EVT VT = N->getValueType(0); 11263 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 11264 return SDValue(); 11265 SDLoc dl(N); 11266 SmallVector<SDValue, 8> Ops; 11267 unsigned NumElts = VT.getVectorNumElements(); 11268 for (unsigned i = 0; i < NumElts; ++i) { 11269 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 11270 Ops.push_back(V); 11271 // Make the DAGCombiner fold the bitcast. 11272 DCI.AddToWorklist(V.getNode()); 11273 } 11274 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 11275 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 11276 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 11277 } 11278 11279 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., that does not
  // force to use floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to a integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands are 32-bits (64-bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}

/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
  EVT VT = N->getValueType(0);
  SDNode *Elt = N->getOperand(1).getNode();
  if (VT.getVectorElementType() != MVT::i64 ||
      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 VT.getVectorNumElements());
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(V.getNode());
  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                               Vec, V, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}

/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors.  That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector.  Do that
  // transformation here:
  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
  //   shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  // The pattern only applies when the second half of each concat is undef.
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts/2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    // Elements from the low half of op0 keep their index; elements from the
    // low half of op1 move into the high half of the new concat. Anything
    // that referenced an undef half becomes undef (-1).
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask);
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
11453 /// The caller is assumed to have checked legality. 11454 static SDValue CombineBaseUpdate(SDNode *N, 11455 TargetLowering::DAGCombinerInfo &DCI) { 11456 SelectionDAG &DAG = DCI.DAG; 11457 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 11458 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 11459 const bool isStore = N->getOpcode() == ISD::STORE; 11460 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 11461 SDValue Addr = N->getOperand(AddrOpIdx); 11462 MemSDNode *MemN = cast<MemSDNode>(N); 11463 SDLoc dl(N); 11464 11465 // Search for a use of the address operand that is an increment. 11466 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 11467 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 11468 SDNode *User = *UI; 11469 if (User->getOpcode() != ISD::ADD || 11470 UI.getUse().getResNo() != Addr.getResNo()) 11471 continue; 11472 11473 // Check that the add is independent of the load/store. Otherwise, folding 11474 // it would create a cycle. We can avoid searching through Addr as it's a 11475 // predecessor to both. 11476 SmallPtrSet<const SDNode *, 32> Visited; 11477 SmallVector<const SDNode *, 16> Worklist; 11478 Visited.insert(Addr.getNode()); 11479 Worklist.push_back(N); 11480 Worklist.push_back(User); 11481 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 11482 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 11483 continue; 11484 11485 // Find the new opcode for the updating load/store. 
11486 bool isLoadOp = true; 11487 bool isLaneOp = false; 11488 unsigned NewOpc = 0; 11489 unsigned NumVecs = 0; 11490 if (isIntrinsic) { 11491 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 11492 switch (IntNo) { 11493 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 11494 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 11495 NumVecs = 1; break; 11496 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 11497 NumVecs = 2; break; 11498 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 11499 NumVecs = 3; break; 11500 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 11501 NumVecs = 4; break; 11502 case Intrinsic::arm_neon_vld2dup: 11503 case Intrinsic::arm_neon_vld3dup: 11504 case Intrinsic::arm_neon_vld4dup: 11505 // TODO: Support updating VLDxDUP nodes. For now, we just skip 11506 // combining base updates for such intrinsics. 11507 continue; 11508 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 11509 NumVecs = 2; isLaneOp = true; break; 11510 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 11511 NumVecs = 3; isLaneOp = true; break; 11512 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 11513 NumVecs = 4; isLaneOp = true; break; 11514 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 11515 NumVecs = 1; isLoadOp = false; break; 11516 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 11517 NumVecs = 2; isLoadOp = false; break; 11518 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 11519 NumVecs = 3; isLoadOp = false; break; 11520 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 11521 NumVecs = 4; isLoadOp = false; break; 11522 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 11523 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 11524 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 11525 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 11526 case 
Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 11527 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 11528 } 11529 } else { 11530 isLaneOp = true; 11531 switch (N->getOpcode()) { 11532 default: llvm_unreachable("unexpected opcode for Neon base update"); 11533 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 11534 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 11535 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 11536 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 11537 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 11538 NumVecs = 1; isLaneOp = false; break; 11539 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 11540 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 11541 } 11542 } 11543 11544 // Find the size of memory referenced by the load/store. 11545 EVT VecTy; 11546 if (isLoadOp) { 11547 VecTy = N->getValueType(0); 11548 } else if (isIntrinsic) { 11549 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 11550 } else { 11551 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 11552 VecTy = N->getOperand(1).getValueType(); 11553 } 11554 11555 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 11556 if (isLaneOp) 11557 NumBytes /= VecTy.getVectorNumElements(); 11558 11559 // If the increment is a constant, it must match the memory ref size. 11560 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 11561 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 11562 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 11563 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 11564 // separate instructions that make it harder to use a non-constant update. 11565 continue; 11566 } 11567 11568 // OK, we found an ADD we can fold into the base update. 11569 // Now, create a _UPD node, taking care of not breaking alignment. 
11570 11571 EVT AlignedVecTy = VecTy; 11572 unsigned Alignment = MemN->getAlignment(); 11573 11574 // If this is a less-than-standard-aligned load/store, change the type to 11575 // match the standard alignment. 11576 // The alignment is overlooked when selecting _UPD variants; and it's 11577 // easier to introduce bitcasts here than fix that. 11578 // There are 3 ways to get to this base-update combine: 11579 // - intrinsics: they are assumed to be properly aligned (to the standard 11580 // alignment of the memory type), so we don't need to do anything. 11581 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 11582 // intrinsics, so, likewise, there's nothing to do. 11583 // - generic load/store instructions: the alignment is specified as an 11584 // explicit operand, rather than implicitly as the standard alignment 11585 // of the memory type (like the intrisics). We need to change the 11586 // memory type to match the explicit alignment. That way, we don't 11587 // generate non-standard-aligned ARMISD::VLDx nodes. 11588 if (isa<LSBaseSDNode>(N)) { 11589 if (Alignment == 0) 11590 Alignment = 1; 11591 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 11592 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 11593 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 11594 assert(!isLaneOp && "Unexpected generic load/store lane."); 11595 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 11596 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 11597 } 11598 // Don't set an explicit alignment on regular load/stores that we want 11599 // to transform to VLD/VST 1_UPD nodes. 11600 // This matches the behavior of regular load/stores, which only get an 11601 // explicit alignment if the MMO alignment is larger than the standard 11602 // alignment of the memory type. 11603 // Intrinsics, however, always get an explicit alignment, set to the 11604 // alignment of the MMO. 
11605 Alignment = 1; 11606 } 11607 11608 // Create the new updating load/store node. 11609 // First, create an SDVTList for the new updating node's results. 11610 EVT Tys[6]; 11611 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 11612 unsigned n; 11613 for (n = 0; n < NumResultVecs; ++n) 11614 Tys[n] = AlignedVecTy; 11615 Tys[n++] = MVT::i32; 11616 Tys[n] = MVT::Other; 11617 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 11618 11619 // Then, gather the new node's operands. 11620 SmallVector<SDValue, 8> Ops; 11621 Ops.push_back(N->getOperand(0)); // incoming chain 11622 Ops.push_back(N->getOperand(AddrOpIdx)); 11623 Ops.push_back(Inc); 11624 11625 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 11626 // Try to match the intrinsic's signature 11627 Ops.push_back(StN->getValue()); 11628 } else { 11629 // Loads (and of course intrinsics) match the intrinsics' signature, 11630 // so just add all but the alignment operand. 11631 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 11632 Ops.push_back(N->getOperand(i)); 11633 } 11634 11635 // For all node types, the alignment operand is always the last one. 11636 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 11637 11638 // If this is a non-standard-aligned STORE, the penultimate operand is the 11639 // stored value. Bitcast it to the aligned type. 11640 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 11641 SDValue &StVal = Ops[Ops.size()-2]; 11642 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 11643 } 11644 11645 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 11646 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 11647 MemN->getMemOperand()); 11648 11649 // Update the uses. 
    // Substitute the new updating node's results for the old node's: one
    // value per result vector, then the updated-base value, then the chain.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    // The ADD user is replaced by the post-incremented base result.
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

/// PerformVLDCombine - Try to fold a post-increment of the base address into
/// a NEON vector load/store via CombineBaseUpdate. Bail out before
/// legalization (and when invoked on behalf of the legalizer), since the
/// combine introduces target-specific nodes.
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.  Operand NumVecs+3 of the intrinsic is the lane
  // number (after chain, intrinsic id, address, and the N source vectors).
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node: NumVecs vector results plus a chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  // Only the chain and the address operand carry over from the intrinsic.
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op = N->getOperand(0);

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant.  Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
    EltSize = 8;
  EVT VT = N->getValueType(0);
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  // The splat is already in every lane; a bitcast to the VDUPLANE type is all
  // that is needed.
  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = N->getOperand(0);

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
                                             Ops, LD->getMemoryVT(),
                                             LD->getMemOperand());
    // The new node subsumes the load; redirect the old load's chain users.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

/// PerformLOADCombine - Target-specific dag combine xforms for ISD::LOAD:
/// fold a post-increment of the base address into a legal vector load.
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  // Volatile stores must not be transformed.
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
  // pack all of the elements in one place.  Next, store to memory in fewer
  // chunks.
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getScalarSizeInBits();
    unsigned ToEltSz = StVT.getScalarSizeInBits();

    // From, To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

    unsigned SizeRatio = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDLoc DL(St);
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    // Pick the sub-element of each source lane that survives truncation.  On
    // big-endian targets the narrow value lives in the high part of the lane.
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i < NumElems; ++i)
      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                          ? (i + 1) * SizeRatio - 1
                          : i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                DAG.getUNDEF(WideVec.getValueType()),
                                ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest legal store unit no bigger than the payload.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                        TLI.getPointerTy(DAG.getDataLayout()));
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I, DL));
      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                St->getPointerInfo(), St->getAlignment(),
                                St->getMemOperand()->getFlags());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    // Join the independent store chains.
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    // The second word goes 4 bytes further along; its alignment can be at
    // most half the original (capped at 4 bytes).
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  // The multiplier must be a splat of 2^C with 0 < C <= 32; C becomes the
  // fixed-point fraction-bit count of the combined conversion.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  // The divisor must be a splat of 2^C with 0 < C <= 32.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                      HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation.  That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (! getVShiftImm(Op, ElementBits, Cnt))
    return false;
  // A "long" shift widens the result, so a count equal to the element size is
  // still legal (isLong shifts the upper bound by one).
  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation.  For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative.  The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (! getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
  // Intrinsics encode right shifts as negative counts; normalize to positive.
  if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
    Cnt = -Cnt;
    return true;
  }
  return false;
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First pass: validate the immediate (and for plain vshifts, pick the
    // opcode, since those accept both left and right shift counts).
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHL;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                     ARMISD::VSHRs : ARMISD::VSHRu);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second pass: map the intrinsic to its target shift opcode.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRs; break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRu; break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRN; break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLs; break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLu; break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsu; break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNs; break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNu; break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsu; break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNs; break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNu; break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsu; break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLI;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRI;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them.  As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();

  assert(ST->hasNEON() && "unexpected vector shift");
  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
                            ARMISD::VSHRs : ARMISD::VSHRu);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8-
  // and 16-bit vector elements.  NEON supports these directly.  They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      // ANY_EXTEND is free to pick either form; use the zero-extending lane
      // move.
      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  return SDValue();
}

/// isPowerOf2Constant - If V is a constant whose value is a power of two,
/// return a pointer to that value's APInt; otherwise return null.
static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  // CN must be a single bit (see comment above).
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  // The "false" value of the CMOV must be y itself (the un-ORed value).
  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
12440 APInt OrCI = OrC->getAPIntValue(); 12441 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 12442 if (OrCI.countPopulation() > Heuristic) 12443 return SDValue(); 12444 12445 // Lastly, can we determine that the bits defined by OrCI 12446 // are zero in Y? 12447 KnownBits Known; 12448 DAG.computeKnownBits(Y, Known); 12449 if ((OrCI & Known.Zero) != OrCI) 12450 return SDValue(); 12451 12452 // OK, we can do the combine. 12453 SDValue V = Y; 12454 SDLoc dl(X); 12455 EVT VT = X.getValueType(); 12456 unsigned BitInX = AndC->logBase2(); 12457 12458 if (BitInX != 0) { 12459 // We must shift X first. 12460 X = DAG.getNode(ISD::SRL, dl, VT, X, 12461 DAG.getConstant(BitInX, dl, VT)); 12462 } 12463 12464 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 12465 BitInY < NumActiveBits; ++BitInY) { 12466 if (OrCI[BitInY] == 0) 12467 continue; 12468 APInt Mask(VT.getSizeInBits(), 0); 12469 Mask.setBit(BitInY); 12470 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 12471 // Confusingly, the operand is an *inverted* mask. 12472 DAG.getConstant(~Mask, dl, VT)); 12473 } 12474 12475 return V; 12476 } 12477 12478 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 12479 SDValue 12480 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 12481 SDValue Cmp = N->getOperand(4); 12482 if (Cmp.getOpcode() != ARMISD::CMPZ) 12483 // Only looking at NE cases. 
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // Fold a branch on a materialized boolean back into a conditional branch:
  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  // The one-use checks ensure the CMOV/AND are not needed elsewhere.
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Match exactly (cmov 0 1 ...) masked with 1 and compared against 0.
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }

  return SDValue();
}
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    // Build a fresh NE compare so the select can be expressed as above.
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // Fold a select of a materialized boolean into a select on the original
  // condition:
  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  if (!VT.isInteger())
      return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && LHS != RHS &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), Cmp);
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && LHS != RHS &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), Cmp);
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
  // merge t3, t4
  // where t1 = (SUBCARRY (SUB x, y), z, 0)
  //       t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  //       t3 = if K != 0 then (SHL t2:0, K) else t2:0
  //       t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
      (FalseVal.getOperand(1) == RHS) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      // Use z == 1 in the carry computation; the shift by K is applied at
      // the end instead.
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
    // Make it a carry, not a borrow.
    SDValue Carry = DAG.getNode(
        ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
    Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known;
    DAG.computeKnownBits(SDValue(N,0), Known);
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
/// Central dispatch for all target-specific DAG combines; forwards each
/// opcode to the matching Perform*Combine helper, or handles small
/// demanded-bits simplifications inline.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  // The SMULW*/SMLAL* nodes only read 16 bits of one (or both) multiplicand
  // operands; tell the combiner so extends feeding them can be removed.
  case ARMISD::SMULWB: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // NEON load/store intrinsics can be combined with their address update
    // (post-increment) by PerformVLDCombine.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}

/// f32 loads/stores are profitably done through integer registers (no VFP
/// transfer needed); everything else is left alone.
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
/// Return true if this target can perform an unaligned load/store of VT.
/// Sets *Fast (if non-null) when the access is also fast on this subtarget.
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned,
                                                       unsigned,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32: {
    // Unaligned access can use (for example) LRDB, LRDH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        // Only v7+ does unaligned scalar access quickly.
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
    return false;
  }
  case MVT::f64:
  case MVT::v2f64: {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
    return false;
  }
  }
}

/// Return true when both alignments are either unknown (0) or a multiple of
/// AlignCheck. Note the predicate is symmetric in its first two arguments.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

/// Pick a wide NEON type for memcpy/memset expansion when alignment (or fast
/// misaligned access) permits; otherwise defer to the generic logic.
EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsMemset, bool ZeroMemset,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  const Function &F = MF.getFunction();

  // See if we can use NEON instructions for this...
  // (Only zero-memsets: a splat of a nonzero byte is not modeled here.)
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    // NOTE(review): arguments are passed (SrcAlign, DstAlign) against the
    // (DstAlign, SrcAlign) parameter order; harmless since memOpAlign is
    // symmetric, but worth confirming on any future change.
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}
12898 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 12899 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 12900 return false; 12901 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 12902 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 12903 return (SrcBits == 64 && DestBits == 32); 12904 } 12905 12906 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 12907 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 12908 !DstVT.isInteger()) 12909 return false; 12910 unsigned SrcBits = SrcVT.getSizeInBits(); 12911 unsigned DestBits = DstVT.getSizeInBits(); 12912 return (SrcBits == 64 && DestBits == 32); 12913 } 12914 12915 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 12916 if (Val.getOpcode() != ISD::LOAD) 12917 return false; 12918 12919 EVT VT1 = Val.getValueType(); 12920 if (!VT1.isSimple() || !VT1.isInteger() || 12921 !VT2.isSimple() || !VT2.isInteger()) 12922 return false; 12923 12924 switch (VT1.getSimpleVT().SimpleTy) { 12925 default: break; 12926 case MVT::i1: 12927 case MVT::i8: 12928 case MVT::i16: 12929 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 12930 return true; 12931 } 12932 12933 return false; 12934 } 12935 12936 bool ARMTargetLowering::isFNegFree(EVT VT) const { 12937 if (!VT.isSimple()) 12938 return false; 12939 12940 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 12941 // negate values directly (fneg is free). So, we don't want to let the DAG 12942 // combiner rewrite fneg into xors and some other instructions. For f16 and 12943 // FullFP16 argument passing, some bitcast nodes may be introduced, 12944 // triggering this DAG combine rewrite, so we are avoiding that with this. 
12945 switch (VT.getSimpleVT().SimpleTy) { 12946 default: break; 12947 case MVT::f16: 12948 return Subtarget->hasFullFP16(); 12949 } 12950 12951 return false; 12952 } 12953 12954 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 12955 EVT VT = ExtVal.getValueType(); 12956 12957 if (!isTypeLegal(VT)) 12958 return false; 12959 12960 // Don't create a loadext if we can fold the extension into a wide/long 12961 // instruction. 12962 // If there's more than one user instruction, the loadext is desirable no 12963 // matter what. There can be two uses by the same instruction. 12964 if (ExtVal->use_empty() || 12965 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 12966 return true; 12967 12968 SDNode *U = *ExtVal->use_begin(); 12969 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 12970 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 12971 return false; 12972 12973 return true; 12974 } 12975 12976 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 12977 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 12978 return false; 12979 12980 if (!isTypeLegal(EVT::getEVT(Ty1))) 12981 return false; 12982 12983 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 12984 12985 // Assuming the caller doesn't have a zeroext or signext return parameter, 12986 // truncation all the way down to i1 is valid. 12987 return true; 12988 } 12989 12990 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 12991 const AddrMode &AM, Type *Ty, 12992 unsigned AS) const { 12993 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 12994 if (Subtarget->hasFPAO()) 12995 return AM.Scale < 0 ? 
/// Cost of using AM's scale in an address: 0 if legal (positive offsets are
/// modeled as cheaper on cores with FPAO), -1 if the mode is not legal at all.
int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

/// Return true if V is encodable as a Thumb1 load/store offset for VT:
/// an unsigned 5-bit immediate scaled by the access size.
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  // Thumb1 offsets are unsigned.
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  case MVT::i32:
    // Scale == 4;
    Scale = 4;
    break;
  }

  // The offset must be a multiple of the access size.
  if ((V & (Scale - 1)) != 0)
    return false;
  // After scaling, it must fit in 5 bits.
  V /= Scale;
  return V == (V & ((1LL << 5) - 1));
}

/// Return true if V is encodable as a Thumb2 load/store offset for VT.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  bool isNeg = false;
  if (V < 0) {
    isNeg = true;
    V = - V;
  }

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // + imm12 or - imm8
    if (isNeg)
      return V == (V & ((1LL << 8) - 1));
    return V == (V & ((1LL << 12) - 1));
  case MVT::f32:
  case MVT::f64:
    // Same as ARM mode. FIXME: NEON?
    if (!Subtarget->hasVFP2())
      return false;
    // VFP offsets are a word-scaled 8-bit immediate.
    if ((V & 3) != 0)
      return false;
    V >>= 2;
    return V == (V & ((1LL << 8) - 1));
  }
}
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  // Dispatch to the per-ISA encodings for Thumb1/Thumb2.
  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  // Offsets encode a magnitude plus an add/subtract bit, so check |V|.
  if (V < 0)
    V = - V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return V == (V & ((1LL << 12) - 1));
  case MVT::i16:
    // +- imm8
    return V == (V & ((1LL << 8) - 1));
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2()) // FIXME: NEON?
      return false;
    // VFP offsets are a word-scaled 8-bit immediate.
    if ((V & 3) != 0)
      return false;
    V >>= 2;
    return V == (V & ((1LL << 8) - 1));
  }
}

/// Return true if AM's register scale is encodable in Thumb2 addressing for
/// a load/store of VT.
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    // Clear the low bit: the scale may include the implicit base register
    // (reg + reg*scale is reg + reg<<log2(scale-1) forms).
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}
/// Return true if AM's register scale is encodable in Thumb1 addressing.
bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    // Thumb subtargets have their own scaled-register rules; ARM mode falls
    // through to the switch below.
    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which can
/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = std::abs(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}

/// Decompose Ptr (an ADD or SUB) into Base/Offset for an ARM-mode pre/post
/// indexed load/store. isInc reports whether the offset is added. i16 and
/// sign-extended i8/i1 accesses use addressing-mode 3 (8-bit offset);
/// i32/i8/i1 use addressing-mode 2 (12-bit offset, optional shifted register).
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        // A negative constant added is re-expressed as a subtracted
        // positive offset. (A SUB node would have a positive constant
        // here — hence the assert.)
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // Prefer a shifted operand as the offset so the shift folds into the
      // addressing mode.
      ARM_AM::ShiftOpc ShOpcVal=
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}
13293 } 13294 13295 isInc = (Ptr->getOpcode() == ISD::ADD); 13296 Base = Ptr->getOperand(0); 13297 Offset = Ptr->getOperand(1); 13298 return true; 13299 } 13300 13301 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 13302 return false; 13303 } 13304 13305 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 13306 bool isSEXTLoad, SDValue &Base, 13307 SDValue &Offset, bool &isInc, 13308 SelectionDAG &DAG) { 13309 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 13310 return false; 13311 13312 Base = Ptr->getOperand(0); 13313 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13314 int RHSC = (int)RHS->getZExtValue(); 13315 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 13316 assert(Ptr->getOpcode() == ISD::ADD); 13317 isInc = false; 13318 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13319 return true; 13320 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 13321 isInc = Ptr->getOpcode() == ISD::ADD; 13322 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13323 return true; 13324 } 13325 } 13326 13327 return false; 13328 } 13329 13330 /// getPreIndexedAddressParts - returns true by value, base pointer and 13331 /// offset pointer and addressing mode by reference if the node's address 13332 /// can be legally represented as pre-indexed load / store address. 
13333 bool 13334 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 13335 SDValue &Offset, 13336 ISD::MemIndexedMode &AM, 13337 SelectionDAG &DAG) const { 13338 if (Subtarget->isThumb1Only()) 13339 return false; 13340 13341 EVT VT; 13342 SDValue Ptr; 13343 bool isSEXTLoad = false; 13344 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 13345 Ptr = LD->getBasePtr(); 13346 VT = LD->getMemoryVT(); 13347 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 13348 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 13349 Ptr = ST->getBasePtr(); 13350 VT = ST->getMemoryVT(); 13351 } else 13352 return false; 13353 13354 bool isInc; 13355 bool isLegal = false; 13356 if (Subtarget->isThumb2()) 13357 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13358 Offset, isInc, DAG); 13359 else 13360 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13361 Offset, isInc, DAG); 13362 if (!isLegal) 13363 return false; 13364 13365 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 13366 return true; 13367 } 13368 13369 /// getPostIndexedAddressParts - returns true by value, base pointer and 13370 /// offset pointer and addressing mode by reference if this node can be 13371 /// combined with a load / store to form a post-indexed load / store. 
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Pull the pointer operand and memory type out of the load or store, and
  // record whether it performs any extension/truncation.
  EVT VT;
  SDValue Ptr;
  bool isSEXTLoad = false, isNonExt;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    isNonExt = !ST->isTruncatingStore();
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  bool isInc;
  bool isLegal = false;
  if (Subtarget->isThumb2())
    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                       isInc, DAG);
  else
    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

/// Compute known-zero/known-one bits for ARM-specific DAG nodes, refining
/// what the target-independent analysis can derive on its own.
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
    if (Known.isUnknown())
      return;

    // Only bits agreed upon by both arms survive the select.
    KnownBits KnownRHS;
    DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // An exclusive load of a sub-word type zero-fills the high bits of the
      // i32 result.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    // Note: every path through the inner switch returns, so control never
    // falls through to the BFI case below.
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  }
}

/// Try to replace the constant operand of an AND with a different constant
/// that is cheaper to materialize on ARM/Thumb, given which result bits are
/// actually demanded. Returns true if a replacement was committed to \p TLO.
bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &DemandedAPInt,
                                                TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  // ShrunkMask clears the non-demanded bits; ExpandedMask sets them. Any
  // mask between the two is functionally equivalent to the original.
  unsigned Demanded = DemandedAPInt.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  // A candidate mask is legal iff it lies between ShrunkMask and ExpandedMask
  // (it must keep every demanded set bit and introduce no demanded zero bit).
  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}


//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

/// Recognize the inline-asm idiom "rev $0, $1" with "=l,l" constraints on an
/// i32 and replace it with the bswap intrinsic, so later passes can select
/// the native instruction. Returns true if the call was rewritten.
bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // Single statement: re-split it into whitespace/comma-separated tokens.
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2())
    return "r";
  if (ConstraintVT.isFloatingPoint())
    return "w";
  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
     (ConstraintVT.getSizeInBits() == 64 ||
      ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Other; // Constant for movw.
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default: break;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
    // If we don't have a value, we can't do a match,
    // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    // 'l' (low register) is preferred for integers on Thumb, where it is a
    // genuinely specific register class; elsewhere it is just a register.
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    // 'w' (VFP register) matches floating-point operands.
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}

using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

/// Map a GCC-style register constraint letter (plus value type) to a concrete
/// ARM register class. Falls back to the target-independent handling for
/// anything not recognized here.
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      if (Subtarget->isThumb1Only())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      // VFP/NEON register; class chosen by the operand's size.
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      // Restricted VFP/NEON register (low half of the register file).
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      // VFP2-accessible register.
      if (VT == MVT::Other)
        break;
      if (VT == MVT::f32 || VT == MVT::i32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_VFP2RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_VFP2RegClass);
      break;
    }
  }
  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
    default: break;
    case 'j':
    case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O':
      // All of these constraints only accept immediates; anything else is
      // rejected by returning without touching Ops.
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return;

      int64_t CVal64 = C->getSExtValue();
      int CVal = (int) CVal64;
      // None of these constraints allow values larger than 32 bits.  Check
      // that the value fits in an int.
      if (CVal != CVal64)
        return;

      switch (ConstraintLetter) {
        case 'j':
          // Constant suitable for movw, must be between 0 and
          // 65535.
          if (Subtarget->hasV6T2Ops())
            if (CVal >= 0 && CVal <= 65535)
              break;
          return;
        case 'I':
          if (Subtarget->isThumb1Only()) {
            // This must be a constant between 0 and 255, for ADD
            // immediates.
            if (CVal >= 0 && CVal <= 255)
              break;
          } else if (Subtarget->isThumb2()) {
            // A constant that can be used as an immediate value in a
            // data-processing instruction.
            if (ARM_AM::getT2SOImmVal(CVal) != -1)
              break;
          } else {
            // A constant that can be used as an immediate value in a
            // data-processing instruction.
            if (ARM_AM::getSOImmVal(CVal) != -1)
              break;
          }
          return;

        case 'J':
          if (Subtarget->isThumb1Only()) {
            // This must be a constant between -255 and -1, for negated ADD
            // immediates. This can be used in GCC with an "n" modifier that
            // prints the negated value, for use with SUB instructions. It is
            // not useful otherwise but is implemented for compatibility.
            if (CVal >= -255 && CVal <= -1)
              break;
          } else {
            // This must be a constant between -4095 and 4095. It is not clear
            // what this constraint is intended for. Implemented for
            // compatibility with GCC.
            if (CVal >= -4095 && CVal <= 4095)
              break;
          }
          return;

        case 'K':
          if (Subtarget->isThumb1Only()) {
            // A 32-bit value where only one byte has a nonzero value. Exclude
            // zero to match GCC. This constraint is used by GCC internally for
            // constants that can be loaded with a move/shift combination.
            // It is not useful otherwise but is implemented for compatibility.
            if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
              break;
          } else if (Subtarget->isThumb2()) {
            // A constant whose bitwise inverse can be used as an immediate
            // value in a data-processing instruction. This can be used in GCC
            // with a "B" modifier that prints the inverted value, for use with
            // BIC and MVN instructions. It is not useful otherwise but is
            // implemented for compatibility.
            if (ARM_AM::getT2SOImmVal(~CVal) != -1)
              break;
          } else {
            // A constant whose bitwise inverse can be used as an immediate
            // value in a data-processing instruction. This can be used in GCC
            // with a "B" modifier that prints the inverted value, for use with
            // BIC and MVN instructions. It is not useful otherwise but is
            // implemented for compatibility.
            if (ARM_AM::getSOImmVal(~CVal) != -1)
              break;
          }
          return;

        case 'L':
          if (Subtarget->isThumb1Only()) {
            // This must be a constant between -7 and 7,
            // for 3-operand ADD/SUB immediate instructions.
            // NOTE(review): the upper bound excludes 7 (CVal < 7) while the
            // lower bound includes -7 — confirm this asymmetry is intended.
            if (CVal >= -7 && CVal < 7)
              break;
          } else if (Subtarget->isThumb2()) {
            // A constant whose negation can be used as an immediate value in a
            // data-processing instruction. This can be used in GCC with an "n"
            // modifier that prints the negated value, for use with SUB
            // instructions. It is not useful otherwise but is implemented for
            // compatibility.
            if (ARM_AM::getT2SOImmVal(-CVal) != -1)
              break;
          } else {
            // A constant whose negation can be used as an immediate value in a
            // data-processing instruction. This can be used in GCC with an "n"
            // modifier that prints the negated value, for use with SUB
            // instructions. It is not useful otherwise but is implemented for
            // compatibility.
            if (ARM_AM::getSOImmVal(-CVal) != -1)
              break;
          }
          return;

        case 'M':
          if (Subtarget->isThumb1Only()) {
            // This must be a multiple of 4 between 0 and 1020, for
            // ADD sp + immediate.
            if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
              break;
          } else {
            // A power of two or a constant between 0 and 32.  This is used in
            // GCC for the shift amount on shifted register operands, but it is
            // useful in general for any shift amounts.
            if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
              break;
          }
          return;

        case 'N':
          if (Subtarget->isThumb()) {  // FIXME thumb2
            // This must be a constant between 0 and 31, for shift amounts.
            if (CVal >= 0 && CVal <= 31)
              break;
          }
          return;

        case 'O':
          if (Subtarget->isThumb()) {  // FIXME thumb2
            // This must be a multiple of 4 between -508 and 508, for
            // ADD/SUB sp = sp + immediate.
            if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
              break;
          }
          return;
      }
      Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
      break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

/// Pick the RTLIB divmod libcall matching the node's signedness and the
/// integer width \p SVT.
static RTLIB::Libcall getDivRemLibcall(
    const SDNode *N, MVT::SimpleValueType SVT) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  RTLIB::Libcall LC;
  switch (SVT) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  }
  return LC;
}

/// Build the argument list for a divmod libcall from the node's operands,
/// using the sign/zero extension attribute implied by the opcode.
static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
                  N->getOpcode() == ISD::SREM;
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    EVT ArgVT = N->getOperand(i).getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
    Entry.Node = N->getOperand(i);
    Entry.Ty = ArgTy;
    Entry.IsSExt = isSigned;
    Entry.IsZExt = !isSigned;
    Args.push_back(Entry);
  }
  // Windows divmod helpers take (denominator, numerator), so swap.
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}

SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // If the target has hardware divide, use divide + multiply + subtract:
  //     div = a / b
  //     rem = a - b * div
  //     return {div, rem}
  // This should be lowered into UDIV/SDIV + MLS later on.
  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (hasDivide && Op->getValueType(0).isSimple() &&
      Op->getSimpleValueType(0) == MVT::i32) {
    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    const SDValue Dividend = Op->getOperand(0);
    const SDValue Divisor = Op->getOperand(1);
    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

    SDValue Values[2] = {Div, Rem};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  }

  // Otherwise emit a call to the AEABI/Windows divmod helper.
  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext(),
                                                    Subtarget);

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  Type *RetTy = StructType::get(Ty, Ty);

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}

// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem)
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  // The helper returns a {div, rem} struct of two equal-width integers.
  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                       SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
                                                    Subtarget);
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  if (Subtarget->isTargetWindows())
    InChain = WinDBZCheckDenominator(DAG, N, InChain);

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}

SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);

  // With stack probing disabled, adjust (and optionally realign) SP directly.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
    SDValue Ops[2] = { SP, Chain };
    return DAG.getMergeValues(Ops, DL);
  }

  // Otherwise call __chkstk: it takes the allocation size in words in R4 and
  // adjusts SP itself; read the resulting SP back afterwards.
  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}

SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  // Single-precision-only FPUs cannot widen to f64 in hardware; emit the
  // corresponding extension libcall instead.
  assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
         "Unexpected type for custom-lowering FP_EXTEND");

  RTLIB::Libcall LC;
  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());

  SDValue SrcVal = Op.getOperand(0);
  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
                     SDLoc(Op)).first;
}

SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  // Single-precision-only FPUs cannot narrow from f64 in hardware; emit the
  // corresponding rounding libcall instead.
  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
         Subtarget->isFPOnlySP() &&
         "Unexpected type for custom-lowering FP_ROUND");

  RTLIB::Libcall LC;
  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());

  SDValue SrcVal = Op.getOperand(0);
  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
                     SDLoc(Op)).first;
}

bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // there can be 1's on either or both "outsides", all the "inside"
  // bits must be 0's
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  if (!Subtarget->hasVFP3())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // For these intrinsics the pointer is the first argument and the
    // alignment is passed as the last argument.
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // For vld1xN the pointer is the last argument; no alignment operand is
    // read here, so the alignment is left as 0.
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // Sum the sizes of the leading vector arguments (everything between the
    // pointer and the trailing non-vector operands).
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // No alignment operand is read for vst1xN; leave alignment as 0.
    Info.align = 0;
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    // Exclusive loads are modelled as volatile so they are never reordered
    // or folded away.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // Operand 0 is the value to store, operand 1 the pointer.
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    // 64-bit exclusive store: value is split in operands 0/1, pointer is
    // operand 2.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself. Only integer immediates of 32 bits or
/// fewer qualify.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

/// Return true if extracting a subvector is cheap: the operation must be
/// legal or custom for the result type, and the extraction must start at
/// index 0 or at an index equal to the result's element count.
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

/// Emit a data memory barrier for the given domain, using the CP15 c7,c10,5
/// mcr encoding as a fallback on pre-DMB ARMv6 targets.
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    // seq_cst only needs a leading barrier on the store side; loads are
    // handled by the trailing fence.
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  // Thumb targets need at least v8-M baseline for exclusives.
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

/// Defer to the target-wide InsertFencesForAtomic setting, regardless of the
/// particular instruction.
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  // Stack-protector loads via LOAD_STACK_GUARD are only done on MachO.
  return Subtarget->isTargetMachO();
}

/// Return true (and set Cost to 0) when a store of a lane extracted from an
/// integer vector can be folded into the store, i.e. the lane index is a
/// compile-time constant and the vector fits exactly in a D or Q register.
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

/// CTTZ is cheap to speculate when v6T2 instructions are available.
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

/// CTLZ is cheap to speculate when v6T2 instructions are available.
bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

/// Emit a load-linked of Addr: ldrex/ldaex for types up to 32 bits, and
/// ldrexd/ldaexd for 64-bit values. The acquire variants are used for
/// acquire-or-stronger orderings.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // The {lo, hi} pair is in memory order; swap for big-endian targets.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  // The intrinsic returns i32; truncate/bitcast back to the in-memory type.
  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

/// Release the exclusive monitor after a failed cmpxchg that performed no
/// store. clrex is only emitted on v7+ targets.
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

/// Emit a store-conditional of Val to Addr: strex/stlex for types up to 32
/// bits, strexd/stlexd for 64-bit values. The release variants are used for
/// release-or-stronger orderings. Returns the i32 success/failure result.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    // The intrinsic takes {lo, hi} in memory order; swap for big-endian.
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}


/// Keep loops aligned even at -Os/-Oz on M-class cores.
bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  // i.e. ceil(size-in-bits / 128).
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

/// Return true if VecTy is a vector type that the interleaved-access lowering
/// can handle: integer elements of 8/16/32 bits, at least two elements, and a
/// total size of 64 bits or a multiple of 128 bits.
bool ARMTargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (VecTy->getElementType()->isHalfTy())
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g.
Lower an interleaved load (Factor = 2):
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///   %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///   %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
/// Into:
///   %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///   %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///   %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Type *Tys[] = {VecTy, Int8Ptr};
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(
          BaseAddr, VecTy->getVectorNumElements() * Factor);

    // vldN takes the (i8*) base address and the alignment as operands.
    SmallVector<Value *, 2> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));

    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g.
Lower an interleaved store (Factor = 3):
///   %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///   store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
/// Into:
///   %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///   %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///   %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///   call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///   %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                    <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///   store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
///   %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///   %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///   %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///   call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Type *Tys[] = {Int8Ptr, SubVecTy};
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 6> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

    Function *VstNFunc =
        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        // The first lane of this factor is undef; scan later lanes for a
        // defined element to derive the sequential start index from.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    // The alignment is the final operand of the vstN intrinsic.
    Ops.push_back(Builder.getInt32(SI->getAlignment()));
    Builder.CreateCall(VstNFunc, Ops);
  }
  return true;
}

// Base-type categories used when classifying AAPCS-VFP homogeneous
// aggregates.
enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

/// Recursively determine whether Ty is a homogeneous aggregate: a struct or
/// array whose leaf elements all share one base type (float, double, or a
/// 64/128-bit vector) with between one and four members in total. Base and
/// Members accumulate the classification across recursive calls.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    // Every field must itself be a homogeneous aggregate of the same base.
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    // A scalar leaf must match the base type seen so far (or set it).
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      // First vector leaf decides which vector category the aggregate uses.
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  // Any other leaf type leaves Members at 0 and fails here; aggregates are
  // limited to at most four members.
  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
unsigned
ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                DataLayout DL) const {
  if (!ArgTy->isVectorTy())
    return DL.getABITypeAlignment(ArgTy);

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  // Only the AAPCS-VFP convention requires consecutive registers here.
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

/// For functions using split callee-saved-register saving (CXX_FAST_TLS),
/// copy each CSR into a fresh virtual register in the entry block and copy it
/// back to the physical register before every exit block's terminator.
void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  // A null list means this function does not use CSRs-via-copy.
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  // Record the maximum call frame size before the base class finalizes.
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}