//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <utility>
using namespace llvm;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
  cl::desc("Enable / disable ARM interworking (for debugging only)"),
  cl::init(true));

namespace {
  class ARMCCState : public CCState {
  public:
    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
               ParmContext PC)
        : CCState(CC, isVarArg, MF, locs, C) {
      assert(((PC == Call) || (PC == Prologue)) &&
             "ARMCCState users must specify whether their context is call "
             "or prologue generation.");
      CallOrPrologue = PC;
    }
  };
}

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

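  // (With the Promote action the bitwise operation itself is performed in
  // PromotedBitwiseVT, i.e. v2i32 for D registers and v4i32 for Q registers as
  // passed in by addDRTypeForNEON/addQRTypeForNEON below; this works because
  // the NEON AND/ORR/EOR instructions are lane-size agnostic.)
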
  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (VT.isInteger()) {
    setOperationAction(ISD::SABSDIFF, VT, Legal);
    setOperationAction(ISD::UABSDIFF, VT, Legal);
  }
  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
        { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
        { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
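
      // (Note on the comparison entries above: the ISD::CondCode records how
      // the integer result of the helper is tested against zero, via
      // setCmpLibcallCC. For example, __eqsf2vfp returns nonzero when its
      // operands compare equal, so RTLIB::OEQ_F32 is checked with SETNE.)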
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);

  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
      !Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Memory operations
      // RTABI chapter 4.3.4
      { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->getTargetTriple().isiOS() &&
      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float, but are
  // needed for some targets which use a hard-float calling convention by
  // default.
  if (Subtarget->isAAPCS_ABI()) {
    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
  } else {
    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
    setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
    setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f");
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
  }

  for (MVT VT : MVT::vector_valuetypes()) {
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP support any arithmetic operations on it.
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);

    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
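    // (For example, (mul (sext v4i16), (sext v4i16)) can then be matched to a
    // single vmull.s16 instead of widening both operands first.)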
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have an
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  // ARM and Thumb2 support UMLAL/SMLAL.
  if (!Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::ADDC);

  if (Subtarget->isFPOnlySP()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOWI, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);

  if (!Subtarget->isThumb1Only()) {
    // FIXME: We should do this for Thumb1 as well.
    setOperationAction(ISD::ADDC, MVT::i32, Custom);
    setOperationAction(ISD::ADDE, MVT::i32, Custom);
    setOperationAction(ISD::SUBC, MVT::i32, Custom);
    setOperationAction(ISD::SUBE, MVT::i32, Custom);
  }

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  // These just redirect to CTTZ and CTLZ on ARM.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
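  // (Where BSWAP is not legal the generic legalizer expands i32 BSWAP into a
  // shift/and/or sequence; from v6 on it can be selected as a single REV.)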
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);

    setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod");
    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
    setLibcallName(RTLIB::UDIVREM_I8, "__aeabi_uidivmod");
    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");

    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }
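
  // (Per the AEABI run-time ABI, __aeabi_idivmod and friends return the
  // quotient in r0 and the remainder in r1, which is what allows SDIVREM and
  // UDIVREM to be lowered to a single register-based call on these targets.)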

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (!Subtarget->isTargetMachO()) {
    // Non-MachO platforms may return values in these registers via the
    // personality function.
    setExceptionPointerRegister(ARM::R0);
    setExceptionSelectorRegister(ARM::R1);
  }

  if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion. If we are targeting a single-threaded system,
  // then set them all for expand so we can lower them later into their
  // non-atomic form.
  if (TM.Options.ThreadModel == ThreadModel::Single)
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
  else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasV8Ops()) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      setInsertFencesForAtomic(true);
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff the target supports VFP2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->isTargetDarwin())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget->hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget->getTargetTriple().isiOS()) {
      // For iOS, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret instead.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
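  // (These correspond to the VRINT* and VMINNM/VMAXNM instructions added by
  // the ARMv8 floating-point extension.)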
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    if (!Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we use
    // a NEON instruction with an undef lane instead.
    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
  }

  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);

  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(4);

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->isLikeA9();

  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER: break;
  case ARMISD::Wrapper: return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL: return "ARMISD::CALL";
  case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
  case ARMISD::tCALL: return "ARMISD::tCALL";
  case ARMISD::BRCOND: return "ARMISD::BRCOND";
  case ARMISD::BR_JT: return "ARMISD::BR_JT";
  case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
  case ARMISD::CMP: return "ARMISD::CMP";
  case ARMISD::CMN: return "ARMISD::CMN";
  case ARMISD::CMPZ: return "ARMISD::CMPZ";
  case ARMISD::CMPFP: return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT: return "ARMISD::FMSTAT";

  case ARMISD::CMOV: return "ARMISD::CMOV";

  case ARMISD::RBIT: return "ARMISD::RBIT";

  case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
  case ARMISD::RRX: return "ARMISD::RRX";

  case ARMISD::ADDC: return "ARMISD::ADDC";
  case ARMISD::ADDE: return "ARMISD::ADDE";
  case ARMISD::SUBC: return "ARMISD::SUBC";
  case ARMISD::SUBE: return "ARMISD::SUBE";

  case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD: return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";

  case ARMISD::VCEQ: return "ARMISD::VCEQ";
  case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
  case ARMISD::VCGE: return "ARMISD::VCGE";
  case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
  case ARMISD::VCGEU: return "ARMISD::VCGEU";
  case ARMISD::VCGT: return "ARMISD::VCGT";
  case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
  case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
  case ARMISD::VCGTU: return "ARMISD::VCGTU";
  case ARMISD::VTST: return "ARMISD::VTST";

  case ARMISD::VSHL: return "ARMISD::VSHL";
  case ARMISD::VSHRs: return "ARMISD::VSHRs";
  case ARMISD::VSHRu: return "ARMISD::VSHRu";
  case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
  case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
  case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
  case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
  case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
  case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
  case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
  case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
  case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
  case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
  case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
  case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
  case ARMISD::VSLI: return "ARMISD::VSLI";
  case ARMISD::VSRI: return "ARMISD::VSRI";
  case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP: return "ARMISD::VDUP";
  case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
  case ARMISD::VEXT: return "ARMISD::VEXT";
  case ARMISD::VREV64: return "ARMISD::VREV64";
  case ARMISD::VREV32: return "ARMISD::VREV32";
  case ARMISD::VREV16: return "ARMISD::VREV16";
  case ARMISD::VZIP: return "ARMISD::VZIP";
  case ARMISD::VUZP: return "ARMISD::VUZP";
  case ARMISD::VTRN: return "ARMISD::VTRN";
  case ARMISD::VTBL1: return "ARMISD::VTBL1";
  case ARMISD::VTBL2: return "ARMISD::VTBL2";
  case ARMISD::VMULLs: return "ARMISD::VMULLs";
  case ARMISD::VMULLu: return "ARMISD::VMULLu";
  case ARMISD::UMLAL: return "ARMISD::UMLAL";
  case ARMISD::SMLAL: return "ARMISD::SMLAL";
  case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI: return "ARMISD::BFI";
  case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
  case ARMISD::VBSL: return "ARMISD::VBSL";
  case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
  case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
  }
  return nullptr;
}

EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive D registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
1251 FastISel * 1252 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1253 const TargetLibraryInfo *libInfo) const { 1254 return ARM::createFastISel(funcInfo, libInfo); 1255 } 1256 1257 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1258 unsigned NumVals = N->getNumValues(); 1259 if (!NumVals) 1260 return Sched::RegPressure; 1261 1262 for (unsigned i = 0; i != NumVals; ++i) { 1263 EVT VT = N->getValueType(i); 1264 if (VT == MVT::Glue || VT == MVT::Other) 1265 continue; 1266 if (VT.isFloatingPoint() || VT.isVector()) 1267 return Sched::ILP; 1268 } 1269 1270 if (!N->isMachineOpcode()) 1271 return Sched::RegPressure; 1272 1273 // Loads are scheduled for latency even if the instruction itinerary 1274 // is not available. 1275 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1276 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1277 1278 if (MCID.getNumDefs() == 0) 1279 return Sched::RegPressure; 1280 if (!Itins->isEmpty() && 1281 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1282 return Sched::ILP; 1283 1284 return Sched::RegPressure; 1285 } 1286 1287 //===----------------------------------------------------------------------===// 1288 // Lowering Code 1289 //===----------------------------------------------------------------------===// 1290 1291 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC. 1292 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1293 switch (CC) { 1294 default: llvm_unreachable("Unknown condition code!"); 1295 case ISD::SETNE: return ARMCC::NE; 1296 case ISD::SETEQ: return ARMCC::EQ; 1297 case ISD::SETGT: return ARMCC::GT; 1298 case ISD::SETGE: return ARMCC::GE; 1299 case ISD::SETLT: return ARMCC::LT; 1300 case ISD::SETLE: return ARMCC::LE; 1301 case ISD::SETUGT: return ARMCC::HI; 1302 case ISD::SETUGE: return ARMCC::HS; 1303 case ISD::SETULT: return ARMCC::LO; 1304 case ISD::SETULE: return ARMCC::LS; 1305 } 1306 } 1307 1308 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
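/// Some comparisons require a pair of ARM condition codes, e.g. SETUEQ yields
/// (EQ, VS), i.e. "equal, or unordered", and SETONE yields (MI, GT), i.e.
/// "less than, or greater than".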
1309 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1310 ARMCC::CondCodes &CondCode2) { 1311 CondCode2 = ARMCC::AL; 1312 switch (CC) { 1313 default: llvm_unreachable("Unknown FP condition!"); 1314 case ISD::SETEQ: 1315 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1316 case ISD::SETGT: 1317 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1318 case ISD::SETGE: 1319 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1320 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1321 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1322 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1323 case ISD::SETO: CondCode = ARMCC::VC; break; 1324 case ISD::SETUO: CondCode = ARMCC::VS; break; 1325 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1326 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1327 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1328 case ISD::SETLT: 1329 case ISD::SETULT: CondCode = ARMCC::LT; break; 1330 case ISD::SETLE: 1331 case ISD::SETULE: CondCode = ARMCC::LE; break; 1332 case ISD::SETNE: 1333 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1334 } 1335 } 1336 1337 //===----------------------------------------------------------------------===// 1338 // Calling Convention Implementation 1339 //===----------------------------------------------------------------------===// 1340 1341 #include "ARMGenCallingConv.inc" 1342 1343 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1344 /// account presence of floating point hardware and calling convention 1345 /// limitations, such as support for variadic functions. 1346 CallingConv::ID 1347 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1348 bool isVarArg) const { 1349 switch (CC) { 1350 default: 1351 llvm_unreachable("Unsupported calling convention"); 1352 case CallingConv::ARM_AAPCS: 1353 case CallingConv::ARM_APCS: 1354 case CallingConv::GHC: 1355 return CC; 1356 case CallingConv::ARM_AAPCS_VFP: 1357 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1358 case CallingConv::C: 1359 if (!Subtarget->isAAPCS_ABI()) 1360 return CallingConv::ARM_APCS; 1361 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1362 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1363 !isVarArg) 1364 return CallingConv::ARM_AAPCS_VFP; 1365 else 1366 return CallingConv::ARM_AAPCS; 1367 case CallingConv::Fast: 1368 if (!Subtarget->isAAPCS_ABI()) { 1369 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1370 return CallingConv::Fast; 1371 return CallingConv::ARM_APCS; 1372 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1373 return CallingConv::ARM_AAPCS_VFP; 1374 else 1375 return CallingConv::ARM_AAPCS; 1376 } 1377 } 1378 1379 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1380 /// CallingConvention. 1381 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1382 bool Return, 1383 bool isVarArg) const { 1384 switch (getEffectiveCallingConv(CC, isVarArg)) { 1385 default: 1386 llvm_unreachable("Unsupported calling convention"); 1387 case CallingConv::ARM_APCS: 1388 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1389 case CallingConv::ARM_AAPCS: 1390 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1391 case CallingConv::ARM_AAPCS_VFP: 1392 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1393 case CallingConv::Fast: 1394 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1395 case CallingConv::GHC: 1396 return (Return ? 
RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1397 } 1398 } 1399 1400 /// LowerCallResult - Lower the result values of a call into the 1401 /// appropriate copies out of appropriate physical registers. 1402 SDValue 1403 ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1404 CallingConv::ID CallConv, bool isVarArg, 1405 const SmallVectorImpl<ISD::InputArg> &Ins, 1406 SDLoc dl, SelectionDAG &DAG, 1407 SmallVectorImpl<SDValue> &InVals, 1408 bool isThisReturn, SDValue ThisVal) const { 1409 1410 // Assign locations to each value returned by this call. 1411 SmallVector<CCValAssign, 16> RVLocs; 1412 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1413 *DAG.getContext(), Call); 1414 CCInfo.AnalyzeCallResult(Ins, 1415 CCAssignFnForNode(CallConv, /* Return*/ true, 1416 isVarArg)); 1417 1418 // Copy all of the result registers out of their specified physreg. 1419 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1420 CCValAssign VA = RVLocs[i]; 1421 1422 // Pass 'this' value directly from the argument to return value, to avoid 1423 // reg unit interference 1424 if (i == 0 && isThisReturn) { 1425 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1426 "unexpected return calling convention register assignment"); 1427 InVals.push_back(ThisVal); 1428 continue; 1429 } 1430 1431 SDValue Val; 1432 if (VA.needsCustom()) { 1433 // Handle f64 or half of a v2f64. 1434 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1435 InFlag); 1436 Chain = Lo.getValue(1); 1437 InFlag = Lo.getValue(2); 1438 VA = RVLocs[++i]; // skip ahead to next loc 1439 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1440 InFlag); 1441 Chain = Hi.getValue(1); 1442 InFlag = Hi.getValue(2); 1443 if (!Subtarget->isLittle()) 1444 std::swap (Lo, Hi); 1445 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1446 1447 if (VA.getLocVT() == MVT::v2f64) { 1448 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1449 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1450 DAG.getConstant(0, dl, MVT::i32)); 1451 1452 VA = RVLocs[++i]; // skip ahead to next loc 1453 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1454 Chain = Lo.getValue(1); 1455 InFlag = Lo.getValue(2); 1456 VA = RVLocs[++i]; // skip ahead to next loc 1457 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1458 Chain = Hi.getValue(1); 1459 InFlag = Hi.getValue(2); 1460 if (!Subtarget->isLittle()) 1461 std::swap (Lo, Hi); 1462 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1463 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1464 DAG.getConstant(1, dl, MVT::i32)); 1465 } 1466 } else { 1467 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1468 InFlag); 1469 Chain = Val.getValue(1); 1470 InFlag = Val.getValue(2); 1471 } 1472 1473 switch (VA.getLocInfo()) { 1474 default: llvm_unreachable("Unknown loc info!"); 1475 case CCValAssign::Full: break; 1476 case CCValAssign::BCvt: 1477 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1478 break; 1479 } 1480 1481 InVals.push_back(Val); 1482 } 1483 1484 return Chain; 1485 } 1486 1487 /// LowerMemOpCallTo - Store the argument to the stack. 
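/// The slot address is computed as SP plus the location's stack offset
/// (VA.getLocMemOffset()) as assigned by the calling convention, and a plain
/// store to that address is returned.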
1488 SDValue 1489 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, 1490 SDValue StackPtr, SDValue Arg, 1491 SDLoc dl, SelectionDAG &DAG, 1492 const CCValAssign &VA, 1493 ISD::ArgFlagsTy Flags) const { 1494 unsigned LocMemOffset = VA.getLocMemOffset(); 1495 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1496 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1497 StackPtr, PtrOff); 1498 return DAG.getStore( 1499 Chain, dl, Arg, PtrOff, 1500 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), 1501 false, false, 0); 1502 } 1503 1504 void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, 1505 SDValue Chain, SDValue &Arg, 1506 RegsToPassVector &RegsToPass, 1507 CCValAssign &VA, CCValAssign &NextVA, 1508 SDValue &StackPtr, 1509 SmallVectorImpl<SDValue> &MemOpChains, 1510 ISD::ArgFlagsTy Flags) const { 1511 1512 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1513 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1514 unsigned id = Subtarget->isLittle() ? 0 : 1; 1515 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 1516 1517 if (NextVA.isRegLoc()) 1518 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 1519 else { 1520 assert(NextVA.isMemLoc()); 1521 if (!StackPtr.getNode()) 1522 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 1523 getPointerTy(DAG.getDataLayout())); 1524 1525 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 1526 dl, DAG, NextVA, 1527 Flags)); 1528 } 1529 } 1530 1531 /// LowerCall - Lowering a call into a callseq_start <- 1532 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1533 /// nodes. 1534 SDValue 1535 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1536 SmallVectorImpl<SDValue> &InVals) const { 1537 SelectionDAG &DAG = CLI.DAG; 1538 SDLoc &dl = CLI.DL; 1539 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1540 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1541 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1542 SDValue Chain = CLI.Chain; 1543 SDValue Callee = CLI.Callee; 1544 bool &isTailCall = CLI.IsTailCall; 1545 CallingConv::ID CallConv = CLI.CallConv; 1546 bool doesNotRet = CLI.DoesNotReturn; 1547 bool isVarArg = CLI.IsVarArg; 1548 1549 MachineFunction &MF = DAG.getMachineFunction(); 1550 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1551 bool isThisReturn = false; 1552 bool isSibCall = false; 1553 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); 1554 1555 // Disable tail calls if they're not supported. 1556 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 1557 isTailCall = false; 1558 1559 if (isTailCall) { 1560 // Check if it's really possible to do a tail call. 1561 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1562 isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), 1563 Outs, OutVals, Ins, DAG); 1564 if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) 1565 report_fatal_error("failed to perform tail call elimination on a call " 1566 "site marked musttail"); 1567 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1568 // detected sibcalls. 1569 if (isTailCall) { 1570 ++NumTailCalls; 1571 isSibCall = true; 1572 } 1573 } 1574 1575 // Analyze operands of the call, assigning locations to each operand. 
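// For illustration: under ARM_AAPCS a call such as f(i32, f64, i32) is
// assigned R0 for the first argument, the register pair R2:R3 for the f64
// (R1 is skipped to satisfy the pair's 8-byte alignment), and a stack slot at
// offset 0 for the trailing i32. The exact assignment comes from the
// generated CC_ARM_* tables included above.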
1576 SmallVector<CCValAssign, 16> ArgLocs; 1577 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1578 *DAG.getContext(), Call); 1579 CCInfo.AnalyzeCallOperands(Outs, 1580 CCAssignFnForNode(CallConv, /* Return*/ false, 1581 isVarArg)); 1582 1583 // Get a count of how many bytes are to be pushed on the stack. 1584 unsigned NumBytes = CCInfo.getNextStackOffset(); 1585 1586 // For tail calls, memory operands are available in our caller's stack. 1587 if (isSibCall) 1588 NumBytes = 0; 1589 1590 // Adjust the stack pointer for the new arguments... 1591 // These operations are automatically eliminated by the prolog/epilog pass 1592 if (!isSibCall) 1593 Chain = DAG.getCALLSEQ_START(Chain, 1594 DAG.getIntPtrConstant(NumBytes, dl, true), dl); 1595 1596 SDValue StackPtr = 1597 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 1598 1599 RegsToPassVector RegsToPass; 1600 SmallVector<SDValue, 8> MemOpChains; 1601 1602 // Walk the register/memloc assignments, inserting copies/loads. In the case 1603 // of tail call optimization, arguments are handled later. 1604 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1605 i != e; 1606 ++i, ++realArgIdx) { 1607 CCValAssign &VA = ArgLocs[i]; 1608 SDValue Arg = OutVals[realArgIdx]; 1609 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1610 bool isByVal = Flags.isByVal(); 1611 1612 // Promote the value if needed. 1613 switch (VA.getLocInfo()) { 1614 default: llvm_unreachable("Unknown loc info!"); 1615 case CCValAssign::Full: break; 1616 case CCValAssign::SExt: 1617 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1618 break; 1619 case CCValAssign::ZExt: 1620 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1621 break; 1622 case CCValAssign::AExt: 1623 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1624 break; 1625 case CCValAssign::BCvt: 1626 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1627 break; 1628 } 1629 1630 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 1631 if (VA.needsCustom()) { 1632 if (VA.getLocVT() == MVT::v2f64) { 1633 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1634 DAG.getConstant(0, dl, MVT::i32)); 1635 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1636 DAG.getConstant(1, dl, MVT::i32)); 1637 1638 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 1639 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1640 1641 VA = ArgLocs[++i]; // skip ahead to next loc 1642 if (VA.isRegLoc()) { 1643 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 1644 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1645 } else { 1646 assert(VA.isMemLoc()); 1647 1648 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 1649 dl, DAG, VA, Flags)); 1650 } 1651 } else { 1652 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 1653 StackPtr, MemOpChains, Flags); 1654 } 1655 } else if (VA.isRegLoc()) { 1656 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { 1657 assert(VA.getLocVT() == MVT::i32 && 1658 "unexpected calling convention register assignment"); 1659 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 1660 "unexpected use of 'returned'"); 1661 isThisReturn = true; 1662 } 1663 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1664 } else if (isByVal) { 1665 assert(VA.isMemLoc()); 1666 unsigned offset = 0; 1667 1668 // True if this byval aggregate will be split between registers 1669 // and memory. 
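// (Illustrative: a 12-byte byval struct for which only R3 was still free gets
// its first 4 bytes loaded into R3 by the loop below, while the remaining
// 8 bytes are copied to the outgoing stack area by the
// ARMISD::COPY_STRUCT_BYVAL node.)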
1670 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 1671 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 1672 1673 if (CurByValIdx < ByValArgsCount) { 1674 1675 unsigned RegBegin, RegEnd; 1676 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 1677 1678 EVT PtrVT = 1679 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 1680 unsigned int i, j; 1681 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 1682 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 1683 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 1684 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 1685 MachinePointerInfo(), 1686 false, false, false, 1687 DAG.InferPtrAlignment(AddArg)); 1688 MemOpChains.push_back(Load.getValue(1)); 1689 RegsToPass.push_back(std::make_pair(j, Load)); 1690 } 1691 1692 // If the parameter size exceeds the register area, the "offset" value 1693 // helps us calculate the stack slot for the remaining part properly. 1694 offset = RegEnd - RegBegin; 1695 1696 CCInfo.nextInRegsParam(); 1697 } 1698 1699 if (Flags.getByValSize() > 4*offset) { 1700 auto PtrVT = getPointerTy(DAG.getDataLayout()); 1701 unsigned LocMemOffset = VA.getLocMemOffset(); 1702 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1703 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 1704 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 1705 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 1706 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 1707 MVT::i32); 1708 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 1709 MVT::i32); 1710 1711 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 1712 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 1713 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 1714 Ops)); 1715 } 1716 } else if (!isSibCall) { 1717 assert(VA.isMemLoc()); 1718 1719 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1720 dl, DAG, VA, Flags)); 1721 } 1722 } 1723 1724 if (!MemOpChains.empty()) 1725 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 1726 1727 // Build a sequence of copy-to-reg nodes chained together with token chain 1728 // and flag operands which copy the outgoing args into the appropriate regs. 1729 SDValue InFlag; 1730 // Tail call byval lowering might overwrite argument registers so in case of 1731 // tail call optimization the copies to registers are lowered later. 1732 if (!isTailCall) 1733 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1734 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1735 RegsToPass[i].second, InFlag); 1736 InFlag = Chain.getValue(1); 1737 } 1738 1739 // For tail calls, lower the arguments to the 'real' stack slot. 1740 if (isTailCall) { 1741 // Force all the incoming stack arguments to be loaded from the stack 1742 // before any new outgoing arguments are stored to the stack, because the 1743 // outgoing stack slots may alias the incoming argument stack slots, and 1744 // the alias isn't otherwise explicit. This is slightly more conservative 1745 // than necessary, because it means that each store effectively depends 1746 // on every argument instead of just those arguments it would clobber. 1747 1748 // Do not flag preceding copytoreg stuff together with the following stuff.
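// (Clearing InFlag below starts a fresh glue chain: glue forces glued nodes
// to stay adjacent during scheduling, and the register copies emitted for a
// tail call should not be tied to the nodes created above.)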
1749 InFlag = SDValue(); 1750 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1751 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1752 RegsToPass[i].second, InFlag); 1753 InFlag = Chain.getValue(1); 1754 } 1755 InFlag = SDValue(); 1756 } 1757 1758 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 1759 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 1760 // node so that legalize doesn't hack it. 1761 bool isDirect = false; 1762 bool isARMFunc = false; 1763 bool isLocalARMFunc = false; 1764 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 1765 auto PtrVt = getPointerTy(DAG.getDataLayout()); 1766 1767 if (Subtarget->genLongCalls()) { 1768 assert((Subtarget->isTargetWindows() || 1769 getTargetMachine().getRelocationModel() == Reloc::Static) && 1770 "long-calls with non-static relocation model!"); 1771 // Handle a global address or an external symbol. If it's not one of 1772 // those, the target's already in a register, so we don't need to do 1773 // anything extra. 1774 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1775 const GlobalValue *GV = G->getGlobal(); 1776 // Create a constant pool entry for the callee address 1777 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1778 ARMConstantPoolValue *CPV = 1779 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 1780 1781 // Get the address of the callee into a register 1782 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1783 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1784 Callee = DAG.getLoad( 1785 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1786 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1787 false, false, 0); 1788 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 1789 const char *Sym = S->getSymbol(); 1790 1791 // Create a constant pool entry for the callee address 1792 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1793 ARMConstantPoolValue *CPV = 1794 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1795 ARMPCLabelIndex, 0); 1796 // Get the address of the callee into a register 1797 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1798 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1799 Callee = DAG.getLoad( 1800 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1801 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1802 false, false, 0); 1803 } 1804 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1805 const GlobalValue *GV = G->getGlobal(); 1806 isDirect = true; 1807 bool isDef = GV->isStrongDefinitionForLinker(); 1808 bool isStub = (!isDef && Subtarget->isTargetMachO()) && 1809 getTargetMachine().getRelocationModel() != Reloc::Static; 1810 isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 1811 // ARM call to a local ARM function is predicable. 1812 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 1813 // tBX takes a register source operand. 
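// (Thumb1 without v5T has no BLX, so a call through a MachO stub cannot be a
// direct branch-and-link; the block below loads the callee's address out of
// its non-lazy pointer so the call can be made through a register.)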
1814 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1815 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 1816 Callee = DAG.getNode( 1817 ARMISD::WrapperPIC, dl, PtrVt, 1818 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 1819 Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, 1820 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 1821 false, false, true, 0); 1822 } else if (Subtarget->isTargetCOFF()) { 1823 assert(Subtarget->isTargetWindows() && 1824 "Windows is the only supported COFF target"); 1825 unsigned TargetFlags = GV->hasDLLImportStorageClass() 1826 ? ARMII::MO_DLLIMPORT 1827 : ARMII::MO_NO_FLAG; 1828 Callee = 1829 DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); 1830 if (GV->hasDLLImportStorageClass()) 1831 Callee = 1832 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 1833 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 1834 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 1835 false, false, false, 0); 1836 } else { 1837 // On ELF targets for PIC code, direct calls should go through the PLT 1838 unsigned OpFlags = 0; 1839 if (Subtarget->isTargetELF() && 1840 getTargetMachine().getRelocationModel() == Reloc::PIC_) 1841 OpFlags = ARMII::MO_PLT; 1842 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); 1843 } 1844 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1845 isDirect = true; 1846 bool isStub = Subtarget->isTargetMachO() && 1847 getTargetMachine().getRelocationModel() != Reloc::Static; 1848 isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 1849 // tBX takes a register source operand. 1850 const char *Sym = S->getSymbol(); 1851 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1852 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1853 ARMConstantPoolValue *CPV = 1854 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1855 ARMPCLabelIndex, 4); 1856 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1857 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1858 Callee = DAG.getLoad( 1859 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1860 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 1861 false, false, 0); 1862 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 1863 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 1864 } else { 1865 unsigned OpFlags = 0; 1866 // On ELF targets for PIC code, direct calls should go through the PLT 1867 if (Subtarget->isTargetELF() && 1868 getTargetMachine().getRelocationModel() == Reloc::PIC_) 1869 OpFlags = ARMII::MO_PLT; 1870 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); 1871 } 1872 } 1873 1874 // FIXME: handle tail calls differently. 1875 unsigned CallOpc; 1876 if (Subtarget->isThumb()) { 1877 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 1878 CallOpc = ARMISD::CALL_NOLINK; 1879 else 1880 CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL; 1881 } else { 1882 if (!isDirect && !Subtarget->hasV5TOps()) 1883 CallOpc = ARMISD::CALL_NOLINK; 1884 else if (doesNotRet && isDirect && Subtarget->hasRAS() && 1885 // Emit regular call when code size is the priority 1886 !MF.getFunction()->optForMinSize()) 1887 // "mov lr, pc; b _foo" to avoid confusing the RSP 1888 CallOpc = ARMISD::CALL_NOLINK; 1889 else 1890 CallOpc = isLocalARMFunc ? 
ARMISD::CALL_PRED : ARMISD::CALL; 1891 } 1892 1893 std::vector<SDValue> Ops; 1894 Ops.push_back(Chain); 1895 Ops.push_back(Callee); 1896 1897 // Add argument registers to the end of the list so that they are known live 1898 // into the call. 1899 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1900 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1901 RegsToPass[i].second.getValueType())); 1902 1903 // Add a register mask operand representing the call-preserved registers. 1904 if (!isTailCall) { 1905 const uint32_t *Mask; 1906 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 1907 if (isThisReturn) { 1908 // For 'this' returns, use the R0-preserving mask if applicable 1909 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 1910 if (!Mask) { 1911 // Set isThisReturn to false if the calling convention is not one that 1912 // allows 'returned' to be modeled in this way, so LowerCallResult does 1913 // not try to pass 'this' straight through 1914 isThisReturn = false; 1915 Mask = ARI->getCallPreservedMask(MF, CallConv); 1916 } 1917 } else 1918 Mask = ARI->getCallPreservedMask(MF, CallConv); 1919 1920 assert(Mask && "Missing call preserved mask for calling convention"); 1921 Ops.push_back(DAG.getRegisterMask(Mask)); 1922 } 1923 1924 if (InFlag.getNode()) 1925 Ops.push_back(InFlag); 1926 1927 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 1928 if (isTailCall) { 1929 MF.getFrameInfo()->setHasTailCall(); 1930 return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 1931 } 1932 1933 // Returns a chain and a flag for retval copy to use. 1934 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 1935 InFlag = Chain.getValue(1); 1936 1937 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 1938 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 1939 if (!Ins.empty()) 1940 InFlag = Chain.getValue(1); 1941 1942 // Handle result values, copying them out of physregs into vregs that we 1943 // return. 1944 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 1945 InVals, isThisReturn, 1946 isThisReturn ? OutVals[0] : SDValue()); 1947 } 1948 1949 /// HandleByVal - Every parameter *after* a byval parameter is passed 1950 /// on the stack. Remember the next parameter register to allocate, 1951 /// and then confiscate the rest of the parameter registers to ensure 1952 /// this. 1953 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, 1954 unsigned Align) const { 1955 assert((State->getCallOrPrologue() == Prologue || 1956 State->getCallOrPrologue() == Call) && 1957 "unhandled ParmContext"); 1958 1959 // Byval (as with any stack) slots are always at least 4 byte aligned. 1960 Align = std::max(Align, 4U); 1961 1962 unsigned Reg = State->AllocateReg(GPRArgRegs); 1963 if (!Reg) 1964 return; 1965 1966 unsigned AlignInRegs = Align / 4; 1967 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; 1968 for (unsigned i = 0; i < Waste; ++i) 1969 Reg = State->AllocateReg(GPRArgRegs); 1970 1971 if (!Reg) 1972 return; 1973 1974 unsigned Excess = 4 * (ARM::R4 - Reg); 1975 1976 // Special case when NSAA != SP and the parameter size is greater than the 1977 // size of all remaining GPR regs. In that case we can't split the parameter; 1978 // we must send it to the stack. We also must set NCRN to R4, so all 1979 // remaining registers are wasted.
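// Worked example (illustrative): for a 16-byte byval with 8-byte alignment
// when R1 is the next free argument register, one register (R1) is wasted to
// reach an even-register boundary, R2 and R3 receive the first 8 bytes, and
// the remaining 8 bytes go to the stack (Size is reduced accordingly below).
// The special case just described is handled first: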
1980 const unsigned NSAAOffset = State->getNextStackOffset(); 1981 if (NSAAOffset != 0 && Size > Excess) { 1982 while (State->AllocateReg(GPRArgRegs)) 1983 ; 1984 return; 1985 } 1986 1987 // The first register for the byval parameter is the first register that 1988 // wasn't allocated before this method call, i.e. "reg". 1989 // If the parameter is small enough to be saved in the range [reg, r4), then 1990 // the end (first after last) register would be reg + param-size-in-regs; 1991 // otherwise the parameter is split between registers and stack, and 1992 // the end register would be r4 in this case. 1993 unsigned ByValRegBegin = Reg; 1994 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 1995 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 1996 // Note that the first register was already allocated at the beginning of 1997 // this function; allocate the remaining registers we need. 1998 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 1999 State->AllocateReg(GPRArgRegs); 2000 // A byval parameter that is split between registers and memory needs its 2001 // size truncated here. 2002 // In the case where the entire structure fits in registers, we set the 2003 // size in memory to zero. 2004 Size = std::max<int>(Size - Excess, 0); 2005 } 2006 2007 /// MatchingStackOffset - Return true if the given stack call argument is 2008 /// already available in the same position (relatively) of the caller's 2009 /// incoming argument stack. 2010 static 2011 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2012 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2013 const TargetInstrInfo *TII) { 2014 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2015 int FI = INT_MAX; 2016 if (Arg.getOpcode() == ISD::CopyFromReg) { 2017 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2018 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2019 return false; 2020 MachineInstr *Def = MRI->getVRegDef(VR); 2021 if (!Def) 2022 return false; 2023 if (!Flags.isByVal()) { 2024 if (!TII->isLoadFromStackSlot(Def, FI)) 2025 return false; 2026 } else { 2027 return false; 2028 } 2029 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2030 if (Flags.isByVal()) 2031 // ByVal argument is passed in as a pointer but it's now being 2032 // dereferenced. e.g. 2033 // define @foo(%struct.X* %A) { 2034 // tail call @bar(%struct.X* byval %A) 2035 // } 2036 return false; 2037 SDValue Ptr = Ld->getBasePtr(); 2038 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2039 if (!FINode) 2040 return false; 2041 FI = FINode->getIndex(); 2042 } else 2043 return false; 2044 2045 assert(FI != INT_MAX); 2046 if (!MFI->isFixedObjectIndex(FI)) 2047 return false; 2048 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2049 } 2050 2051 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2052 /// for tail call optimization. Targets which want to do tail call 2053 /// optimization should implement this function.
2054 bool 2055 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2056 CallingConv::ID CalleeCC, 2057 bool isVarArg, 2058 bool isCalleeStructRet, 2059 bool isCallerStructRet, 2060 const SmallVectorImpl<ISD::OutputArg> &Outs, 2061 const SmallVectorImpl<SDValue> &OutVals, 2062 const SmallVectorImpl<ISD::InputArg> &Ins, 2063 SelectionDAG& DAG) const { 2064 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2065 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2066 bool CCMatch = CallerCC == CalleeCC; 2067 2068 assert(Subtarget->supportsTailCall()); 2069 2070 // Look for obvious safe cases to perform tail call optimization that do not 2071 // require ABI changes. This is what gcc calls sibcall. 2072 2073 // Do not sibcall optimize vararg calls unless the call site is not passing 2074 // any arguments. 2075 if (isVarArg && !Outs.empty()) 2076 return false; 2077 2078 // Exception-handling functions need a special set of instructions to indicate 2079 // a return to the hardware. Tail-calling another function would probably 2080 // break this. 2081 if (CallerF->hasFnAttribute("interrupt")) 2082 return false; 2083 2084 // Also avoid sibcall optimization if either caller or callee uses struct 2085 // return semantics. 2086 if (isCalleeStructRet || isCallerStructRet) 2087 return false; 2088 2089 // Externally-defined functions with weak linkage should not be 2090 // tail-called on ARM when the OS does not support dynamic 2091 // pre-emption of symbols, as the AAELF spec requires normal calls 2092 // to undefined weak functions to be replaced with a NOP or jump to the 2093 // next instruction. The behaviour of branch instructions in this 2094 // situation (as used for tail calls) is implementation-defined, so we 2095 // cannot rely on the linker replacing the tail call with a return. 2096 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2097 const GlobalValue *GV = G->getGlobal(); 2098 const Triple &TT = getTargetMachine().getTargetTriple(); 2099 if (GV->hasExternalWeakLinkage() && 2100 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2101 return false; 2102 } 2103 2104 // If the calling conventions do not match, then we'd better make sure the 2105 // results are returned in the same way as what the caller expects. 2106 if (!CCMatch) { 2107 SmallVector<CCValAssign, 16> RVLocs1; 2108 ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 2109 *DAG.getContext(), Call); 2110 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); 2111 2112 SmallVector<CCValAssign, 16> RVLocs2; 2113 ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 2114 *DAG.getContext(), Call); 2115 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); 2116 2117 if (RVLocs1.size() != RVLocs2.size()) 2118 return false; 2119 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2120 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2121 return false; 2122 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2123 return false; 2124 if (RVLocs1[i].isRegLoc()) { 2125 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2126 return false; 2127 } else { 2128 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2129 return false; 2130 } 2131 } 2132 } 2133 2134 // If Caller's vararg or byval argument has been split between registers and 2135 // stack, do not perform tail call, since part of the argument is in caller's 2136 // local frame. 
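// (getArgRegsSaveSize() is non-zero precisely when the prologue spilled some
// of R0-R3 next to the incoming stack arguments, e.g. for a variadic
// function, which is the situation described above.)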
2137 const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction(). 2138 getInfo<ARMFunctionInfo>(); 2139 if (AFI_Caller->getArgRegsSaveSize()) 2140 return false; 2141 2142 // If the callee takes no arguments then go on to check the results of the 2143 // call. 2144 if (!Outs.empty()) { 2145 // Check if stack adjustment is needed. For now, do not do this if any 2146 // argument is passed on the stack. 2147 SmallVector<CCValAssign, 16> ArgLocs; 2148 ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 2149 *DAG.getContext(), Call); 2150 CCInfo.AnalyzeCallOperands(Outs, 2151 CCAssignFnForNode(CalleeCC, false, isVarArg)); 2152 if (CCInfo.getNextStackOffset()) { 2153 MachineFunction &MF = DAG.getMachineFunction(); 2154 2155 // Check if the arguments are already laid out in the right way as 2156 // the caller's fixed stack objects. 2157 MachineFrameInfo *MFI = MF.getFrameInfo(); 2158 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2159 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2160 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2161 i != e; 2162 ++i, ++realArgIdx) { 2163 CCValAssign &VA = ArgLocs[i]; 2164 EVT RegVT = VA.getLocVT(); 2165 SDValue Arg = OutVals[realArgIdx]; 2166 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2167 if (VA.getLocInfo() == CCValAssign::Indirect) 2168 return false; 2169 if (VA.needsCustom()) { 2170 // f64 and vector types are split into multiple registers or 2171 // register/stack-slot combinations. The types will not match 2172 // the registers; give up on memory f64 refs until we figure 2173 // out what to do about this. 2174 if (!VA.isRegLoc()) 2175 return false; 2176 if (!ArgLocs[++i].isRegLoc()) 2177 return false; 2178 if (RegVT == MVT::v2f64) { 2179 if (!ArgLocs[++i].isRegLoc()) 2180 return false; 2181 if (!ArgLocs[++i].isRegLoc()) 2182 return false; 2183 } 2184 } else if (!VA.isRegLoc()) { 2185 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2186 MFI, MRI, TII)) 2187 return false; 2188 } 2189 } 2190 } 2191 } 2192 2193 return true; 2194 } 2195 2196 bool 2197 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2198 MachineFunction &MF, bool isVarArg, 2199 const SmallVectorImpl<ISD::OutputArg> &Outs, 2200 LLVMContext &Context) const { 2201 SmallVector<CCValAssign, 16> RVLocs; 2202 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2203 return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true, 2204 isVarArg)); 2205 } 2206 2207 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2208 SDLoc DL, SelectionDAG &DAG) { 2209 const MachineFunction &MF = DAG.getMachineFunction(); 2210 const Function *F = MF.getFunction(); 2211 2212 StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); 2213 2214 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2215 // version of the "preferred return address". These offsets affect the return 2216 // instruction if this is a return from PL1 without hypervisor extensions. 2217 // IRQ/FIQ: +4 "subs pc, lr, #4" 2218 // SWI: 0 "subs pc, lr, #0" 2219 // ABORT: +4 "subs pc, lr, #4" 2220 // UNDEF: +4/+2 "subs pc, lr, #0" 2221 // UNDEF varies depending on where the exception came from ARM or Thumb 2222 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 
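// For example, a handler marked __attribute__((interrupt("IRQ"))) therefore
// returns with "subs pc, lr, #4", undoing the offset applied on exception
// entry, whereas an "SWI" handler returns with "subs pc, lr, #0".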
2223 2224 int64_t LROffset; 2225 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2226 IntKind == "ABORT") 2227 LROffset = 4; 2228 else if (IntKind == "SWI" || IntKind == "UNDEF") 2229 LROffset = 0; 2230 else 2231 report_fatal_error("Unsupported interrupt attribute. If present, value " 2232 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2233 2234 RetOps.insert(RetOps.begin() + 1, 2235 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2236 2237 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2238 } 2239 2240 SDValue 2241 ARMTargetLowering::LowerReturn(SDValue Chain, 2242 CallingConv::ID CallConv, bool isVarArg, 2243 const SmallVectorImpl<ISD::OutputArg> &Outs, 2244 const SmallVectorImpl<SDValue> &OutVals, 2245 SDLoc dl, SelectionDAG &DAG) const { 2246 2247 // CCValAssign - represent the assignment of the return value to a location. 2248 SmallVector<CCValAssign, 16> RVLocs; 2249 2250 // CCState - Info about the registers and stack slots. 2251 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2252 *DAG.getContext(), Call); 2253 2254 // Analyze outgoing return values. 2255 CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, 2256 isVarArg)); 2257 2258 SDValue Flag; 2259 SmallVector<SDValue, 4> RetOps; 2260 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2261 bool isLittleEndian = Subtarget->isLittle(); 2262 2263 MachineFunction &MF = DAG.getMachineFunction(); 2264 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2265 AFI->setReturnRegsCount(RVLocs.size()); 2266 2267 // Copy the result values into the output registers. 2268 for (unsigned i = 0, realRVLocIdx = 0; 2269 i != RVLocs.size(); 2270 ++i, ++realRVLocIdx) { 2271 CCValAssign &VA = RVLocs[i]; 2272 assert(VA.isRegLoc() && "Can only return in registers!"); 2273 2274 SDValue Arg = OutVals[realRVLocIdx]; 2275 2276 switch (VA.getLocInfo()) { 2277 default: llvm_unreachable("Unknown loc info!"); 2278 case CCValAssign::Full: break; 2279 case CCValAssign::BCvt: 2280 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2281 break; 2282 } 2283 2284 if (VA.needsCustom()) { 2285 if (VA.getLocVT() == MVT::v2f64) { 2286 // Extract the first half and return it in two registers. 2287 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2288 DAG.getConstant(0, dl, MVT::i32)); 2289 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2290 DAG.getVTList(MVT::i32, MVT::i32), Half); 2291 2292 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2293 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2294 Flag); 2295 Flag = Chain.getValue(1); 2296 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2297 VA = RVLocs[++i]; // skip ahead to next loc 2298 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2299 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2300 Flag); 2301 Flag = Chain.getValue(1); 2302 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2303 VA = RVLocs[++i]; // skip ahead to next loc 2304 2305 // Extract the 2nd half and fall through to handle it as an f64 value. 2306 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2307 DAG.getConstant(1, dl, MVT::i32)); 2308 } 2309 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2310 // available. 2311 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2312 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2313 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2314 fmrrd.getValue(isLittleEndian ? 
0 : 1), 2315 Flag); 2316 Flag = Chain.getValue(1); 2317 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2318 VA = RVLocs[++i]; // skip ahead to next loc 2319 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2320 fmrrd.getValue(isLittleEndian ? 1 : 0), 2321 Flag); 2322 } else 2323 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2324 2325 // Guarantee that all emitted copies are 2326 // stuck together, avoiding something bad. 2327 Flag = Chain.getValue(1); 2328 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2329 } 2330 2331 // Update chain and glue. 2332 RetOps[0] = Chain; 2333 if (Flag.getNode()) 2334 RetOps.push_back(Flag); 2335 2336 // CPUs which aren't M-class use a special sequence to return from 2337 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2338 // though we use "subs pc, lr, #N"). 2339 // 2340 // M-class CPUs actually use a normal return sequence with a special 2341 // (hardware-provided) value in LR, so the normal code path works. 2342 if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && 2343 !Subtarget->isMClass()) { 2344 if (Subtarget->isThumb1Only()) 2345 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2346 return LowerInterruptReturn(RetOps, dl, DAG); 2347 } 2348 2349 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2350 } 2351 2352 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2353 if (N->getNumValues() != 1) 2354 return false; 2355 if (!N->hasNUsesOfValue(1, 0)) 2356 return false; 2357 2358 SDValue TCChain = Chain; 2359 SDNode *Copy = *N->use_begin(); 2360 if (Copy->getOpcode() == ISD::CopyToReg) { 2361 // If the copy has a glue operand, we conservatively assume it isn't safe to 2362 // perform a tail call. 2363 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2364 return false; 2365 TCChain = Copy->getOperand(0); 2366 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2367 SDNode *VMov = Copy; 2368 // f64 returned in a pair of GPRs. 2369 SmallPtrSet<SDNode*, 2> Copies; 2370 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2371 UI != UE; ++UI) { 2372 if (UI->getOpcode() != ISD::CopyToReg) 2373 return false; 2374 Copies.insert(*UI); 2375 } 2376 if (Copies.size() > 2) 2377 return false; 2378 2379 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2380 UI != UE; ++UI) { 2381 SDValue UseChain = UI->getOperand(0); 2382 if (Copies.count(UseChain.getNode())) 2383 // Second CopyToReg 2384 Copy = *UI; 2385 else { 2386 // We are at the top of this chain. 2387 // If the copy has a glue operand, we conservatively assume it 2388 // isn't safe to perform a tail call. 2389 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2390 return false; 2391 // First CopyToReg 2392 TCChain = UseChain; 2393 } 2394 } 2395 } else if (Copy->getOpcode() == ISD::BITCAST) { 2396 // f32 returned in a single GPR. 2397 if (!Copy->hasOneUse()) 2398 return false; 2399 Copy = *Copy->use_begin(); 2400 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2401 return false; 2402 // If the copy has a glue operand, we conservatively assume it isn't safe to 2403 // perform a tail call. 
2404 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2405 return false; 2406 TCChain = Copy->getOperand(0); 2407 } else { 2408 return false; 2409 } 2410 2411 bool HasRet = false; 2412 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2413 UI != UE; ++UI) { 2414 if (UI->getOpcode() != ARMISD::RET_FLAG && 2415 UI->getOpcode() != ARMISD::INTRET_FLAG) 2416 return false; 2417 HasRet = true; 2418 } 2419 2420 if (!HasRet) 2421 return false; 2422 2423 Chain = TCChain; 2424 return true; 2425 } 2426 2427 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2428 if (!Subtarget->supportsTailCall()) 2429 return false; 2430 2431 auto Attr = 2432 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2433 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2434 return false; 2435 2436 return true; 2437 } 2438 2439 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2440 // and pass the lower and high parts through. 2441 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2442 SDLoc DL(Op); 2443 SDValue WriteValue = Op->getOperand(2); 2444 2445 // This function is only supposed to be called for i64 type argument. 2446 assert(WriteValue.getValueType() == MVT::i64 2447 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2448 2449 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2450 DAG.getConstant(0, DL, MVT::i32)); 2451 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2452 DAG.getConstant(1, DL, MVT::i32)); 2453 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2454 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 2455 } 2456 2457 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2458 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2459 // one of the above mentioned nodes. It has to be wrapped because otherwise 2460 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2461 // be used to form addressing mode. These wrapped nodes will be selected 2462 // into MOVi. 2463 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 2464 EVT PtrVT = Op.getValueType(); 2465 // FIXME there is no actual debug info here 2466 SDLoc dl(Op); 2467 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2468 SDValue Res; 2469 if (CP->isMachineConstantPoolEntry()) 2470 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 2471 CP->getAlignment()); 2472 else 2473 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 2474 CP->getAlignment()); 2475 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 2476 } 2477 2478 unsigned ARMTargetLowering::getJumpTableEncoding() const { 2479 return MachineJumpTableInfo::EK_Inline; 2480 } 2481 2482 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 2483 SelectionDAG &DAG) const { 2484 MachineFunction &MF = DAG.getMachineFunction(); 2485 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2486 unsigned ARMPCLabelIndex = 0; 2487 SDLoc DL(Op); 2488 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2489 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 2490 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 2491 SDValue CPAddr; 2492 if (RelocM == Reloc::Static) { 2493 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 2494 } else { 2495 unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; 2496 ARMPCLabelIndex = AFI->createPICLabelUId(); 2497 ARMConstantPoolValue *CPV = 2498 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 2499 ARMCP::CPBlockAddress, PCAdj); 2500 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2501 } 2502 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 2503 SDValue Result = 2504 DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, 2505 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2506 false, false, false, 0); 2507 if (RelocM == Reloc::Static) 2508 return Result; 2509 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 2510 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 2511 } 2512 2513 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 2514 SDValue 2515 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 2516 SelectionDAG &DAG) const { 2517 SDLoc dl(GA); 2518 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2519 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 2520 MachineFunction &MF = DAG.getMachineFunction(); 2521 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2522 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2523 ARMConstantPoolValue *CPV = 2524 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2525 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 2526 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2527 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 2528 Argument = 2529 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, 2530 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2531 false, false, false, 0); 2532 SDValue Chain = Argument.getValue(1); 2533 2534 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2535 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 2536 2537 // call __tls_get_addr. 2538 ArgListTy Args; 2539 ArgListEntry Entry; 2540 Entry.Node = Argument; 2541 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 2542 Args.push_back(Entry); 2543 2544 // FIXME: is there useful debug info available here? 2545 TargetLowering::CallLoweringInfo CLI(DAG); 2546 CLI.setDebugLoc(dl).setChain(Chain) 2547 .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 2548 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args), 2549 0); 2550 2551 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2552 return CallResult.first; 2553 } 2554 2555 // Lower ISD::GlobalTLSAddress using the "initial exec" or 2556 // "local exec" model. 2557 SDValue 2558 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 2559 SelectionDAG &DAG, 2560 TLSModel::Model model) const { 2561 const GlobalValue *GV = GA->getGlobal(); 2562 SDLoc dl(GA); 2563 SDValue Offset; 2564 SDValue Chain = DAG.getEntryNode(); 2565 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2566 // Get the Thread Pointer 2567 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2568 2569 if (model == TLSModel::InitialExec) { 2570 MachineFunction &MF = DAG.getMachineFunction(); 2571 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2572 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2573 // Initial exec model. 2574 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 2575 ARMConstantPoolValue *CPV = 2576 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2577 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 2578 true); 2579 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2580 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2581 Offset = DAG.getLoad( 2582 PtrVT, dl, Chain, Offset, 2583 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2584 false, false, 0); 2585 Chain = Offset.getValue(1); 2586 2587 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2588 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 2589 2590 Offset = DAG.getLoad( 2591 PtrVT, dl, Chain, Offset, 2592 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2593 false, false, 0); 2594 } else { 2595 // local exec model 2596 assert(model == TLSModel::LocalExec); 2597 ARMConstantPoolValue *CPV = 2598 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 2599 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2600 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2601 Offset = DAG.getLoad( 2602 PtrVT, dl, Chain, Offset, 2603 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2604 false, false, 0); 2605 } 2606 2607 // The address of the thread local variable is the add of the thread 2608 // pointer with the offset of the variable. 2609 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 2610 } 2611 2612 SDValue 2613 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 2614 // TODO: implement the "local dynamic" model 2615 assert(Subtarget->isTargetELF() && 2616 "TLS not implemented for non-ELF targets"); 2617 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2618 if (DAG.getTarget().Options.EmulatedTLS) 2619 return LowerToTLSEmulatedModel(GA, DAG); 2620 2621 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 2622 2623 switch (model) { 2624 case TLSModel::GeneralDynamic: 2625 case TLSModel::LocalDynamic: 2626 return LowerToTLSGeneralDynamicModel(GA, DAG); 2627 case TLSModel::InitialExec: 2628 case TLSModel::LocalExec: 2629 return LowerToTLSExecModels(GA, DAG, model); 2630 } 2631 llvm_unreachable("bogus TLS model"); 2632 } 2633 2634 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 2635 SelectionDAG &DAG) const { 2636 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2637 SDLoc dl(Op); 2638 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2639 if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2640 bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); 2641 ARMConstantPoolValue *CPV = 2642 ARMConstantPoolConstant::Create(GV, 2643 UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); 2644 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2645 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2646 SDValue Result = DAG.getLoad( 2647 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2648 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2649 false, false, 0); 2650 SDValue Chain = Result.getValue(1); 2651 SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); 2652 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); 2653 if (!UseGOTOFF) 2654 Result = DAG.getLoad(PtrVT, dl, Chain, Result, 2655 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2656 false, false, false, 0); 2657 return Result; 2658 } 2659 2660 // If we have T2 ops, we can materialize the address directly via movt/movw 2661 // pair. This is always cheaper. 
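// (i.e. a movw/movt pair carrying the :lower16:/:upper16: halves of the
// symbol's address, instead of loading the address from a literal pool as in
// the else branch below.)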
2662 if (Subtarget->useMovt(DAG.getMachineFunction())) { 2663 ++NumMovwMovt; 2664 // FIXME: Once remat is capable of dealing with instructions with register 2665 // operands, expand this into two nodes. 2666 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 2667 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 2668 } else { 2669 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 2670 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2671 return DAG.getLoad( 2672 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2673 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2674 false, false, 0); 2675 } 2676 } 2677 2678 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 2679 SelectionDAG &DAG) const { 2680 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2681 SDLoc dl(Op); 2682 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2683 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 2684 2685 if (Subtarget->useMovt(DAG.getMachineFunction())) 2686 ++NumMovwMovt; 2687 2688 // FIXME: Once remat is capable of dealing with instructions with register 2689 // operands, expand this into multiple nodes 2690 unsigned Wrapper = 2691 RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper; 2692 2693 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 2694 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 2695 2696 if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) 2697 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 2698 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2699 false, false, false, 0); 2700 return Result; 2701 } 2702 2703 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 2704 SelectionDAG &DAG) const { 2705 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 2706 assert(Subtarget->useMovt(DAG.getMachineFunction()) && 2707 "Windows on ARM expects to use movw/movt"); 2708 2709 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 2710 const ARMII::TOF TargetFlags = 2711 (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); 2712 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2713 SDValue Result; 2714 SDLoc DL(Op); 2715 2716 ++NumMovwMovt; 2717 2718 // FIXME: Once remat is capable of dealing with instructions with register 2719 // operands, expand this into two nodes. 2720 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 2721 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 2722 TargetFlags)); 2723 if (GV->hasDLLImportStorageClass()) 2724 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 2725 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2726 false, false, false, 0); 2727 return Result; 2728 } 2729 2730 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, 2731 SelectionDAG &DAG) const { 2732 assert(Subtarget->isTargetELF() && 2733 "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); 2734 MachineFunction &MF = DAG.getMachineFunction(); 2735 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2736 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2737 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2738 SDLoc dl(Op); 2739 unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; 2740 ARMConstantPoolValue *CPV = 2741 ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", 2742 ARMPCLabelIndex, PCAdj); 2743 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2744 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2745 SDValue Result = 2746 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, 2747 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), 2748 false, false, false, 0); 2749 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2750 return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 2751 } 2752 2753 SDValue 2754 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 2755 SDLoc dl(Op); 2756 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 2757 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 2758 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 2759 Op.getOperand(1), Val); 2760 } 2761 2762 SDValue 2763 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 2764 SDLoc dl(Op); 2765 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 2766 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 2767 } 2768 2769 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 2770 SelectionDAG &DAG) const { 2771 SDLoc dl(Op); 2772 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 2773 Op.getOperand(0)); 2774 } 2775 2776 SDValue 2777 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 2778 const ARMSubtarget *Subtarget) const { 2779 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2780 SDLoc dl(Op); 2781 switch (IntNo) { 2782 default: return SDValue(); // Don't custom lower most intrinsics. 2783 case Intrinsic::arm_rbit: { 2784 assert(Op.getOperand(1).getValueType() == MVT::i32 && 2785 "RBIT intrinsic must have i32 type!"); 2786 return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); 2787 } 2788 case Intrinsic::arm_thread_pointer: { 2789 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2790 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2791 } 2792 case Intrinsic::eh_sjlj_lsda: { 2793 MachineFunction &MF = DAG.getMachineFunction(); 2794 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2795 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2796 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2797 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 2798 SDValue CPAddr; 2799 unsigned PCAdj = (RelocM != Reloc::PIC_) 2800 ? 0 : (Subtarget->isThumb() ? 4 : 8); 2801 ARMConstantPoolValue *CPV = 2802 ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, 2803 ARMCP::CPLSDA, PCAdj); 2804 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2805 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2806 SDValue Result = DAG.getLoad( 2807 PtrVT, dl, DAG.getEntryNode(), CPAddr, 2808 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, 2809 false, false, 0); 2810 2811 if (RelocM == Reloc::PIC_) { 2812 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2813 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 2814 } 2815 return Result; 2816 } 2817 case Intrinsic::arm_neon_vmulls: 2818 case Intrinsic::arm_neon_vmullu: { 2819 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 2820 ? 
ARMISD::VMULLs : ARMISD::VMULLu; 2821 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2822 Op.getOperand(1), Op.getOperand(2)); 2823 } 2824 case Intrinsic::arm_neon_vminnm: 2825 case Intrinsic::arm_neon_vmaxnm: { 2826 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 2827 ? ISD::FMINNUM : ISD::FMAXNUM; 2828 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2829 Op.getOperand(1), Op.getOperand(2)); 2830 } 2831 case Intrinsic::arm_neon_vminu: 2832 case Intrinsic::arm_neon_vmaxu: { 2833 if (Op.getValueType().isFloatingPoint()) 2834 return SDValue(); 2835 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 2836 ? ISD::UMIN : ISD::UMAX; 2837 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2838 Op.getOperand(1), Op.getOperand(2)); 2839 } 2840 case Intrinsic::arm_neon_vmins: 2841 case Intrinsic::arm_neon_vmaxs: { 2842 // v{min,max}s is overloaded between signed integers and floats. 2843 if (!Op.getValueType().isFloatingPoint()) { 2844 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 2845 ? ISD::SMIN : ISD::SMAX; 2846 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2847 Op.getOperand(1), Op.getOperand(2)); 2848 } 2849 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 2850 ? ISD::FMINNAN : ISD::FMAXNAN; 2851 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 2852 Op.getOperand(1), Op.getOperand(2)); 2853 } 2854 } 2855 } 2856 2857 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 2858 const ARMSubtarget *Subtarget) { 2859 // FIXME: handle "fence singlethread" more efficiently. 2860 SDLoc dl(Op); 2861 if (!Subtarget->hasDataBarrier()) { 2862 // Some ARMv6 cpus can support data barriers with an mcr instruction. 2863 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 2864 // here. 2865 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 2866 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 2867 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 2868 DAG.getConstant(0, dl, MVT::i32)); 2869 } 2870 2871 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 2872 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 2873 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 2874 if (Subtarget->isMClass()) { 2875 // Only a full system barrier exists in the M-class architectures. 2876 Domain = ARM_MB::SY; 2877 } else if (Subtarget->isSwift() && Ord == Release) { 2878 // Swift happens to implement ISHST barriers in a way that's compatible with 2879 // Release semantics but weaker than ISH so we'd be fools not to use 2880 // it. Beware: other processors probably don't! 2881 Domain = ARM_MB::ISHST; 2882 } 2883 2884 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 2885 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 2886 DAG.getConstant(Domain, dl, MVT::i32)); 2887 } 2888 2889 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 2890 const ARMSubtarget *Subtarget) { 2891 // ARM pre v5TE and Thumb1 does not have preload instructions. 2892 if (!(Subtarget->isThumb2() || 2893 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 2894 // Just preserve the chain. 2895 return Op.getOperand(0); 2896 2897 SDLoc dl(Op); 2898 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 2899 if (!isRead && 2900 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 2901 // ARMv7 with MP extension has PLDW. 
2902 return Op.getOperand(0); 2903 2904 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 2905 if (Subtarget->isThumb()) { 2906 // Invert the bits. 2907 isRead = ~isRead & 1; 2908 isData = ~isData & 1; 2909 } 2910 2911 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 2912 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 2913 DAG.getConstant(isData, dl, MVT::i32)); 2914 } 2915 2916 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 2917 MachineFunction &MF = DAG.getMachineFunction(); 2918 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 2919 2920 // vastart just stores the address of the VarArgsFrameIndex slot into the 2921 // memory location argument. 2922 SDLoc dl(Op); 2923 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2924 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 2925 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2926 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 2927 MachinePointerInfo(SV), false, false, 0); 2928 } 2929 2930 SDValue 2931 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, 2932 SDValue &Root, SelectionDAG &DAG, 2933 SDLoc dl) const { 2934 MachineFunction &MF = DAG.getMachineFunction(); 2935 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2936 2937 const TargetRegisterClass *RC; 2938 if (AFI->isThumb1OnlyFunction()) 2939 RC = &ARM::tGPRRegClass; 2940 else 2941 RC = &ARM::GPRRegClass; 2942 2943 // Transform the arguments stored in physical registers into virtual ones. 2944 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2945 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 2946 2947 SDValue ArgValue2; 2948 if (NextVA.isMemLoc()) { 2949 MachineFrameInfo *MFI = MF.getFrameInfo(); 2950 int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); 2951 2952 // Create load node to retrieve arguments from the stack. 2953 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 2954 ArgValue2 = DAG.getLoad( 2955 MVT::i32, dl, Root, FIN, 2956 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, 2957 false, false, 0); 2958 } else { 2959 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 2960 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 2961 } 2962 if (!Subtarget->isLittle()) 2963 std::swap (ArgValue, ArgValue2); 2964 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 2965 } 2966 2967 // The remaining GPRs hold either the beginning of variable-argument 2968 // data, or the beginning of an aggregate passed by value (usually 2969 // byval). Either way, we allocate stack slots adjacent to the data 2970 // provided by our caller, and store the unallocated registers there. 2971 // If this is a variadic function, the va_list pointer will begin with 2972 // these values; otherwise, this reassembles a (byval) structure that 2973 // was split between registers and memory. 2974 // Return: The frame index registers were stored into. 2975 int 2976 ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 2977 SDLoc dl, SDValue &Chain, 2978 const Value *OrigArg, 2979 unsigned InRegsParamRecordIdx, 2980 int ArgOffset, 2981 unsigned ArgSize) const { 2982 // Currently, two use-cases possible: 2983 // Case #1. Non-var-args function, and we meet first byval parameter. 
  //          Set up the first unallocated register as the first byval
  //          register; eat all remaining registers
  //          (these two actions are performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  //   Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers,
  //          initialize the stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store =
        DAG.getStore(Val.getValue(1), dl, Val, FIN,
                     MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack frame that the va_list pointer will start from.
void
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                        SDLoc dl, SDValue &Chain,
                                        unsigned ArgOffset,
                                        unsigned TotalArgRegsSaveSize,
                                        bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs to their spots on the
  // stack so that they may be loaded by dereferencing the result of va_next.
  // If there are no regs to be stored, just point the address past the last
  // argument passed via the stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(), 4);
  AFI->setVarArgsFrameIndex(FrameIndex);
}

SDValue
ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
                                        SDLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
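  // (E.g. under AAPCS the first four word-sized arguments normally land in
  // r0-r3 and the rest on the stack; AnalyzeFormalArguments records each one
  // below as either a register location or a memory location.)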
3066 SmallVector<CCValAssign, 16> ArgLocs; 3067 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3068 *DAG.getContext(), Prologue); 3069 CCInfo.AnalyzeFormalArguments(Ins, 3070 CCAssignFnForNode(CallConv, /* Return*/ false, 3071 isVarArg)); 3072 3073 SmallVector<SDValue, 16> ArgValues; 3074 SDValue ArgValue; 3075 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 3076 unsigned CurArgIdx = 0; 3077 3078 // Initially ArgRegsSaveSize is zero. 3079 // Then we increase this value each time we meet byval parameter. 3080 // We also increase this value in case of varargs function. 3081 AFI->setArgRegsSaveSize(0); 3082 3083 // Calculate the amount of stack space that we need to allocate to store 3084 // byval and variadic arguments that are passed in registers. 3085 // We need to know this before we allocate the first byval or variadic 3086 // argument, as they will be allocated a stack slot below the CFA (Canonical 3087 // Frame Address, the stack pointer at entry to the function). 3088 unsigned ArgRegBegin = ARM::R4; 3089 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3090 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3091 break; 3092 3093 CCValAssign &VA = ArgLocs[i]; 3094 unsigned Index = VA.getValNo(); 3095 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3096 if (!Flags.isByVal()) 3097 continue; 3098 3099 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3100 unsigned RBegin, REnd; 3101 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3102 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3103 3104 CCInfo.nextInRegsParam(); 3105 } 3106 CCInfo.rewindByValRegsInfo(); 3107 3108 int lastInsIndex = -1; 3109 if (isVarArg && MFI->hasVAStart()) { 3110 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3111 if (RegIdx != array_lengthof(GPRArgRegs)) 3112 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3113 } 3114 3115 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3116 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3117 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3118 3119 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3120 CCValAssign &VA = ArgLocs[i]; 3121 if (Ins[VA.getValNo()].isOrigArg()) { 3122 std::advance(CurOrigArg, 3123 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3124 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3125 } 3126 // Arguments stored in registers. 3127 if (VA.isRegLoc()) { 3128 EVT RegVT = VA.getLocVT(); 3129 3130 if (VA.needsCustom()) { 3131 // f64 and vector types are split up into multiple registers or 3132 // combinations of registers and stack slots. 
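        // (For example, with a soft-float calling convention an f64 arrives
        // as two i32 halves -- a GPR pair, or a GPR plus a stack word -- and
        // is reassembled with VMOVDRR in GetF64FormalArgument; a v2f64 is
        // just two such f64 pieces inserted into a vector below.)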
3133 if (VA.getLocVT() == MVT::v2f64) { 3134 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3135 Chain, DAG, dl); 3136 VA = ArgLocs[++i]; // skip ahead to next loc 3137 SDValue ArgValue2; 3138 if (VA.isMemLoc()) { 3139 int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); 3140 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3141 ArgValue2 = DAG.getLoad( 3142 MVT::f64, dl, Chain, FIN, 3143 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3144 false, false, false, 0); 3145 } else { 3146 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3147 Chain, DAG, dl); 3148 } 3149 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3150 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3151 ArgValue, ArgValue1, 3152 DAG.getIntPtrConstant(0, dl)); 3153 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3154 ArgValue, ArgValue2, 3155 DAG.getIntPtrConstant(1, dl)); 3156 } else 3157 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3158 3159 } else { 3160 const TargetRegisterClass *RC; 3161 3162 if (RegVT == MVT::f32) 3163 RC = &ARM::SPRRegClass; 3164 else if (RegVT == MVT::f64) 3165 RC = &ARM::DPRRegClass; 3166 else if (RegVT == MVT::v2f64) 3167 RC = &ARM::QPRRegClass; 3168 else if (RegVT == MVT::i32) 3169 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 3170 : &ARM::GPRRegClass; 3171 else 3172 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3173 3174 // Transform the arguments in physical registers into virtual ones. 3175 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3176 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3177 } 3178 3179 // If this is an 8 or 16-bit value, it is really passed promoted 3180 // to 32 bits. Insert an assert[sz]ext to capture this, then 3181 // truncate to the right size. 3182 switch (VA.getLocInfo()) { 3183 default: llvm_unreachable("Unknown loc info!"); 3184 case CCValAssign::Full: break; 3185 case CCValAssign::BCvt: 3186 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3187 break; 3188 case CCValAssign::SExt: 3189 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3190 DAG.getValueType(VA.getValVT())); 3191 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3192 break; 3193 case CCValAssign::ZExt: 3194 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3195 DAG.getValueType(VA.getValVT())); 3196 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3197 break; 3198 } 3199 3200 InVals.push_back(ArgValue); 3201 3202 } else { // VA.isRegLoc() 3203 3204 // sanity check 3205 assert(VA.isMemLoc()); 3206 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3207 3208 int index = VA.getValNo(); 3209 3210 // Some Ins[] entries become multiple ArgLoc[] entries. 3211 // Process them only once. 3212 if (index != lastInsIndex) 3213 { 3214 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3215 // FIXME: For now, all byval parameter objects are marked mutable. 3216 // This can be changed with more analysis. 3217 // In case of tail call optimization mark all arguments mutable. 3218 // Since they could be overwritten by lowering of arguments in case of 3219 // a tail call. 
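          // (Sketch of the byval case below: any part of the object that was
          // passed in r0-r3 is spilled by StoreByValRegs into a fixed stack
          // slot placed directly below the part passed on the stack, so the
          // function body sees one contiguous object via the frame index.)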
3220 if (Flags.isByVal()) { 3221 assert(Ins[index].isOrigArg() && 3222 "Byval arguments cannot be implicit"); 3223 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 3224 3225 int FrameIndex = StoreByValRegs( 3226 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 3227 VA.getLocMemOffset(), Flags.getByValSize()); 3228 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 3229 CCInfo.nextInRegsParam(); 3230 } else { 3231 unsigned FIOffset = VA.getLocMemOffset(); 3232 int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 3233 FIOffset, true); 3234 3235 // Create load nodes to retrieve arguments from the stack. 3236 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3237 InVals.push_back(DAG.getLoad( 3238 VA.getValVT(), dl, Chain, FIN, 3239 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3240 false, false, false, 0)); 3241 } 3242 lastInsIndex = index; 3243 } 3244 } 3245 } 3246 3247 // varargs 3248 if (isVarArg && MFI->hasVAStart()) 3249 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 3250 CCInfo.getNextStackOffset(), 3251 TotalArgRegsSaveSize); 3252 3253 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 3254 3255 return Chain; 3256 } 3257 3258 /// isFloatingPointZero - Return true if this is +0.0. 3259 static bool isFloatingPointZero(SDValue Op) { 3260 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 3261 return CFP->getValueAPF().isPosZero(); 3262 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 3263 // Maybe this has already been legalized into the constant pool? 3264 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 3265 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 3266 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 3267 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 3268 return CFP->getValueAPF().isPosZero(); 3269 } 3270 } else if (Op->getOpcode() == ISD::BITCAST && 3271 Op->getValueType(0) == MVT::f64) { 3272 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 3273 // created by LowerConstantFP(). 3274 SDValue BitcastOp = Op->getOperand(0); 3275 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { 3276 SDValue MoveOp = BitcastOp->getOperand(0); 3277 if (MoveOp->getOpcode() == ISD::TargetConstant && 3278 cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) { 3279 return true; 3280 } 3281 } 3282 } 3283 return false; 3284 } 3285 3286 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 3287 /// the given operands. 3288 SDValue 3289 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3290 SDValue &ARMcc, SelectionDAG &DAG, 3291 SDLoc dl) const { 3292 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3293 unsigned C = RHSC->getZExtValue(); 3294 if (!isLegalICmpImmediate(C)) { 3295 // Constant does not fit, try adjusting it by one? 3296 switch (CC) { 3297 default: break; 3298 case ISD::SETLT: 3299 case ISD::SETGE: 3300 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 3301 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3302 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3303 } 3304 break; 3305 case ISD::SETULT: 3306 case ISD::SETUGE: 3307 if (C != 0 && isLegalICmpImmediate(C-1)) { 3308 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3309 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3310 } 3311 break; 3312 case ISD::SETLE: 3313 case ISD::SETGT: 3314 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 3315 CC = (CC == ISD::SETLE) ? 
          ISD::SETLT : ISD::SETGE;
        RHS = DAG.getConstant(C + 1, dl, MVT::i32);
      }
      break;
    case ISD::SETULE:
    case ISD::SETUGT:
      if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
        CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
        RHS = DAG.getConstant(C + 1, dl, MVT::i32);
      }
      break;
    }
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue
ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                             SDLoc dl) const {
  assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));

  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.
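  // (For reference: a CMN-based form could set the flags directly from
  // LHS + RHS -- for the unsigned cases the carry flag is exactly the
  // overflow bit -- instead of re-deriving them from the ADD/SUB result as
  // the sequences below do.)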
3394 3395 switch (Op.getOpcode()) { 3396 default: 3397 llvm_unreachable("Unknown overflow instruction!"); 3398 case ISD::SADDO: 3399 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3400 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3401 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3402 break; 3403 case ISD::UADDO: 3404 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3405 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3406 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3407 break; 3408 case ISD::SSUBO: 3409 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3410 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3411 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3412 break; 3413 case ISD::USUBO: 3414 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3415 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3416 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3417 break; 3418 } // switch (...) 3419 3420 return std::make_pair(Value, OverflowCmp); 3421 } 3422 3423 3424 SDValue 3425 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 3426 // Let legalize expand this if it isn't a legal type yet. 3427 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3428 return SDValue(); 3429 3430 SDValue Value, OverflowCmp; 3431 SDValue ARMcc; 3432 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 3433 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3434 SDLoc dl(Op); 3435 // We use 0 and 1 as false and true values. 3436 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3437 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3438 EVT VT = Op.getValueType(); 3439 3440 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 3441 ARMcc, CCR, OverflowCmp); 3442 3443 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3444 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3445 } 3446 3447 3448 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 3449 SDValue Cond = Op.getOperand(0); 3450 SDValue SelectTrue = Op.getOperand(1); 3451 SDValue SelectFalse = Op.getOperand(2); 3452 SDLoc dl(Op); 3453 unsigned Opc = Cond.getOpcode(); 3454 3455 if (Cond.getResNo() == 1 && 3456 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3457 Opc == ISD::USUBO)) { 3458 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 3459 return SDValue(); 3460 3461 SDValue Value, OverflowCmp; 3462 SDValue ARMcc; 3463 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 3464 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3465 EVT VT = Op.getValueType(); 3466 3467 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 3468 OverflowCmp, DAG); 3469 } 3470 3471 // Convert: 3472 // 3473 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 3474 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 3475 // 3476 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 3477 const ConstantSDNode *CMOVTrue = 3478 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 3479 const ConstantSDNode *CMOVFalse = 3480 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 3481 3482 if (CMOVTrue && CMOVFalse) { 3483 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 3484 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 3485 3486 SDValue True; 3487 SDValue False; 3488 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 3489 True = SelectTrue; 3490 False = SelectFalse; 3491 } else if (CMOVTrueVal 
== 0 && CMOVFalseVal == 1) { 3492 True = SelectFalse; 3493 False = SelectTrue; 3494 } 3495 3496 if (True.getNode() && False.getNode()) { 3497 EVT VT = Op.getValueType(); 3498 SDValue ARMcc = Cond.getOperand(2); 3499 SDValue CCR = Cond.getOperand(3); 3500 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 3501 assert(True.getValueType() == VT); 3502 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 3503 } 3504 } 3505 } 3506 3507 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 3508 // undefined bits before doing a full-word comparison with zero. 3509 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 3510 DAG.getConstant(1, dl, Cond.getValueType())); 3511 3512 return DAG.getSelectCC(dl, Cond, 3513 DAG.getConstant(0, dl, Cond.getValueType()), 3514 SelectTrue, SelectFalse, ISD::SETNE); 3515 } 3516 3517 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 3518 bool &swpCmpOps, bool &swpVselOps) { 3519 // Start by selecting the GE condition code for opcodes that return true for 3520 // 'equality' 3521 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 3522 CC == ISD::SETULE) 3523 CondCode = ARMCC::GE; 3524 3525 // and GT for opcodes that return false for 'equality'. 3526 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 3527 CC == ISD::SETULT) 3528 CondCode = ARMCC::GT; 3529 3530 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 3531 // to swap the compare operands. 3532 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 3533 CC == ISD::SETULT) 3534 swpCmpOps = true; 3535 3536 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 3537 // If we have an unordered opcode, we need to swap the operands to the VSEL 3538 // instruction (effectively negating the condition). 3539 // 3540 // This also has the effect of swapping which one of 'less' or 'greater' 3541 // returns true, so we also swap the compare operands. It also switches 3542 // whether we return true for 'equality', so we compensate by picking the 3543 // opposite condition code to our original choice. 3544 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 3545 CC == ISD::SETUGT) { 3546 swpCmpOps = !swpCmpOps; 3547 swpVselOps = !swpVselOps; 3548 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 3549 } 3550 3551 // 'ordered' is 'anything but unordered', so use the VS condition code and 3552 // swap the VSEL operands. 3553 if (CC == ISD::SETO) { 3554 CondCode = ARMCC::VS; 3555 swpVselOps = true; 3556 } 3557 3558 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 3559 // code and swap the VSEL operands. 
3560 if (CC == ISD::SETUNE) { 3561 CondCode = ARMCC::EQ; 3562 swpVselOps = true; 3563 } 3564 } 3565 3566 SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, 3567 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 3568 SDValue Cmp, SelectionDAG &DAG) const { 3569 if (Subtarget->isFPOnlySP() && VT == MVT::f64) { 3570 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3571 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 3572 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3573 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 3574 3575 SDValue TrueLow = TrueVal.getValue(0); 3576 SDValue TrueHigh = TrueVal.getValue(1); 3577 SDValue FalseLow = FalseVal.getValue(0); 3578 SDValue FalseHigh = FalseVal.getValue(1); 3579 3580 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 3581 ARMcc, CCR, Cmp); 3582 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 3583 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 3584 3585 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 3586 } else { 3587 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 3588 Cmp); 3589 } 3590 } 3591 3592 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 3593 EVT VT = Op.getValueType(); 3594 SDValue LHS = Op.getOperand(0); 3595 SDValue RHS = Op.getOperand(1); 3596 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 3597 SDValue TrueVal = Op.getOperand(2); 3598 SDValue FalseVal = Op.getOperand(3); 3599 SDLoc dl(Op); 3600 3601 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 3602 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 3603 dl); 3604 3605 // If softenSetCCOperands only returned one value, we should compare it to 3606 // zero. 3607 if (!RHS.getNode()) { 3608 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3609 CC = ISD::SETNE; 3610 } 3611 } 3612 3613 if (LHS.getValueType() == MVT::i32) { 3614 // Try to generate VSEL on ARMv8. 3615 // The VSEL instruction can't use all the usual ARM condition 3616 // codes: it only has two bits to select the condition code, so it's 3617 // constrained to use only GE, GT, VS and EQ. 3618 // 3619 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 3620 // swap the operands of the previous compare instruction (effectively 3621 // inverting the compare condition, swapping 'less' and 'greater') and 3622 // sometimes need to swap the operands to the VSEL (which inverts the 3623 // condition in the sense of firing whenever the previous condition didn't) 3624 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 3625 TrueVal.getValueType() == MVT::f64)) { 3626 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3627 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 3628 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 3629 CC = ISD::getSetCCInverse(CC, true); 3630 std::swap(TrueVal, FalseVal); 3631 } 3632 } 3633 3634 SDValue ARMcc; 3635 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3636 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 3637 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 3638 } 3639 3640 ARMCC::CondCodes CondCode, CondCode2; 3641 FPCCToARMCC(CC, CondCode, CondCode2); 3642 3643 // Try to generate VMAXNM/VMINNM on ARMv8. 
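  // (Example of the remapping done by checkVSELConstraints below: SETULT ends
  // up as a GE-predicated VSEL with the select operands swapped, since
  // !(a >= b) is exactly "a < b or unordered".)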
3644 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 3645 TrueVal.getValueType() == MVT::f64)) { 3646 bool swpCmpOps = false; 3647 bool swpVselOps = false; 3648 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 3649 3650 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 3651 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 3652 if (swpCmpOps) 3653 std::swap(LHS, RHS); 3654 if (swpVselOps) 3655 std::swap(TrueVal, FalseVal); 3656 } 3657 } 3658 3659 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3660 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 3661 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3662 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 3663 if (CondCode2 != ARMCC::AL) { 3664 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 3665 // FIXME: Needs another CMP because flag can have but one use. 3666 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 3667 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 3668 } 3669 return Result; 3670 } 3671 3672 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 3673 /// to morph to an integer compare sequence. 3674 static bool canChangeToInt(SDValue Op, bool &SeenZero, 3675 const ARMSubtarget *Subtarget) { 3676 SDNode *N = Op.getNode(); 3677 if (!N->hasOneUse()) 3678 // Otherwise it requires moving the value from fp to integer registers. 3679 return false; 3680 if (!N->getNumValues()) 3681 return false; 3682 EVT VT = Op.getValueType(); 3683 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 3684 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 3685 // vmrs are very slow, e.g. cortex-a8. 3686 return false; 3687 3688 if (isFloatingPointZero(Op)) { 3689 SeenZero = true; 3690 return true; 3691 } 3692 return ISD::isNormalLoad(N); 3693 } 3694 3695 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 3696 if (isFloatingPointZero(Op)) 3697 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 3698 3699 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 3700 return DAG.getLoad(MVT::i32, SDLoc(Op), 3701 Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), 3702 Ld->isVolatile(), Ld->isNonTemporal(), 3703 Ld->isInvariant(), Ld->getAlignment()); 3704 3705 llvm_unreachable("Unknown VFP cmp argument!"); 3706 } 3707 3708 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 3709 SDValue &RetVal1, SDValue &RetVal2) { 3710 SDLoc dl(Op); 3711 3712 if (isFloatingPointZero(Op)) { 3713 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 3714 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 3715 return; 3716 } 3717 3718 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 3719 SDValue Ptr = Ld->getBasePtr(); 3720 RetVal1 = DAG.getLoad(MVT::i32, dl, 3721 Ld->getChain(), Ptr, 3722 Ld->getPointerInfo(), 3723 Ld->isVolatile(), Ld->isNonTemporal(), 3724 Ld->isInvariant(), Ld->getAlignment()); 3725 3726 EVT PtrType = Ptr.getValueType(); 3727 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 3728 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 3729 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 3730 RetVal2 = DAG.getLoad(MVT::i32, dl, 3731 Ld->getChain(), NewPtr, 3732 Ld->getPointerInfo().getWithOffset(4), 3733 Ld->isVolatile(), Ld->isNonTemporal(), 3734 Ld->isInvariant(), NewAlign); 3735 return; 3736 } 3737 3738 llvm_unreachable("Unknown VFP cmp argument!"); 3739 } 3740 3741 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 3742 /// f32 and even f64 comparisons to integer ones. 
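/// The idea, as a sketch: when one operand is known to be +/-0.0 and the
/// predicate is an equality, the FP compare can be replaced by integer
/// compares on the raw bits with the sign bit masked off (0x7fffffff), since
/// only +/-0.0 have an all-zero magnitude; f64 operands are split into two
/// i32 words and compared with BCC_i64.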
3743 SDValue 3744 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 3745 SDValue Chain = Op.getOperand(0); 3746 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3747 SDValue LHS = Op.getOperand(2); 3748 SDValue RHS = Op.getOperand(3); 3749 SDValue Dest = Op.getOperand(4); 3750 SDLoc dl(Op); 3751 3752 bool LHSSeenZero = false; 3753 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 3754 bool RHSSeenZero = false; 3755 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 3756 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 3757 // If unsafe fp math optimization is enabled and there are no other uses of 3758 // the CMP operands, and the condition code is EQ or NE, we can optimize it 3759 // to an integer comparison. 3760 if (CC == ISD::SETOEQ) 3761 CC = ISD::SETEQ; 3762 else if (CC == ISD::SETUNE) 3763 CC = ISD::SETNE; 3764 3765 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 3766 SDValue ARMcc; 3767 if (LHS.getValueType() == MVT::f32) { 3768 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 3769 bitcastf32Toi32(LHS, DAG), Mask); 3770 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 3771 bitcastf32Toi32(RHS, DAG), Mask); 3772 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 3773 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3774 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 3775 Chain, Dest, ARMcc, CCR, Cmp); 3776 } 3777 3778 SDValue LHS1, LHS2; 3779 SDValue RHS1, RHS2; 3780 expandf64Toi32(LHS, DAG, LHS1, LHS2); 3781 expandf64Toi32(RHS, DAG, RHS1, RHS2); 3782 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 3783 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 3784 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3785 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3786 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 3787 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 3788 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 3789 } 3790 3791 return SDValue(); 3792 } 3793 3794 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 3795 SDValue Chain = Op.getOperand(0); 3796 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 3797 SDValue LHS = Op.getOperand(2); 3798 SDValue RHS = Op.getOperand(3); 3799 SDValue Dest = Op.getOperand(4); 3800 SDLoc dl(Op); 3801 3802 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 3803 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 3804 dl); 3805 3806 // If softenSetCCOperands only returned one value, we should compare it to 3807 // zero. 
3808 if (!RHS.getNode()) { 3809 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 3810 CC = ISD::SETNE; 3811 } 3812 } 3813 3814 if (LHS.getValueType() == MVT::i32) { 3815 SDValue ARMcc; 3816 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 3817 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3818 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 3819 Chain, Dest, ARMcc, CCR, Cmp); 3820 } 3821 3822 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 3823 3824 if (getTargetMachine().Options.UnsafeFPMath && 3825 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 3826 CC == ISD::SETNE || CC == ISD::SETUNE)) { 3827 SDValue Result = OptimizeVFPBrcond(Op, DAG); 3828 if (Result.getNode()) 3829 return Result; 3830 } 3831 3832 ARMCC::CondCodes CondCode, CondCode2; 3833 FPCCToARMCC(CC, CondCode, CondCode2); 3834 3835 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3836 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 3837 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3838 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 3839 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 3840 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 3841 if (CondCode2 != ARMCC::AL) { 3842 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 3843 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 3844 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 3845 } 3846 return Res; 3847 } 3848 3849 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 3850 SDValue Chain = Op.getOperand(0); 3851 SDValue Table = Op.getOperand(1); 3852 SDValue Index = Op.getOperand(2); 3853 SDLoc dl(Op); 3854 3855 EVT PTy = getPointerTy(DAG.getDataLayout()); 3856 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 3857 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 3858 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 3859 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 3860 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); 3861 if (Subtarget->isThumb2()) { 3862 // Thumb2 uses a two-level jump. That is, it jumps into the jump table 3863 // which does another jump to the destination. This also makes it easier 3864 // to translate it to TBB / TBH later. 3865 // FIXME: This might not work if the function is extremely large. 
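    // (At this point Addr is the jump-table base plus 4 * case index, i.e.
    // the address of this case's entry in the inline table; the entries are
    // themselves branches, which is what later allows TBB/TBH compression.)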
3866 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 3867 Addr, Op.getOperand(2), JTI); 3868 } 3869 if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { 3870 Addr = 3871 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 3872 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), 3873 false, false, false, 0); 3874 Chain = Addr.getValue(1); 3875 Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); 3876 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 3877 } else { 3878 Addr = 3879 DAG.getLoad(PTy, dl, Chain, Addr, 3880 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), 3881 false, false, false, 0); 3882 Chain = Addr.getValue(1); 3883 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 3884 } 3885 } 3886 3887 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 3888 EVT VT = Op.getValueType(); 3889 SDLoc dl(Op); 3890 3891 if (Op.getValueType().getVectorElementType() == MVT::i32) { 3892 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 3893 return Op; 3894 return DAG.UnrollVectorOp(Op.getNode()); 3895 } 3896 3897 assert(Op.getOperand(0).getValueType() == MVT::v4f32 && 3898 "Invalid type for custom lowering!"); 3899 if (VT != MVT::v4i16) 3900 return DAG.UnrollVectorOp(Op.getNode()); 3901 3902 Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); 3903 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 3904 } 3905 3906 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 3907 EVT VT = Op.getValueType(); 3908 if (VT.isVector()) 3909 return LowerVectorFP_TO_INT(Op, DAG); 3910 if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { 3911 RTLIB::Libcall LC; 3912 if (Op.getOpcode() == ISD::FP_TO_SINT) 3913 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 3914 Op.getValueType()); 3915 else 3916 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 3917 Op.getValueType()); 3918 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 3919 /*isSigned*/ false, SDLoc(Op)).first; 3920 } 3921 3922 return Op; 3923 } 3924 3925 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 3926 EVT VT = Op.getValueType(); 3927 SDLoc dl(Op); 3928 3929 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 3930 if (VT.getVectorElementType() == MVT::f32) 3931 return Op; 3932 return DAG.UnrollVectorOp(Op.getNode()); 3933 } 3934 3935 assert(Op.getOperand(0).getValueType() == MVT::v4i16 && 3936 "Invalid type for custom lowering!"); 3937 if (VT != MVT::v4f32) 3938 return DAG.UnrollVectorOp(Op.getNode()); 3939 3940 unsigned CastOpc; 3941 unsigned Opc; 3942 switch (Op.getOpcode()) { 3943 default: llvm_unreachable("Invalid opcode!"); 3944 case ISD::SINT_TO_FP: 3945 CastOpc = ISD::SIGN_EXTEND; 3946 Opc = ISD::SINT_TO_FP; 3947 break; 3948 case ISD::UINT_TO_FP: 3949 CastOpc = ISD::ZERO_EXTEND; 3950 Opc = ISD::UINT_TO_FP; 3951 break; 3952 } 3953 3954 Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); 3955 return DAG.getNode(Opc, dl, VT, Op); 3956 } 3957 3958 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 3959 EVT VT = Op.getValueType(); 3960 if (VT.isVector()) 3961 return LowerVectorINT_TO_FP(Op, DAG); 3962 if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { 3963 RTLIB::Libcall LC; 3964 if (Op.getOpcode() == ISD::SINT_TO_FP) 3965 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 3966 Op.getValueType()); 3967 else 3968 LC = 
RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 3969 Op.getValueType()); 3970 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 3971 /*isSigned*/ false, SDLoc(Op)).first; 3972 } 3973 3974 return Op; 3975 } 3976 3977 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 3978 // Implement fcopysign with a fabs and a conditional fneg. 3979 SDValue Tmp0 = Op.getOperand(0); 3980 SDValue Tmp1 = Op.getOperand(1); 3981 SDLoc dl(Op); 3982 EVT VT = Op.getValueType(); 3983 EVT SrcVT = Tmp1.getValueType(); 3984 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 3985 Tmp0.getOpcode() == ARMISD::VMOVDRR; 3986 bool UseNEON = !InGPR && Subtarget->hasNEON(); 3987 3988 if (UseNEON) { 3989 // Use VBSL to copy the sign bit. 3990 unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); 3991 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 3992 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 3993 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 3994 if (VT == MVT::f64) 3995 Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, 3996 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 3997 DAG.getConstant(32, dl, MVT::i32)); 3998 else /*if (VT == MVT::f32)*/ 3999 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 4000 if (SrcVT == MVT::f32) { 4001 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 4002 if (VT == MVT::f64) 4003 Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4004 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 4005 DAG.getConstant(32, dl, MVT::i32)); 4006 } else if (VT == MVT::f32) 4007 Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, 4008 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 4009 DAG.getConstant(32, dl, MVT::i32)); 4010 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 4011 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 4012 4013 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), 4014 dl, MVT::i32); 4015 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 4016 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 4017 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 4018 4019 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 4020 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 4021 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 4022 if (VT == MVT::f32) { 4023 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 4024 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 4025 DAG.getConstant(0, dl, MVT::i32)); 4026 } else { 4027 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 4028 } 4029 4030 return Res; 4031 } 4032 4033 // Bitcast operand 1 to i32. 4034 if (SrcVT == MVT::f64) 4035 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4036 Tmp1).getValue(1); 4037 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 4038 4039 // Or in the signbit with integer operations. 4040 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 4041 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4042 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 4043 if (VT == MVT::f32) { 4044 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 4045 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 4046 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 4047 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 4048 } 4049 4050 // f64: Or the high part with signbit and then combine two parts. 
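  // (I.e. the scalar path works on raw bits: sign = bits(Tmp1) & 0x80000000,
  // magnitude = bits(Tmp0) & 0x7fffffff, result = magnitude | sign. For f64
  // only the high word carries the sign, so the OR is applied to the high
  // half and the halves are recombined with VMOVDRR.)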
4051 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4052 Tmp0); 4053 SDValue Lo = Tmp0.getValue(0); 4054 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 4055 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 4056 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 4057 } 4058 4059 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 4060 MachineFunction &MF = DAG.getMachineFunction(); 4061 MachineFrameInfo *MFI = MF.getFrameInfo(); 4062 MFI->setReturnAddressIsTaken(true); 4063 4064 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 4065 return SDValue(); 4066 4067 EVT VT = Op.getValueType(); 4068 SDLoc dl(Op); 4069 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4070 if (Depth) { 4071 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4072 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 4073 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 4074 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 4075 MachinePointerInfo(), false, false, false, 0); 4076 } 4077 4078 // Return LR, which contains the return address. Mark it an implicit live-in. 4079 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 4080 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 4081 } 4082 4083 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 4084 const ARMBaseRegisterInfo &ARI = 4085 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 4086 MachineFunction &MF = DAG.getMachineFunction(); 4087 MachineFrameInfo *MFI = MF.getFrameInfo(); 4088 MFI->setFrameAddressIsTaken(true); 4089 4090 EVT VT = Op.getValueType(); 4091 SDLoc dl(Op); // FIXME probably not meaningful 4092 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4093 unsigned FrameReg = ARI.getFrameRegister(MF); 4094 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 4095 while (Depth--) 4096 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 4097 MachinePointerInfo(), 4098 false, false, false, 0); 4099 return FrameAddr; 4100 } 4101 4102 // FIXME? Maybe this could be a TableGen attribute on some registers and 4103 // this table could be generated automatically from RegInfo. 4104 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 4105 SelectionDAG &DAG) const { 4106 unsigned Reg = StringSwitch<unsigned>(RegName) 4107 .Case("sp", ARM::SP) 4108 .Default(0); 4109 if (Reg) 4110 return Reg; 4111 report_fatal_error(Twine("Invalid register name \"" 4112 + StringRef(RegName) + "\".")); 4113 } 4114 4115 // Result is 64 bit value so split into two 32 bit values and return as a 4116 // pair of values. 4117 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 4118 SelectionDAG &DAG) { 4119 SDLoc DL(N); 4120 4121 // This function is only supposed to be called for i64 type destination. 
4122 assert(N->getValueType(0) == MVT::i64 4123 && "ExpandREAD_REGISTER called for non-i64 type result."); 4124 4125 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 4126 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 4127 N->getOperand(0), 4128 N->getOperand(1)); 4129 4130 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 4131 Read.getValue(1))); 4132 Results.push_back(Read.getOperand(0)); 4133 } 4134 4135 /// ExpandBITCAST - If the target supports VFP, this function is called to 4136 /// expand a bit convert where either the source or destination type is i64 to 4137 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 4138 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 4139 /// vectors), since the legalizer won't know what to do with that. 4140 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { 4141 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4142 SDLoc dl(N); 4143 SDValue Op = N->getOperand(0); 4144 4145 // This function is only supposed to be called for i64 types, either as the 4146 // source or destination of the bit convert. 4147 EVT SrcVT = Op.getValueType(); 4148 EVT DstVT = N->getValueType(0); 4149 assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && 4150 "ExpandBITCAST called for non-i64 type"); 4151 4152 // Turn i64->f64 into VMOVDRR. 4153 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 4154 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4155 DAG.getConstant(0, dl, MVT::i32)); 4156 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4157 DAG.getConstant(1, dl, MVT::i32)); 4158 return DAG.getNode(ISD::BITCAST, dl, DstVT, 4159 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 4160 } 4161 4162 // Turn f64->i64 into VMOVRRD. 4163 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 4164 SDValue Cvt; 4165 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 4166 SrcVT.getVectorNumElements() > 1) 4167 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4168 DAG.getVTList(MVT::i32, MVT::i32), 4169 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 4170 else 4171 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4172 DAG.getVTList(MVT::i32, MVT::i32), Op); 4173 // Merge the pieces into a single i64 value. 4174 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 4175 } 4176 4177 return SDValue(); 4178 } 4179 4180 /// getZeroVector - Returns a vector of specified type with all zero elements. 4181 /// Zero vectors are used to represent vector negation and in those cases 4182 /// will be implemented with the NEON VNEG instruction. However, VNEG does 4183 /// not support i64 elements, so sometimes the zero vectors will need to be 4184 /// explicitly constructed. Regardless, use a canonical VMOV to create the 4185 /// zero vector. 4186 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { 4187 assert(VT.isVector() && "Expected a vector type"); 4188 // The canonical modified immediate encoding of a zero vector is....0! 4189 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 4190 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 4191 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 4192 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 4193 } 4194 4195 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4196 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 
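/// A sketch of the expansion for a shift amount amt:
///   if (amt < 32)  Lo = (Lo >> amt) | (Hi << (32 - amt)), Hi = Hi >> amt
///   else           Lo = Hi >> (amt - 32),                 Hi = Hi >> amt
/// where ">>" on Hi is arithmetic for SRA_PARTS and logical for SRL_PARTS,
/// and the amt >= 32 case relies on ARM register shifts by 32 or more
/// producing zero / all sign bits.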
4197 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 4198 SelectionDAG &DAG) const { 4199 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4200 EVT VT = Op.getValueType(); 4201 unsigned VTBits = VT.getSizeInBits(); 4202 SDLoc dl(Op); 4203 SDValue ShOpLo = Op.getOperand(0); 4204 SDValue ShOpHi = Op.getOperand(1); 4205 SDValue ShAmt = Op.getOperand(2); 4206 SDValue ARMcc; 4207 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 4208 4209 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4210 4211 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4212 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4213 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4214 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4215 DAG.getConstant(VTBits, dl, MVT::i32)); 4216 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4217 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4218 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4219 4220 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4221 SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4222 ISD::SETGE, ARMcc, DAG, dl); 4223 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4224 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, 4225 CCR, Cmp); 4226 4227 SDValue Ops[2] = { Lo, Hi }; 4228 return DAG.getMergeValues(Ops, dl); 4229 } 4230 4231 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4232 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 4233 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 4234 SelectionDAG &DAG) const { 4235 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4236 EVT VT = Op.getValueType(); 4237 unsigned VTBits = VT.getSizeInBits(); 4238 SDLoc dl(Op); 4239 SDValue ShOpLo = Op.getOperand(0); 4240 SDValue ShOpHi = Op.getOperand(1); 4241 SDValue ShAmt = Op.getOperand(2); 4242 SDValue ARMcc; 4243 4244 assert(Op.getOpcode() == ISD::SHL_PARTS); 4245 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4246 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4247 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4248 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4249 DAG.getConstant(VTBits, dl, MVT::i32)); 4250 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4251 SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4252 4253 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4254 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4255 SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4256 ISD::SETGE, ARMcc, DAG, dl); 4257 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4258 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, 4259 CCR, Cmp); 4260 4261 SDValue Ops[2] = { Lo, Hi }; 4262 return DAG.getMergeValues(Ops, dl); 4263 } 4264 4265 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 4266 SelectionDAG &DAG) const { 4267 // The rounding mode is in bits 23:22 of the FPSCR. 4268 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 4269 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 4270 // so that the shift + and get folded into a bitfield extract. 
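// For example, an FPSCR rounding-mode field of 2 becomes ((2 + 1) & 3) = 3,
// matching the 2->3 entry of the mapping above.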
4271 SDLoc dl(Op); 4272 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, 4273 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, 4274 MVT::i32)); 4275 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 4276 DAG.getConstant(1U << 22, dl, MVT::i32)); 4277 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 4278 DAG.getConstant(22, dl, MVT::i32)); 4279 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 4280 DAG.getConstant(3, dl, MVT::i32)); 4281 } 4282 4283 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 4284 const ARMSubtarget *ST) { 4285 SDLoc dl(N); 4286 EVT VT = N->getValueType(0); 4287 if (VT.isVector()) { 4288 assert(ST->hasNEON()); 4289 4290 // Compute the least significant set bit: LSB = X & -X 4291 SDValue X = N->getOperand(0); 4292 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 4293 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 4294 4295 EVT ElemTy = VT.getVectorElementType(); 4296 4297 if (ElemTy == MVT::i8) { 4298 // Compute with: cttz(x) = ctpop(lsb - 1) 4299 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4300 DAG.getTargetConstant(1, dl, ElemTy)); 4301 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4302 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 4303 } 4304 4305 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 4306 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 4307 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 4308 unsigned NumBits = ElemTy.getSizeInBits(); 4309 SDValue WidthMinus1 = 4310 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4311 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 4312 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 4313 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 4314 } 4315 4316 // Compute with: cttz(x) = ctpop(lsb - 1) 4317 4318 // Since we can only compute the number of bits in a byte with vcnt.8, we 4319 // have to gather the result with pairwise addition (vpaddl) for i16, i32, 4320 // and i64. 4321 4322 // Compute LSB - 1. 4323 SDValue Bits; 4324 if (ElemTy == MVT::i64) { 4325 // Load constant 0xffff'ffff'ffff'ffff to register. 4326 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4327 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 4328 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 4329 } else { 4330 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4331 DAG.getTargetConstant(1, dl, ElemTy)); 4332 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4333 } 4334 4335 // Count #bits with vcnt.8. 4336 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4337 SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); 4338 SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); 4339 4340 // Gather the #bits with vpaddl (pairwise add.) 4341 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 4342 SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, 4343 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4344 Cnt8); 4345 if (ElemTy == MVT::i16) 4346 return Cnt16; 4347 4348 EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; 4349 SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, 4350 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4351 Cnt16); 4352 if (ElemTy == MVT::i32) 4353 return Cnt32; 4354 4355 assert(ElemTy == MVT::i64); 4356 SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4357 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4358 Cnt32); 4359 return Cnt64; 4360 } 4361 4362 if (!ST->hasV6T2Ops()) 4363 return SDValue(); 4364 4365 SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); 4366 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 4367 } 4368 4369 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count 4370 /// for each 16-bit element from operand, repeated. The basic idea is to 4371 /// leverage vcnt to get the 8-bit counts, gather and add the results. 4372 /// 4373 /// Trace for v4i16: 4374 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4375 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) 4376 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) 4377 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 4378 /// [b0 b1 b2 b3 b4 b5 b6 b7] 4379 /// +[b1 b0 b3 b2 b5 b4 b7 b6] 4380 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, 4381 /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) 4382 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { 4383 EVT VT = N->getValueType(0); 4384 SDLoc DL(N); 4385 4386 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4387 SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); 4388 SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); 4389 SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); 4390 SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); 4391 return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); 4392 } 4393 4394 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the 4395 /// bit-count for each 16-bit element from the operand. We need slightly 4396 /// different sequencing for v4i16 and v8i16 to stay within NEON's available 4397 /// 64/128-bit registers. 4398 /// 4399 /// Trace for v4i16: 4400 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4401 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) 4402 /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] 4403 /// v4i16:Extracted = [k0 k1 k2 k3 ] 4404 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { 4405 EVT VT = N->getValueType(0); 4406 SDLoc DL(N); 4407 4408 SDValue BitCounts = getCTPOP16BitCounts(N, DAG); 4409 if (VT.is64BitVector()) { 4410 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); 4411 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, 4412 DAG.getIntPtrConstant(0, DL)); 4413 } else { 4414 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, 4415 BitCounts, DAG.getIntPtrConstant(0, DL)); 4416 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); 4417 } 4418 } 4419 4420 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the 4421 /// bit-count for each 32-bit element from the operand. The idea here is 4422 /// to split the vector into 16-bit elements, leverage the 16-bit count 4423 /// routine, and then combine the results. 
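/// The 16-bit counts of adjacent halves are summed with a VREV32 + VADD; a
/// VUZP followed by a zero-extend/extract then keeps one copy of each sum as
/// the final 32-bit element.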
4424 /// 4425 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): 4426 /// input = [v0 v1 ] (vi: 32-bit elements) 4427 /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) 4428 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) 4429 /// vrev: N0 = [k1 k0 k3 k2 ] 4430 /// [k0 k1 k2 k3 ] 4431 /// N1 =+[k1 k0 k3 k2 ] 4432 /// [k0 k2 k1 k3 ] 4433 /// N2 =+[k1 k3 k0 k2 ] 4434 /// [k0 k2 k1 k3 ] 4435 /// Extended =+[k1 k3 k0 k2 ] 4436 /// [k0 k2 ] 4437 /// Extracted=+[k1 k3 ] 4438 /// 4439 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { 4440 EVT VT = N->getValueType(0); 4441 SDLoc DL(N); 4442 4443 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 4444 4445 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); 4446 SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); 4447 SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); 4448 SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); 4449 SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); 4450 4451 if (VT.is64BitVector()) { 4452 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); 4453 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, 4454 DAG.getIntPtrConstant(0, DL)); 4455 } else { 4456 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, 4457 DAG.getIntPtrConstant(0, DL)); 4458 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); 4459 } 4460 } 4461 4462 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 4463 const ARMSubtarget *ST) { 4464 EVT VT = N->getValueType(0); 4465 4466 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 4467 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || 4468 VT == MVT::v4i16 || VT == MVT::v8i16) && 4469 "Unexpected type for custom ctpop lowering"); 4470 4471 if (VT.getVectorElementType() == MVT::i32) 4472 return lowerCTPOP32BitElements(N, DAG); 4473 else 4474 return lowerCTPOP16BitElements(N, DAG); 4475 } 4476 4477 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 4478 const ARMSubtarget *ST) { 4479 EVT VT = N->getValueType(0); 4480 SDLoc dl(N); 4481 4482 if (!VT.isVector()) 4483 return SDValue(); 4484 4485 // Lower vector shifts on NEON to use VSHL. 4486 assert(ST->hasNEON() && "unexpected vector shift"); 4487 4488 // Left shifts translate directly to the vshiftu intrinsic. 4489 if (N->getOpcode() == ISD::SHL) 4490 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4491 DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, 4492 MVT::i32), 4493 N->getOperand(0), N->getOperand(1)); 4494 4495 assert((N->getOpcode() == ISD::SRA || 4496 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); 4497 4498 // NEON uses the same intrinsics for both left and right shifts. For 4499 // right shifts, the shift amounts are negative, so negate the vector of 4500 // shift amounts. 4501 EVT ShiftVT = N->getOperand(1).getValueType(); 4502 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, 4503 getZeroVector(ShiftVT, DAG, dl), 4504 N->getOperand(1)); 4505 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? 
4506 Intrinsic::arm_neon_vshifts : 4507 Intrinsic::arm_neon_vshiftu); 4508 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4509 DAG.getConstant(vshiftInt, dl, MVT::i32), 4510 N->getOperand(0), NegatedCount); 4511 } 4512 4513 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 4514 const ARMSubtarget *ST) { 4515 EVT VT = N->getValueType(0); 4516 SDLoc dl(N); 4517 4518 // We can get here for a node like i32 = ISD::SHL i32, i64 4519 if (VT != MVT::i64) 4520 return SDValue(); 4521 4522 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 4523 "Unknown shift to lower!"); 4524 4525 // We only lower SRA, SRL of 1 here, all others use generic lowering. 4526 if (!isa<ConstantSDNode>(N->getOperand(1)) || 4527 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) 4528 return SDValue(); 4529 4530 // If we are in thumb mode, we don't have RRX. 4531 if (ST->isThumb1Only()) return SDValue(); 4532 4533 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 4534 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 4535 DAG.getConstant(0, dl, MVT::i32)); 4536 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 4537 DAG.getConstant(1, dl, MVT::i32)); 4538 4539 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 4540 // captures the result into a carry flag. 4541 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 4542 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 4543 4544 // The low part is an ARMISD::RRX operand, which shifts the carry in. 4545 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 4546 4547 // Merge the pieces into a single i64 value. 4548 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 4549 } 4550 4551 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 4552 SDValue TmpOp0, TmpOp1; 4553 bool Invert = false; 4554 bool Swap = false; 4555 unsigned Opc = 0; 4556 4557 SDValue Op0 = Op.getOperand(0); 4558 SDValue Op1 = Op.getOperand(1); 4559 SDValue CC = Op.getOperand(2); 4560 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 4561 EVT VT = Op.getValueType(); 4562 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 4563 SDLoc dl(Op); 4564 4565 if (CmpVT.getVectorElementType() == MVT::i64) 4566 // 64-bit comparisons are not legal. We've marked SETCC as non-Custom, 4567 // but it's possible that our operands are 64-bit but our result is 32-bit. 4568 // Bail in this case. 4569 return SDValue(); 4570 4571 if (Op1.getValueType().isFloatingPoint()) { 4572 switch (SetCCOpcode) { 4573 default: llvm_unreachable("Illegal FP comparison"); 4574 case ISD::SETUNE: 4575 case ISD::SETNE: Invert = true; // Fallthrough 4576 case ISD::SETOEQ: 4577 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 4578 case ISD::SETOLT: 4579 case ISD::SETLT: Swap = true; // Fallthrough 4580 case ISD::SETOGT: 4581 case ISD::SETGT: Opc = ARMISD::VCGT; break; 4582 case ISD::SETOLE: 4583 case ISD::SETLE: Swap = true; // Fallthrough 4584 case ISD::SETOGE: 4585 case ISD::SETGE: Opc = ARMISD::VCGE; break; 4586 case ISD::SETUGE: Swap = true; // Fallthrough 4587 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 4588 case ISD::SETUGT: Swap = true; // Fallthrough 4589 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 4590 case ISD::SETUEQ: Invert = true; // Fallthrough 4591 case ISD::SETONE: 4592 // Expand this to (OLT | OGT). 
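// SETONE is ordered-and-unequal, which holds exactly when one operand
// compares strictly greater than the other, hence the OR of two VCGTs.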
4593 TmpOp0 = Op0; 4594 TmpOp1 = Op1; 4595 Opc = ISD::OR; 4596 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 4597 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 4598 break; 4599 case ISD::SETUO: Invert = true; // Fallthrough 4600 case ISD::SETO: 4601 // Expand this to (OLT | OGE). 4602 TmpOp0 = Op0; 4603 TmpOp1 = Op1; 4604 Opc = ISD::OR; 4605 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 4606 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 4607 break; 4608 } 4609 } else { 4610 // Integer comparisons. 4611 switch (SetCCOpcode) { 4612 default: llvm_unreachable("Illegal integer comparison"); 4613 case ISD::SETNE: Invert = true; 4614 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 4615 case ISD::SETLT: Swap = true; 4616 case ISD::SETGT: Opc = ARMISD::VCGT; break; 4617 case ISD::SETLE: Swap = true; 4618 case ISD::SETGE: Opc = ARMISD::VCGE; break; 4619 case ISD::SETULT: Swap = true; 4620 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 4621 case ISD::SETULE: Swap = true; 4622 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 4623 } 4624 4625 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 4626 if (Opc == ARMISD::VCEQ) { 4627 4628 SDValue AndOp; 4629 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 4630 AndOp = Op0; 4631 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 4632 AndOp = Op1; 4633 4634 // Ignore bitconvert. 4635 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 4636 AndOp = AndOp.getOperand(0); 4637 4638 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 4639 Opc = ARMISD::VTST; 4640 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 4641 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 4642 Invert = !Invert; 4643 } 4644 } 4645 } 4646 4647 if (Swap) 4648 std::swap(Op0, Op1); 4649 4650 // If one of the operands is a constant vector zero, attempt to fold the 4651 // comparison to a specialized compare-against-zero form. 4652 SDValue SingleOp; 4653 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 4654 SingleOp = Op0; 4655 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 4656 if (Opc == ARMISD::VCGE) 4657 Opc = ARMISD::VCLEZ; 4658 else if (Opc == ARMISD::VCGT) 4659 Opc = ARMISD::VCLTZ; 4660 SingleOp = Op1; 4661 } 4662 4663 SDValue Result; 4664 if (SingleOp.getNode()) { 4665 switch (Opc) { 4666 case ARMISD::VCEQ: 4667 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 4668 case ARMISD::VCGE: 4669 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 4670 case ARMISD::VCLEZ: 4671 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 4672 case ARMISD::VCGT: 4673 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 4674 case ARMISD::VCLTZ: 4675 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 4676 default: 4677 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 4678 } 4679 } else { 4680 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 4681 } 4682 4683 Result = DAG.getSExtOrTrunc(Result, dl, VT); 4684 4685 if (Invert) 4686 Result = DAG.getNOT(dl, Result, VT); 4687 4688 return Result; 4689 } 4690 4691 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 4692 /// valid vector constant for a NEON instruction with a "modified immediate" 4693 /// operand (e.g., VMOV). If so, return the encoded value. 
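/// On success the Op/Cmode bits and the 8-bit immediate are returned packed in
/// the form expected by ARM_AM::createNEONModImm, and VT is set to the vector
/// type the immediate should be interpreted with.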
4694 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 4695 unsigned SplatBitSize, SelectionDAG &DAG, 4696 SDLoc dl, EVT &VT, bool is128Bits, 4697 NEONModImmType type) { 4698 unsigned OpCmode, Imm; 4699 4700 // SplatBitSize is set to the smallest size that splats the vector, so a 4701 // zero vector will always have SplatBitSize == 8. However, NEON modified 4702 // immediate instructions others than VMOV do not support the 8-bit encoding 4703 // of a zero vector, and the default encoding of zero is supposed to be the 4704 // 32-bit version. 4705 if (SplatBits == 0) 4706 SplatBitSize = 32; 4707 4708 switch (SplatBitSize) { 4709 case 8: 4710 if (type != VMOVModImm) 4711 return SDValue(); 4712 // Any 1-byte value is OK. Op=0, Cmode=1110. 4713 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 4714 OpCmode = 0xe; 4715 Imm = SplatBits; 4716 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 4717 break; 4718 4719 case 16: 4720 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 4721 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 4722 if ((SplatBits & ~0xff) == 0) { 4723 // Value = 0x00nn: Op=x, Cmode=100x. 4724 OpCmode = 0x8; 4725 Imm = SplatBits; 4726 break; 4727 } 4728 if ((SplatBits & ~0xff00) == 0) { 4729 // Value = 0xnn00: Op=x, Cmode=101x. 4730 OpCmode = 0xa; 4731 Imm = SplatBits >> 8; 4732 break; 4733 } 4734 return SDValue(); 4735 4736 case 32: 4737 // NEON's 32-bit VMOV supports splat values where: 4738 // * only one byte is nonzero, or 4739 // * the least significant byte is 0xff and the second byte is nonzero, or 4740 // * the least significant 2 bytes are 0xff and the third is nonzero. 4741 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 4742 if ((SplatBits & ~0xff) == 0) { 4743 // Value = 0x000000nn: Op=x, Cmode=000x. 4744 OpCmode = 0; 4745 Imm = SplatBits; 4746 break; 4747 } 4748 if ((SplatBits & ~0xff00) == 0) { 4749 // Value = 0x0000nn00: Op=x, Cmode=001x. 4750 OpCmode = 0x2; 4751 Imm = SplatBits >> 8; 4752 break; 4753 } 4754 if ((SplatBits & ~0xff0000) == 0) { 4755 // Value = 0x00nn0000: Op=x, Cmode=010x. 4756 OpCmode = 0x4; 4757 Imm = SplatBits >> 16; 4758 break; 4759 } 4760 if ((SplatBits & ~0xff000000) == 0) { 4761 // Value = 0xnn000000: Op=x, Cmode=011x. 4762 OpCmode = 0x6; 4763 Imm = SplatBits >> 24; 4764 break; 4765 } 4766 4767 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 4768 if (type == OtherModImm) return SDValue(); 4769 4770 if ((SplatBits & ~0xffff) == 0 && 4771 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 4772 // Value = 0x0000nnff: Op=x, Cmode=1100. 4773 OpCmode = 0xc; 4774 Imm = SplatBits >> 8; 4775 break; 4776 } 4777 4778 if ((SplatBits & ~0xffffff) == 0 && 4779 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 4780 // Value = 0x00nnffff: Op=x, Cmode=1101. 4781 OpCmode = 0xd; 4782 Imm = SplatBits >> 16; 4783 break; 4784 } 4785 4786 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 4787 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 4788 // VMOV.I32. A (very) minor optimization would be to replicate the value 4789 // and fall through here to test for a valid 64-bit splat. But, then the 4790 // caller would also need to check and handle the change in size. 4791 return SDValue(); 4792 4793 case 64: { 4794 if (type != VMOVModImm) 4795 return SDValue(); 4796 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 
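// Build the 8-bit immediate a byte at a time: bit N is set when byte N of the
// splat is all ones (undef bits count as ones); a byte with some but not all
// bits set cannot be encoded, so bail out.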
4797 uint64_t BitMask = 0xff; 4798 uint64_t Val = 0; 4799 unsigned ImmMask = 1; 4800 Imm = 0; 4801 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 4802 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 4803 Val |= BitMask; 4804 Imm |= ImmMask; 4805 } else if ((SplatBits & BitMask) != 0) { 4806 return SDValue(); 4807 } 4808 BitMask <<= 8; 4809 ImmMask <<= 1; 4810 } 4811 4812 if (DAG.getDataLayout().isBigEndian()) 4813 // swap higher and lower 32 bit word 4814 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 4815 4816 // Op=1, Cmode=1110. 4817 OpCmode = 0x1e; 4818 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 4819 break; 4820 } 4821 4822 default: 4823 llvm_unreachable("unexpected size for isNEONModifiedImm"); 4824 } 4825 4826 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 4827 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 4828 } 4829 4830 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 4831 const ARMSubtarget *ST) const { 4832 if (!ST->hasVFP3()) 4833 return SDValue(); 4834 4835 bool IsDouble = Op.getValueType() == MVT::f64; 4836 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 4837 4838 // Use the default (constant pool) lowering for double constants when we have 4839 // an SP-only FPU 4840 if (IsDouble && Subtarget->isFPOnlySP()) 4841 return SDValue(); 4842 4843 // Try splatting with a VMOV.f32... 4844 APFloat FPVal = CFP->getValueAPF(); 4845 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 4846 4847 if (ImmVal != -1) { 4848 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 4849 // We have code in place to select a valid ConstantFP already, no need to 4850 // do any mangling. 4851 return Op; 4852 } 4853 4854 // It's a float and we are trying to use NEON operations where 4855 // possible. Lower it to a splat followed by an extract. 4856 SDLoc DL(Op); 4857 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 4858 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 4859 NewVal); 4860 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 4861 DAG.getConstant(0, DL, MVT::i32)); 4862 } 4863 4864 // The rest of our options are NEON only, make sure that's allowed before 4865 // proceeding.. 4866 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 4867 return SDValue(); 4868 4869 EVT VMovVT; 4870 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 4871 4872 // It wouldn't really be worth bothering for doubles except for one very 4873 // important value, which does happen to match: 0.0. So make sure we don't do 4874 // anything stupid. 4875 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 4876 return SDValue(); 4877 4878 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 4879 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 4880 VMovVT, false, VMOVModImm); 4881 if (NewVal != SDValue()) { 4882 SDLoc DL(Op); 4883 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 4884 NewVal); 4885 if (IsDouble) 4886 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 4887 4888 // It's a float: cast and extract a vector element. 
4889 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 4890 VecConstant); 4891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 4892 DAG.getConstant(0, DL, MVT::i32)); 4893 } 4894 4895 // Finally, try a VMVN.i32 4896 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 4897 false, VMVNModImm); 4898 if (NewVal != SDValue()) { 4899 SDLoc DL(Op); 4900 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 4901 4902 if (IsDouble) 4903 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 4904 4905 // It's a float: cast and extract a vector element. 4906 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 4907 VecConstant); 4908 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 4909 DAG.getConstant(0, DL, MVT::i32)); 4910 } 4911 4912 return SDValue(); 4913 } 4914 4915 // check if an VEXT instruction can handle the shuffle mask when the 4916 // vector sources of the shuffle are the same. 4917 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 4918 unsigned NumElts = VT.getVectorNumElements(); 4919 4920 // Assume that the first shuffle index is not UNDEF. Fail if it is. 4921 if (M[0] < 0) 4922 return false; 4923 4924 Imm = M[0]; 4925 4926 // If this is a VEXT shuffle, the immediate value is the index of the first 4927 // element. The other shuffle indices must be the successive elements after 4928 // the first one. 4929 unsigned ExpectedElt = Imm; 4930 for (unsigned i = 1; i < NumElts; ++i) { 4931 // Increment the expected index. If it wraps around, just follow it 4932 // back to index zero and keep going. 4933 ++ExpectedElt; 4934 if (ExpectedElt == NumElts) 4935 ExpectedElt = 0; 4936 4937 if (M[i] < 0) continue; // ignore UNDEF indices 4938 if (ExpectedElt != static_cast<unsigned>(M[i])) 4939 return false; 4940 } 4941 4942 return true; 4943 } 4944 4945 4946 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 4947 bool &ReverseVEXT, unsigned &Imm) { 4948 unsigned NumElts = VT.getVectorNumElements(); 4949 ReverseVEXT = false; 4950 4951 // Assume that the first shuffle index is not UNDEF. Fail if it is. 4952 if (M[0] < 0) 4953 return false; 4954 4955 Imm = M[0]; 4956 4957 // If this is a VEXT shuffle, the immediate value is the index of the first 4958 // element. The other shuffle indices must be the successive elements after 4959 // the first one. 4960 unsigned ExpectedElt = Imm; 4961 for (unsigned i = 1; i < NumElts; ++i) { 4962 // Increment the expected index. If it wraps around, it may still be 4963 // a VEXT but the source vectors must be swapped. 4964 ExpectedElt += 1; 4965 if (ExpectedElt == NumElts * 2) { 4966 ExpectedElt = 0; 4967 ReverseVEXT = true; 4968 } 4969 4970 if (M[i] < 0) continue; // ignore UNDEF indices 4971 if (ExpectedElt != static_cast<unsigned>(M[i])) 4972 return false; 4973 } 4974 4975 // Adjust the index value if the source operands will be swapped. 4976 if (ReverseVEXT) 4977 Imm -= NumElts; 4978 4979 return true; 4980 } 4981 4982 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 4983 /// instruction with the specified blocksize. (The order of the elements 4984 /// within each block of the vector is reversed.) 
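/// For example, the v8i8 mask for a VREV32 is <3, 2, 1, 0, 7, 6, 5, 4>.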
4985 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4986 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
4987 "Only possible block sizes for VREV are: 16, 32, 64");
4988
4989 unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4990 if (EltSz == 64)
4991 return false;
4992
4993 unsigned NumElts = VT.getVectorNumElements();
4994 unsigned BlockElts = M[0] + 1;
4995 // If the first shuffle index is UNDEF, be optimistic.
4996 if (M[0] < 0)
4997 BlockElts = BlockSize / EltSz;
4998
4999 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5000 return false;
5001
5002 for (unsigned i = 0; i < NumElts; ++i) {
5003 if (M[i] < 0) continue; // ignore UNDEF indices
5004 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5005 return false;
5006 }
5007
5008 return true;
5009 }
5010
5011 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5012 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5013 // range, then 0 is placed into the resulting vector. So pretty much any mask
5014 // of 8 elements can work here.
5015 return VT == MVT::v8i8 && M.size() == 8;
5016 }
5017
5018 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5019 // checking that pairs of elements in the shuffle mask represent the same index
5020 // in each vector, incrementing the expected index by 2 at each step.
5021 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5022 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5023 // v2={e,f,g,h}
5024 // WhichResult gives the offset for each element in the mask based on which
5025 // of the two results it belongs to.
5026 //
5027 // The transpose can be represented either as:
5028 // result1 = shufflevector v1, v2, result1_shuffle_mask
5029 // result2 = shufflevector v1, v2, result2_shuffle_mask
5030 // where v1/v2 and the shuffle masks have the same number of elements
5031 // (here WhichResult (see below) indicates which result is being checked)
5032 //
5033 // or as:
5034 // results = shufflevector v1, v2, shuffle_mask
5035 // where both results are returned in one vector and the shuffle mask has twice
5036 // as many elements as v1/v2 (in this form a successful match always sets
5037 // WhichResult to 0). Here we check the low half and the high half of the
5038 // shuffle mask as if each were a mask of the first form.
5039 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5040 unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5041 if (EltSz == 64)
5042 return false;
5043
5044 unsigned NumElts = VT.getVectorNumElements();
5045 if (M.size() != NumElts && M.size() != NumElts*2)
5046 return false;
5047
5048 // If the mask is twice as long as the input vector then we need to check the
5049 // upper and lower parts of the mask with a matching value for WhichResult.
5050 // FIXME: A mask with only even values will be rejected in case the first
5051 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
5052 // M[0] is used to determine WhichResult.
5053 for (unsigned i = 0; i < M.size(); i += NumElts) {
5054 if (M.size() == NumElts * 2)
5055 WhichResult = i / NumElts;
5056 else
5057 WhichResult = M[i] == 0 ?
0 : 1; 5058 for (unsigned j = 0; j < NumElts; j += 2) { 5059 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5060 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 5061 return false; 5062 } 5063 } 5064 5065 if (M.size() == NumElts*2) 5066 WhichResult = 0; 5067 5068 return true; 5069 } 5070 5071 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 5072 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5073 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5074 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5075 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5076 if (EltSz == 64) 5077 return false; 5078 5079 unsigned NumElts = VT.getVectorNumElements(); 5080 if (M.size() != NumElts && M.size() != NumElts*2) 5081 return false; 5082 5083 for (unsigned i = 0; i < M.size(); i += NumElts) { 5084 if (M.size() == NumElts * 2) 5085 WhichResult = i / NumElts; 5086 else 5087 WhichResult = M[i] == 0 ? 0 : 1; 5088 for (unsigned j = 0; j < NumElts; j += 2) { 5089 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5090 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 5091 return false; 5092 } 5093 } 5094 5095 if (M.size() == NumElts*2) 5096 WhichResult = 0; 5097 5098 return true; 5099 } 5100 5101 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 5102 // that the mask elements are either all even and in steps of size 2 or all odd 5103 // and in steps of size 2. 5104 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 5105 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 5106 // v2={e,f,g,h} 5107 // Requires similar checks to that of isVTRNMask with 5108 // respect the how results are returned. 5109 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5110 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5111 if (EltSz == 64) 5112 return false; 5113 5114 unsigned NumElts = VT.getVectorNumElements(); 5115 if (M.size() != NumElts && M.size() != NumElts*2) 5116 return false; 5117 5118 for (unsigned i = 0; i < M.size(); i += NumElts) { 5119 WhichResult = M[i] == 0 ? 0 : 1; 5120 for (unsigned j = 0; j < NumElts; ++j) { 5121 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 5122 return false; 5123 } 5124 } 5125 5126 if (M.size() == NumElts*2) 5127 WhichResult = 0; 5128 5129 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5130 if (VT.is64BitVector() && EltSz == 32) 5131 return false; 5132 5133 return true; 5134 } 5135 5136 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 5137 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5138 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5139 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5140 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5141 if (EltSz == 64) 5142 return false; 5143 5144 unsigned NumElts = VT.getVectorNumElements(); 5145 if (M.size() != NumElts && M.size() != NumElts*2) 5146 return false; 5147 5148 unsigned Half = NumElts / 2; 5149 for (unsigned i = 0; i < M.size(); i += NumElts) { 5150 WhichResult = M[i] == 0 ? 
0 : 1; 5151 for (unsigned j = 0; j < NumElts; j += Half) { 5152 unsigned Idx = WhichResult; 5153 for (unsigned k = 0; k < Half; ++k) { 5154 int MIdx = M[i + j + k]; 5155 if (MIdx >= 0 && (unsigned) MIdx != Idx) 5156 return false; 5157 Idx += 2; 5158 } 5159 } 5160 } 5161 5162 if (M.size() == NumElts*2) 5163 WhichResult = 0; 5164 5165 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5166 if (VT.is64BitVector() && EltSz == 32) 5167 return false; 5168 5169 return true; 5170 } 5171 5172 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 5173 // that pairs of elements of the shufflemask represent the same index in each 5174 // vector incrementing sequentially through the vectors. 5175 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 5176 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 5177 // v2={e,f,g,h} 5178 // Requires similar checks to that of isVTRNMask with respect the how results 5179 // are returned. 5180 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5181 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5182 if (EltSz == 64) 5183 return false; 5184 5185 unsigned NumElts = VT.getVectorNumElements(); 5186 if (M.size() != NumElts && M.size() != NumElts*2) 5187 return false; 5188 5189 for (unsigned i = 0; i < M.size(); i += NumElts) { 5190 WhichResult = M[i] == 0 ? 0 : 1; 5191 unsigned Idx = WhichResult * NumElts / 2; 5192 for (unsigned j = 0; j < NumElts; j += 2) { 5193 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5194 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 5195 return false; 5196 Idx += 1; 5197 } 5198 } 5199 5200 if (M.size() == NumElts*2) 5201 WhichResult = 0; 5202 5203 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5204 if (VT.is64BitVector() && EltSz == 32) 5205 return false; 5206 5207 return true; 5208 } 5209 5210 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 5211 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5212 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 5213 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5214 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 5215 if (EltSz == 64) 5216 return false; 5217 5218 unsigned NumElts = VT.getVectorNumElements(); 5219 if (M.size() != NumElts && M.size() != NumElts*2) 5220 return false; 5221 5222 for (unsigned i = 0; i < M.size(); i += NumElts) { 5223 WhichResult = M[i] == 0 ? 0 : 1; 5224 unsigned Idx = WhichResult * NumElts / 2; 5225 for (unsigned j = 0; j < NumElts; j += 2) { 5226 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5227 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 5228 return false; 5229 Idx += 1; 5230 } 5231 } 5232 5233 if (M.size() == NumElts*2) 5234 WhichResult = 0; 5235 5236 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5237 if (VT.is64BitVector() && EltSz == 32) 5238 return false; 5239 5240 return true; 5241 } 5242 5243 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 5244 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 
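/// WhichResult reports which of the instruction's two results the mask
/// selects, and isV_UNDEF is set when the mask only matches the
/// "shuffle(v, undef)" forms checked below.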
5245 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 5246 unsigned &WhichResult, 5247 bool &isV_UNDEF) { 5248 isV_UNDEF = false; 5249 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 5250 return ARMISD::VTRN; 5251 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 5252 return ARMISD::VUZP; 5253 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 5254 return ARMISD::VZIP; 5255 5256 isV_UNDEF = true; 5257 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5258 return ARMISD::VTRN; 5259 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5260 return ARMISD::VUZP; 5261 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5262 return ARMISD::VZIP; 5263 5264 return 0; 5265 } 5266 5267 /// \return true if this is a reverse operation on an vector. 5268 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 5269 unsigned NumElts = VT.getVectorNumElements(); 5270 // Make sure the mask has the right size. 5271 if (NumElts != M.size()) 5272 return false; 5273 5274 // Look for <15, ..., 3, -1, 1, 0>. 5275 for (unsigned i = 0; i != NumElts; ++i) 5276 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 5277 return false; 5278 5279 return true; 5280 } 5281 5282 // If N is an integer constant that can be moved into a register in one 5283 // instruction, return an SDValue of such a constant (will become a MOV 5284 // instruction). Otherwise return null. 5285 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 5286 const ARMSubtarget *ST, SDLoc dl) { 5287 uint64_t Val; 5288 if (!isa<ConstantSDNode>(N)) 5289 return SDValue(); 5290 Val = cast<ConstantSDNode>(N)->getZExtValue(); 5291 5292 if (ST->isThumb1Only()) { 5293 if (Val <= 255 || ~Val <= 255) 5294 return DAG.getConstant(Val, dl, MVT::i32); 5295 } else { 5296 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 5297 return DAG.getConstant(Val, dl, MVT::i32); 5298 } 5299 return SDValue(); 5300 } 5301 5302 // If this is a case we can't handle, return null and let the default 5303 // expansion code take care of it. 5304 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 5305 const ARMSubtarget *ST) const { 5306 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 5307 SDLoc dl(Op); 5308 EVT VT = Op.getValueType(); 5309 5310 APInt SplatBits, SplatUndef; 5311 unsigned SplatBitSize; 5312 bool HasAnyUndefs; 5313 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5314 if (SplatBitSize <= 64) { 5315 // Check if an immediate VMOV works. 5316 EVT VmovVT; 5317 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 5318 SplatUndef.getZExtValue(), SplatBitSize, 5319 DAG, dl, VmovVT, VT.is128BitVector(), 5320 VMOVModImm); 5321 if (Val.getNode()) { 5322 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 5323 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5324 } 5325 5326 // Try an immediate VMVN. 5327 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 5328 Val = isNEONModifiedImm(NegatedImm, 5329 SplatUndef.getZExtValue(), SplatBitSize, 5330 DAG, dl, VmovVT, VT.is128BitVector(), 5331 VMVNModImm); 5332 if (Val.getNode()) { 5333 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 5334 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5335 } 5336 5337 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 
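// Only splat values that ARM_AM::getFP32Imm can encode as a VMOV.f32
// immediate qualify here.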
5338 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 5339 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 5340 if (ImmVal != -1) { 5341 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 5342 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 5343 } 5344 } 5345 } 5346 } 5347 5348 // Scan through the operands to see if only one value is used. 5349 // 5350 // As an optimisation, even if more than one value is used it may be more 5351 // profitable to splat with one value then change some lanes. 5352 // 5353 // Heuristically we decide to do this if the vector has a "dominant" value, 5354 // defined as splatted to more than half of the lanes. 5355 unsigned NumElts = VT.getVectorNumElements(); 5356 bool isOnlyLowElement = true; 5357 bool usesOnlyOneValue = true; 5358 bool hasDominantValue = false; 5359 bool isConstant = true; 5360 5361 // Map of the number of times a particular SDValue appears in the 5362 // element list. 5363 DenseMap<SDValue, unsigned> ValueCounts; 5364 SDValue Value; 5365 for (unsigned i = 0; i < NumElts; ++i) { 5366 SDValue V = Op.getOperand(i); 5367 if (V.getOpcode() == ISD::UNDEF) 5368 continue; 5369 if (i > 0) 5370 isOnlyLowElement = false; 5371 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 5372 isConstant = false; 5373 5374 ValueCounts.insert(std::make_pair(V, 0)); 5375 unsigned &Count = ValueCounts[V]; 5376 5377 // Is this value dominant? (takes up more than half of the lanes) 5378 if (++Count > (NumElts / 2)) { 5379 hasDominantValue = true; 5380 Value = V; 5381 } 5382 } 5383 if (ValueCounts.size() != 1) 5384 usesOnlyOneValue = false; 5385 if (!Value.getNode() && ValueCounts.size() > 0) 5386 Value = ValueCounts.begin()->first; 5387 5388 if (ValueCounts.size() == 0) 5389 return DAG.getUNDEF(VT); 5390 5391 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 5392 // Keep going if we are hitting this case. 5393 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 5394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 5395 5396 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 5397 5398 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 5399 // i32 and try again. 5400 if (hasDominantValue && EltSize <= 32) { 5401 if (!isConstant) { 5402 SDValue N; 5403 5404 // If we are VDUPing a value that comes directly from a vector, that will 5405 // cause an unnecessary move to and from a GPR, where instead we could 5406 // just use VDUPLANE. We can only do this if the lane being extracted 5407 // is at a constant index, as the VDUP from lane instructions only have 5408 // constant-index forms. 5409 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 5410 isa<ConstantSDNode>(Value->getOperand(1))) { 5411 // We need to create a new undef vector to use for the VDUPLANE if the 5412 // size of the vector from which we get the value is different than the 5413 // size of the vector that we need to create. We will insert the element 5414 // such that the register coalescer will remove unnecessary copies. 
5415 if (VT != Value->getOperand(0).getValueType()) { 5416 ConstantSDNode *constIndex; 5417 constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)); 5418 assert(constIndex && "The index is not a constant!"); 5419 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 5420 VT.getVectorNumElements(); 5421 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 5422 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 5423 Value, DAG.getConstant(index, dl, MVT::i32)), 5424 DAG.getConstant(index, dl, MVT::i32)); 5425 } else 5426 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 5427 Value->getOperand(0), Value->getOperand(1)); 5428 } else 5429 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 5430 5431 if (!usesOnlyOneValue) { 5432 // The dominant value was splatted as 'N', but we now have to insert 5433 // all differing elements. 5434 for (unsigned I = 0; I < NumElts; ++I) { 5435 if (Op.getOperand(I) == Value) 5436 continue; 5437 SmallVector<SDValue, 3> Ops; 5438 Ops.push_back(N); 5439 Ops.push_back(Op.getOperand(I)); 5440 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 5441 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 5442 } 5443 } 5444 return N; 5445 } 5446 if (VT.getVectorElementType().isFloatingPoint()) { 5447 SmallVector<SDValue, 8> Ops; 5448 for (unsigned i = 0; i < NumElts; ++i) 5449 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 5450 Op.getOperand(i))); 5451 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 5452 SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); 5453 Val = LowerBUILD_VECTOR(Val, DAG, ST); 5454 if (Val.getNode()) 5455 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 5456 } 5457 if (usesOnlyOneValue) { 5458 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 5459 if (isConstant && Val.getNode()) 5460 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 5461 } 5462 } 5463 5464 // If all elements are constants and the case above didn't get hit, fall back 5465 // to the default expansion, which will generate a load from the constant 5466 // pool. 5467 if (isConstant) 5468 return SDValue(); 5469 5470 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 5471 if (NumElts >= 4) { 5472 SDValue shuffle = ReconstructShuffle(Op, DAG); 5473 if (shuffle != SDValue()) 5474 return shuffle; 5475 } 5476 5477 // Vectors with 32- or 64-bit elements can be built by directly assigning 5478 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 5479 // will be legalized. 5480 if (EltSize >= 32) { 5481 // Do the expansion with floating-point types, since that is what the VFP 5482 // registers are defined to use, and since i64 is not legal. 5483 EVT EltVT = EVT::getFloatingPointVT(EltSize); 5484 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 5485 SmallVector<SDValue, 8> Ops; 5486 for (unsigned i = 0; i < NumElts; ++i) 5487 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 5488 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 5489 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 5490 } 5491 5492 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 5493 // know the default expansion would otherwise fall back on something even 5494 // worse. For a vector with one or two non-undef values, that's 5495 // scalar_to_vector for the elements followed by a shuffle (provided the 5496 // shuffle is valid for the target) and materialization element by element 5497 // on the stack followed by a load for everything else. 
5498 if (!isConstant && !usesOnlyOneValue) { 5499 SDValue Vec = DAG.getUNDEF(VT); 5500 for (unsigned i = 0 ; i < NumElts; ++i) { 5501 SDValue V = Op.getOperand(i); 5502 if (V.getOpcode() == ISD::UNDEF) 5503 continue; 5504 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 5505 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 5506 } 5507 return Vec; 5508 } 5509 5510 return SDValue(); 5511 } 5512 5513 // Gather data to see if the operation can be modelled as a 5514 // shuffle in combination with VEXTs. 5515 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 5516 SelectionDAG &DAG) const { 5517 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 5518 SDLoc dl(Op); 5519 EVT VT = Op.getValueType(); 5520 unsigned NumElts = VT.getVectorNumElements(); 5521 5522 struct ShuffleSourceInfo { 5523 SDValue Vec; 5524 unsigned MinElt; 5525 unsigned MaxElt; 5526 5527 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 5528 // be compatible with the shuffle we intend to construct. As a result 5529 // ShuffleVec will be some sliding window into the original Vec. 5530 SDValue ShuffleVec; 5531 5532 // Code should guarantee that element i in Vec starts at element "WindowBase 5533 // + i * WindowScale in ShuffleVec". 5534 int WindowBase; 5535 int WindowScale; 5536 5537 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 5538 ShuffleSourceInfo(SDValue Vec) 5539 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 5540 WindowScale(1) {} 5541 }; 5542 5543 // First gather all vectors used as an immediate source for this BUILD_VECTOR 5544 // node. 5545 SmallVector<ShuffleSourceInfo, 2> Sources; 5546 for (unsigned i = 0; i < NumElts; ++i) { 5547 SDValue V = Op.getOperand(i); 5548 if (V.getOpcode() == ISD::UNDEF) 5549 continue; 5550 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 5551 // A shuffle can only come from building a vector from various 5552 // elements of other vectors. 5553 return SDValue(); 5554 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 5555 // Furthermore, shuffles require a constant mask, whereas extractelts 5556 // accept variable indices. 5557 return SDValue(); 5558 } 5559 5560 // Add this element source to the list if it's not already there. 5561 SDValue SourceVec = V.getOperand(0); 5562 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 5563 if (Source == Sources.end()) 5564 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 5565 5566 // Update the minimum and maximum lane number seen. 5567 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 5568 Source->MinElt = std::min(Source->MinElt, EltNo); 5569 Source->MaxElt = std::max(Source->MaxElt, EltNo); 5570 } 5571 5572 // Currently only do something sane when at most two source vectors 5573 // are involved. 5574 if (Sources.size() > 2) 5575 return SDValue(); 5576 5577 // Find out the smallest element size among result and two sources, and use 5578 // it as element size to build the shuffle_vector. 
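// e.g. if the result is v4i32 but one source is v8i16, the shuffle is built
// out of i16 lanes and each i32 element of the result covers two consecutive
// entries of the shuffle mask.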
5579 EVT SmallestEltTy = VT.getVectorElementType(); 5580 for (auto &Source : Sources) { 5581 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 5582 if (SrcEltTy.bitsLT(SmallestEltTy)) 5583 SmallestEltTy = SrcEltTy; 5584 } 5585 unsigned ResMultiplier = 5586 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 5587 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 5588 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 5589 5590 // If the source vector is too wide or too narrow, we may nevertheless be able 5591 // to construct a compatible shuffle either by concatenating it with UNDEF or 5592 // extracting a suitable range of elements. 5593 for (auto &Src : Sources) { 5594 EVT SrcVT = Src.ShuffleVec.getValueType(); 5595 5596 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 5597 continue; 5598 5599 // This stage of the search produces a source with the same element type as 5600 // the original, but with a total width matching the BUILD_VECTOR output. 5601 EVT EltVT = SrcVT.getVectorElementType(); 5602 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 5603 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 5604 5605 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 5606 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 5607 return SDValue(); 5608 // We can pad out the smaller vector for free, so if it's part of a 5609 // shuffle... 5610 Src.ShuffleVec = 5611 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 5612 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 5613 continue; 5614 } 5615 5616 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 5617 return SDValue(); 5618 5619 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 5620 // Span too large for a VEXT to cope 5621 return SDValue(); 5622 } 5623 5624 if (Src.MinElt >= NumSrcElts) { 5625 // The extraction can just take the second half 5626 Src.ShuffleVec = 5627 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5628 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 5629 Src.WindowBase = -NumSrcElts; 5630 } else if (Src.MaxElt < NumSrcElts) { 5631 // The extraction can just take the first half 5632 Src.ShuffleVec = 5633 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5634 DAG.getConstant(0, dl, MVT::i32)); 5635 } else { 5636 // An actual VEXT is needed 5637 SDValue VEXTSrc1 = 5638 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5639 DAG.getConstant(0, dl, MVT::i32)); 5640 SDValue VEXTSrc2 = 5641 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 5642 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 5643 5644 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 5645 VEXTSrc2, 5646 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 5647 Src.WindowBase = -Src.MinElt; 5648 } 5649 } 5650 5651 // Another possible incompatibility occurs from the vector element types. We 5652 // can fix this by bitcasting the source vectors to the same type we intend 5653 // for the shuffle. 5654 for (auto &Src : Sources) { 5655 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 5656 if (SrcEltTy == SmallestEltTy) 5657 continue; 5658 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 5659 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 5660 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 5661 Src.WindowBase *= Src.WindowScale; 5662 } 5663 5664 // Final sanity check before we try to actually produce a shuffle. 
5665 DEBUG( 5666 for (auto Src : Sources) 5667 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 5668 ); 5669 5670 // The stars all align, our next step is to produce the mask for the shuffle. 5671 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 5672 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 5673 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 5674 SDValue Entry = Op.getOperand(i); 5675 if (Entry.getOpcode() == ISD::UNDEF) 5676 continue; 5677 5678 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 5679 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 5680 5681 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 5682 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 5683 // segment. 5684 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 5685 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 5686 VT.getVectorElementType().getSizeInBits()); 5687 int LanesDefined = BitsDefined / BitsPerShuffleLane; 5688 5689 // This source is expected to fill ResMultiplier lanes of the final shuffle, 5690 // starting at the appropriate offset. 5691 int *LaneMask = &Mask[i * ResMultiplier]; 5692 5693 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 5694 ExtractBase += NumElts * (Src - Sources.begin()); 5695 for (int j = 0; j < LanesDefined; ++j) 5696 LaneMask[j] = ExtractBase + j; 5697 } 5698 5699 // Final check before we try to produce nonsense... 5700 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 5701 return SDValue(); 5702 5703 // We can't handle more than two sources. This should have already 5704 // been checked before this point. 5705 assert(Sources.size() <= 2 && "Too many sources!"); 5706 5707 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 5708 for (unsigned i = 0; i < Sources.size(); ++i) 5709 ShuffleOps[i] = Sources[i].ShuffleVec; 5710 5711 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 5712 ShuffleOps[1], &Mask[0]); 5713 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 5714 } 5715 5716 /// isShuffleMaskLegal - Targets can use this to indicate that they only 5717 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 5718 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 5719 /// are assumed to be legal. 5720 bool 5721 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 5722 EVT VT) const { 5723 if (VT.getVectorNumElements() == 4 && 5724 (VT.is128BitVector() || VT.is64BitVector())) { 5725 unsigned PFIndexes[4]; 5726 for (unsigned i = 0; i != 4; ++i) { 5727 if (M[i] < 0) 5728 PFIndexes[i] = 8; 5729 else 5730 PFIndexes[i] = M[i]; 5731 } 5732 5733 // Compute the index in the perfect shuffle table. 
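// Each mask entry is a base-9 digit (0-7 for a lane of the concatenated
// inputs, 8 for undef), so the four digits together index the table.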
5734 unsigned PFTableIndex = 5735 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 5736 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 5737 unsigned Cost = (PFEntry >> 30); 5738 5739 if (Cost <= 4) 5740 return true; 5741 } 5742 5743 bool ReverseVEXT, isV_UNDEF; 5744 unsigned Imm, WhichResult; 5745 5746 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 5747 return (EltSize >= 32 || 5748 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 5749 isVREVMask(M, VT, 64) || 5750 isVREVMask(M, VT, 32) || 5751 isVREVMask(M, VT, 16) || 5752 isVEXTMask(M, VT, ReverseVEXT, Imm) || 5753 isVTBLMask(M, VT) || 5754 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 5755 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 5756 } 5757 5758 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 5759 /// the specified operations to build the shuffle. 5760 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 5761 SDValue RHS, SelectionDAG &DAG, 5762 SDLoc dl) { 5763 unsigned OpNum = (PFEntry >> 26) & 0x0F; 5764 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 5765 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 5766 5767 enum { 5768 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 5769 OP_VREV, 5770 OP_VDUP0, 5771 OP_VDUP1, 5772 OP_VDUP2, 5773 OP_VDUP3, 5774 OP_VEXT1, 5775 OP_VEXT2, 5776 OP_VEXT3, 5777 OP_VUZPL, // VUZP, left result 5778 OP_VUZPR, // VUZP, right result 5779 OP_VZIPL, // VZIP, left result 5780 OP_VZIPR, // VZIP, right result 5781 OP_VTRNL, // VTRN, left result 5782 OP_VTRNR // VTRN, right result 5783 }; 5784 5785 if (OpNum == OP_COPY) { 5786 if (LHSID == (1*9+2)*9+3) return LHS; 5787 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 5788 return RHS; 5789 } 5790 5791 SDValue OpLHS, OpRHS; 5792 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 5793 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 5794 EVT VT = OpLHS.getValueType(); 5795 5796 switch (OpNum) { 5797 default: llvm_unreachable("Unknown shuffle opcode!"); 5798 case OP_VREV: 5799 // VREV divides the vector in half and swaps within the half. 
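// For example, with 32-bit elements VREV64 turns <0,1,2,3> into <1,0,3,2>:
// each 64-bit half keeps its two elements, but in reversed order.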
5800 if (VT.getVectorElementType() == MVT::i32 || 5801 VT.getVectorElementType() == MVT::f32) 5802 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 5803 // vrev <4 x i16> -> VREV32 5804 if (VT.getVectorElementType() == MVT::i16) 5805 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 5806 // vrev <4 x i8> -> VREV16 5807 assert(VT.getVectorElementType() == MVT::i8); 5808 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 5809 case OP_VDUP0: 5810 case OP_VDUP1: 5811 case OP_VDUP2: 5812 case OP_VDUP3: 5813 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 5814 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 5815 case OP_VEXT1: 5816 case OP_VEXT2: 5817 case OP_VEXT3: 5818 return DAG.getNode(ARMISD::VEXT, dl, VT, 5819 OpLHS, OpRHS, 5820 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 5821 case OP_VUZPL: 5822 case OP_VUZPR: 5823 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 5824 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 5825 case OP_VZIPL: 5826 case OP_VZIPR: 5827 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 5828 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 5829 case OP_VTRNL: 5830 case OP_VTRNR: 5831 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 5832 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 5833 } 5834 } 5835 5836 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 5837 ArrayRef<int> ShuffleMask, 5838 SelectionDAG &DAG) { 5839 // Check to see if we can use the VTBL instruction. 5840 SDValue V1 = Op.getOperand(0); 5841 SDValue V2 = Op.getOperand(1); 5842 SDLoc DL(Op); 5843 5844 SmallVector<SDValue, 8> VTBLMask; 5845 for (ArrayRef<int>::iterator 5846 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 5847 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 5848 5849 if (V2.getNode()->getOpcode() == ISD::UNDEF) 5850 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 5851 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); 5852 5853 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 5854 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); 5855 } 5856 5857 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 5858 SelectionDAG &DAG) { 5859 SDLoc DL(Op); 5860 SDValue OpLHS = Op.getOperand(0); 5861 EVT VT = OpLHS.getValueType(); 5862 5863 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 5864 "Expect a v8i16/v16i8 type"); 5865 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 5866 // For a v16i8 type: after the VREV64 each double word holds its eight bytes 5867 // in reversed order; the VEXT below then swaps the two double words, giving 5868 // the fully reversed vector. The v8i16 case is similar. 5869 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 5870 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 5871 DAG.getConstant(ExtractNum, DL, MVT::i32)); 5872 } 5873 5874 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 5875 SDValue V1 = Op.getOperand(0); 5876 SDValue V2 = Op.getOperand(1); 5877 SDLoc dl(Op); 5878 EVT VT = Op.getValueType(); 5879 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5880 5881 // Convert shuffles that are directly supported on NEON to target-specific 5882 // DAG nodes, instead of keeping them as shuffles and matching them again 5883 // during code selection. This is more efficient and avoids the possibility 5884 // of inconsistencies between legalization and selection. 5885 // FIXME: floating-point vectors should be canonicalized to integer vectors 5886 // of the same size so that they get CSEd properly.
5887 ArrayRef<int> ShuffleMask = SVN->getMask(); 5888 5889 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 5890 if (EltSize <= 32) { 5891 if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { 5892 int Lane = SVN->getSplatIndex(); 5893 // If this is undef splat, generate it via "just" vdup, if possible. 5894 if (Lane == -1) Lane = 0; 5895 5896 // Test if V1 is a SCALAR_TO_VECTOR. 5897 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5898 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 5899 } 5900 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 5901 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 5902 // reaches it). 5903 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 5904 !isa<ConstantSDNode>(V1.getOperand(0))) { 5905 bool IsScalarToVector = true; 5906 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 5907 if (V1.getOperand(i).getOpcode() != ISD::UNDEF) { 5908 IsScalarToVector = false; 5909 break; 5910 } 5911 if (IsScalarToVector) 5912 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 5913 } 5914 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 5915 DAG.getConstant(Lane, dl, MVT::i32)); 5916 } 5917 5918 bool ReverseVEXT; 5919 unsigned Imm; 5920 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 5921 if (ReverseVEXT) 5922 std::swap(V1, V2); 5923 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 5924 DAG.getConstant(Imm, dl, MVT::i32)); 5925 } 5926 5927 if (isVREVMask(ShuffleMask, VT, 64)) 5928 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 5929 if (isVREVMask(ShuffleMask, VT, 32)) 5930 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 5931 if (isVREVMask(ShuffleMask, VT, 16)) 5932 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 5933 5934 if (V2->getOpcode() == ISD::UNDEF && 5935 isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 5936 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 5937 DAG.getConstant(Imm, dl, MVT::i32)); 5938 } 5939 5940 // Check for Neon shuffles that modify both input vectors in place. 5941 // If both results are used, i.e., if there are two shuffles with the same 5942 // source operands and with masks corresponding to both results of one of 5943 // these operations, DAG memoization will ensure that a single node is 5944 // used for both shuffles. 5945 unsigned WhichResult; 5946 bool isV_UNDEF; 5947 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 5948 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 5949 if (isV_UNDEF) 5950 V2 = V1; 5951 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 5952 .getValue(WhichResult); 5953 } 5954 5955 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 5956 // shuffles that produce a result larger than their operands with: 5957 // shuffle(concat(v1, undef), concat(v2, undef)) 5958 // -> 5959 // shuffle(concat(v1, v2), undef) 5960 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 5961 // 5962 // This is useful in the general case, but there are special cases where 5963 // native shuffles produce larger results: the two-result ops. 5964 // 5965 // Look through the concat when lowering them: 5966 // shuffle(concat(v1, v2), undef) 5967 // -> 5968 // concat(VZIP(v1, v2):0, :1) 5969 // 5970 if (V1->getOpcode() == ISD::CONCAT_VECTORS && 5971 V2->getOpcode() == ISD::UNDEF) { 5972 SDValue SubV1 = V1->getOperand(0); 5973 SDValue SubV2 = V1->getOperand(1); 5974 EVT SubVT = SubV1.getValueType(); 5975 5976 // We expect these to have been canonicalized to -1. 
5977 assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { 5978 return i < (int)VT.getVectorNumElements(); 5979 }) && "Unexpected shuffle index into UNDEF operand!"); 5980 5981 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 5982 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 5983 if (isV_UNDEF) 5984 SubV2 = SubV1; 5985 assert((WhichResult == 0) && 5986 "In-place shuffle of concat can only have one result!"); 5987 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 5988 SubV1, SubV2); 5989 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 5990 Res.getValue(1)); 5991 } 5992 } 5993 } 5994 5995 // If the shuffle is not directly supported and it has 4 elements, use 5996 // the PerfectShuffle-generated table to synthesize it from other shuffles. 5997 unsigned NumElts = VT.getVectorNumElements(); 5998 if (NumElts == 4) { 5999 unsigned PFIndexes[4]; 6000 for (unsigned i = 0; i != 4; ++i) { 6001 if (ShuffleMask[i] < 0) 6002 PFIndexes[i] = 8; 6003 else 6004 PFIndexes[i] = ShuffleMask[i]; 6005 } 6006 6007 // Compute the index in the perfect shuffle table. 6008 unsigned PFTableIndex = 6009 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6010 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6011 unsigned Cost = (PFEntry >> 30); 6012 6013 if (Cost <= 4) 6014 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 6015 } 6016 6017 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 6018 if (EltSize >= 32) { 6019 // Do the expansion with floating-point types, since that is what the VFP 6020 // registers are defined to use, and since i64 is not legal. 6021 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6022 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6023 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 6024 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 6025 SmallVector<SDValue, 8> Ops; 6026 for (unsigned i = 0; i < NumElts; ++i) { 6027 if (ShuffleMask[i] < 0) 6028 Ops.push_back(DAG.getUNDEF(EltVT)); 6029 else 6030 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6031 ShuffleMask[i] < (int)NumElts ? V1 : V2, 6032 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 6033 dl, MVT::i32))); 6034 } 6035 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6036 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6037 } 6038 6039 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 6040 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 6041 6042 if (VT == MVT::v8i8) { 6043 SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); 6044 if (NewOp.getNode()) 6045 return NewOp; 6046 } 6047 6048 return SDValue(); 6049 } 6050 6051 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6052 // INSERT_VECTOR_ELT is legal only for immediate indexes. 6053 SDValue Lane = Op.getOperand(2); 6054 if (!isa<ConstantSDNode>(Lane)) 6055 return SDValue(); 6056 6057 return Op; 6058 } 6059 6060 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6061 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
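// A non-constant lane is returned unhandled (SDValue()) below so that the
// generic legalizer can expand it, typically by going through a stack slot.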
6062 SDValue Lane = Op.getOperand(1); 6063 if (!isa<ConstantSDNode>(Lane)) 6064 return SDValue(); 6065 6066 SDValue Vec = Op.getOperand(0); 6067 if (Op.getValueType() == MVT::i32 && 6068 Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { 6069 SDLoc dl(Op); 6070 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 6071 } 6072 6073 return Op; 6074 } 6075 6076 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6077 // The only time a CONCAT_VECTORS operation can have legal types is when 6078 // two 64-bit vectors are concatenated to a 128-bit vector. 6079 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 6080 "unexpected CONCAT_VECTORS"); 6081 SDLoc dl(Op); 6082 SDValue Val = DAG.getUNDEF(MVT::v2f64); 6083 SDValue Op0 = Op.getOperand(0); 6084 SDValue Op1 = Op.getOperand(1); 6085 if (Op0.getOpcode() != ISD::UNDEF) 6086 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6087 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 6088 DAG.getIntPtrConstant(0, dl)); 6089 if (Op1.getOpcode() != ISD::UNDEF) 6090 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6091 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 6092 DAG.getIntPtrConstant(1, dl)); 6093 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 6094 } 6095 6096 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 6097 /// element has been zero/sign-extended, depending on the isSigned parameter, 6098 /// from an integer type half its size. 6099 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 6100 bool isSigned) { 6101 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 6102 EVT VT = N->getValueType(0); 6103 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 6104 SDNode *BVN = N->getOperand(0).getNode(); 6105 if (BVN->getValueType(0) != MVT::v4i32 || 6106 BVN->getOpcode() != ISD::BUILD_VECTOR) 6107 return false; 6108 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6109 unsigned HiElt = 1 - LoElt; 6110 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 6111 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 6112 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 6113 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 6114 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 6115 return false; 6116 if (isSigned) { 6117 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 6118 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 6119 return true; 6120 } else { 6121 if (Hi0->isNullValue() && Hi1->isNullValue()) 6122 return true; 6123 } 6124 return false; 6125 } 6126 6127 if (N->getOpcode() != ISD::BUILD_VECTOR) 6128 return false; 6129 6130 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 6131 SDNode *Elt = N->getOperand(i).getNode(); 6132 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 6133 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 6134 unsigned HalfSize = EltSize / 2; 6135 if (isSigned) { 6136 if (!isIntN(HalfSize, C->getSExtValue())) 6137 return false; 6138 } else { 6139 if (!isUIntN(HalfSize, C->getZExtValue())) 6140 return false; 6141 } 6142 continue; 6143 } 6144 return false; 6145 } 6146 6147 return true; 6148 } 6149 6150 /// isSignExtended - Check if a node is a vector value that is sign-extended 6151 /// or a constant BUILD_VECTOR with sign-extended elements. 
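/// For example, (sext v4i16 to v4i32), a sign-extending vector load, or a
/// constant v4i32 BUILD_VECTOR whose elements all fit in 16 signed bits would
/// each count as sign-extended here.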
6152 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 6153 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 6154 return true; 6155 if (isExtendedBUILD_VECTOR(N, DAG, true)) 6156 return true; 6157 return false; 6158 } 6159 6160 /// isZeroExtended - Check if a node is a vector value that is zero-extended 6161 /// or a constant BUILD_VECTOR with zero-extended elements. 6162 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 6163 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 6164 return true; 6165 if (isExtendedBUILD_VECTOR(N, DAG, false)) 6166 return true; 6167 return false; 6168 } 6169 6170 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 6171 if (OrigVT.getSizeInBits() >= 64) 6172 return OrigVT; 6173 6174 assert(OrigVT.isSimple() && "Expecting a simple value type"); 6175 6176 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 6177 switch (OrigSimpleTy) { 6178 default: llvm_unreachable("Unexpected Vector Type"); 6179 case MVT::v2i8: 6180 case MVT::v2i16: 6181 return MVT::v2i32; 6182 case MVT::v4i8: 6183 return MVT::v4i16; 6184 } 6185 } 6186 6187 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 6188 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 6189 /// We insert the required extension here to get the vector to fill a D register. 6190 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 6191 const EVT &OrigTy, 6192 const EVT &ExtTy, 6193 unsigned ExtOpcode) { 6194 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 6195 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 6196 // 64-bits we need to insert a new extension so that it will be 64-bits. 6197 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 6198 if (OrigTy.getSizeInBits() >= 64) 6199 return N; 6200 6201 // Must extend size to at least 64 bits to be used as an operand for VMULL. 6202 EVT NewVT = getExtensionTo64Bits(OrigTy); 6203 6204 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 6205 } 6206 6207 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 6208 /// does not do any sign/zero extension. If the original vector is less 6209 /// than 64 bits, an appropriate extension will be added after the load to 6210 /// reach a total size of 64 bits. We have to add the extension separately 6211 /// because ARM does not have a sign/zero extending load for vectors. 6212 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 6213 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 6214 6215 // The load already has the right type. 6216 if (ExtendedTy == LD->getMemoryVT()) 6217 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 6218 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), 6219 LD->isNonTemporal(), LD->isInvariant(), 6220 LD->getAlignment()); 6221 6222 // We need to create a zextload/sextload. We cannot just create a load 6223 // followed by a zext/sext node because LowerMUL is also run during normal 6224 // operation legalization where we can't create illegal types.
6225 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 6226 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 6227 LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(), 6228 LD->isNonTemporal(), LD->getAlignment()); 6229 } 6230 6231 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 6232 /// extending load, or BUILD_VECTOR with extended elements, return the 6233 /// unextended value. The unextended vector should be 64 bits so that it can 6234 /// be used as an operand to a VMULL instruction. If the original vector size 6235 /// before extension is less than 64 bits we add an extension to resize 6236 /// the vector to 64 bits. 6237 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 6238 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 6239 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 6240 N->getOperand(0)->getValueType(0), 6241 N->getValueType(0), 6242 N->getOpcode()); 6243 6244 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) 6245 return SkipLoadExtensionForVMULL(LD, DAG); 6246 6247 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 6248 // have been legalized as a BITCAST from v4i32. 6249 if (N->getOpcode() == ISD::BITCAST) { 6250 SDNode *BVN = N->getOperand(0).getNode(); 6251 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 6252 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 6253 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6254 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, 6255 BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); 6256 } 6257 // Construct a new BUILD_VECTOR with elements truncated to half the size. 6258 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 6259 EVT VT = N->getValueType(0); 6260 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 6261 unsigned NumElts = VT.getVectorNumElements(); 6262 MVT TruncVT = MVT::getIntegerVT(EltSize); 6263 SmallVector<SDValue, 8> Ops; 6264 SDLoc dl(N); 6265 for (unsigned i = 0; i != NumElts; ++i) { 6266 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 6267 const APInt &CInt = C->getAPIntValue(); 6268 // Element types smaller than 32 bits are not legal, so use i32 elements. 6269 // The values are implicitly truncated so sext vs. zext doesn't matter. 6270 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 6271 } 6272 return DAG.getNode(ISD::BUILD_VECTOR, dl, 6273 MVT::getVectorVT(TruncVT, NumElts), Ops); 6274 } 6275 6276 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 6277 unsigned Opcode = N->getOpcode(); 6278 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6279 SDNode *N0 = N->getOperand(0).getNode(); 6280 SDNode *N1 = N->getOperand(1).getNode(); 6281 return N0->hasOneUse() && N1->hasOneUse() && 6282 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 6283 } 6284 return false; 6285 } 6286 6287 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 6288 unsigned Opcode = N->getOpcode(); 6289 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6290 SDNode *N0 = N->getOperand(0).getNode(); 6291 SDNode *N1 = N->getOperand(1).getNode(); 6292 return N0->hasOneUse() && N1->hasOneUse() && 6293 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 6294 } 6295 return false; 6296 } 6297 6298 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 6299 // Multiplications are only custom-lowered for 128-bit vectors so that 6300 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
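// Roughly: a v4i32 multiply whose operands are both sign-extended from v4i16
// is emitted as a single VMULLs of the two original v4i16 values (VMULLu for
// the zero-extended case), and the add/sub-of-extends patterns handled below
// end up as a vmull + vmlal pair, as described further down.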
6301 EVT VT = Op.getValueType(); 6302 assert(VT.is128BitVector() && VT.isInteger() && 6303 "unexpected type for custom-lowering ISD::MUL"); 6304 SDNode *N0 = Op.getOperand(0).getNode(); 6305 SDNode *N1 = Op.getOperand(1).getNode(); 6306 unsigned NewOpc = 0; 6307 bool isMLA = false; 6308 bool isN0SExt = isSignExtended(N0, DAG); 6309 bool isN1SExt = isSignExtended(N1, DAG); 6310 if (isN0SExt && isN1SExt) 6311 NewOpc = ARMISD::VMULLs; 6312 else { 6313 bool isN0ZExt = isZeroExtended(N0, DAG); 6314 bool isN1ZExt = isZeroExtended(N1, DAG); 6315 if (isN0ZExt && isN1ZExt) 6316 NewOpc = ARMISD::VMULLu; 6317 else if (isN1SExt || isN1ZExt) { 6318 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 6319 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 6320 if (isN1SExt && isAddSubSExt(N0, DAG)) { 6321 NewOpc = ARMISD::VMULLs; 6322 isMLA = true; 6323 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 6324 NewOpc = ARMISD::VMULLu; 6325 isMLA = true; 6326 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 6327 std::swap(N0, N1); 6328 NewOpc = ARMISD::VMULLu; 6329 isMLA = true; 6330 } 6331 } 6332 6333 if (!NewOpc) { 6334 if (VT == MVT::v2i64) 6335 // Fall through to expand this. It is not legal. 6336 return SDValue(); 6337 else 6338 // Other vector multiplications are legal. 6339 return Op; 6340 } 6341 } 6342 6343 // Legalize to a VMULL instruction. 6344 SDLoc DL(Op); 6345 SDValue Op0; 6346 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 6347 if (!isMLA) { 6348 Op0 = SkipExtensionForVMULL(N0, DAG); 6349 assert(Op0.getValueType().is64BitVector() && 6350 Op1.getValueType().is64BitVector() && 6351 "unexpected types for extended operands to VMULL"); 6352 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 6353 } 6354 6355 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 6356 // isel lowering to take advantage of no-stall back to back vmul + vmla. 6357 // vmull q0, d4, d6 6358 // vmlal q0, d5, d6 6359 // is faster than 6360 // vaddl q0, d4, d5 6361 // vmovl q1, d6 6362 // vmul q0, q0, q1 6363 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 6364 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 6365 EVT Op1VT = Op1.getValueType(); 6366 return DAG.getNode(N0->getOpcode(), DL, VT, 6367 DAG.getNode(NewOpc, DL, VT, 6368 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 6369 DAG.getNode(NewOpc, DL, VT, 6370 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 6371 } 6372 6373 static SDValue 6374 LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { 6375 // TODO: Should this propagate fast-math-flags? 6376 6377 // Convert to float 6378 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 6379 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 6380 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 6381 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 6382 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 6383 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 6384 // Get reciprocal estimate. 6385 // float4 recip = vrecpeq_f32(yf); 6386 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6387 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6388 Y); 6389 // Because char has a smaller range than uchar, we can actually get away 6390 // without any newton steps. This requires that we use a weird bias 6391 // of 0xb000, however (again, this has been exhaustively tested). 
6392 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 6393 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 6394 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 6395 Y = DAG.getConstant(0xb000, dl, MVT::i32); 6396 Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); 6397 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 6398 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 6399 // Convert back to short. 6400 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 6401 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 6402 return X; 6403 } 6404 6405 static SDValue 6406 LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { 6407 // TODO: Should this propagate fast-math-flags? 6408 6409 SDValue N2; 6410 // Convert to float. 6411 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 6412 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 6413 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 6414 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 6415 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 6416 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 6417 6418 // Use reciprocal estimate and one refinement step. 6419 // float4 recip = vrecpeq_f32(yf); 6420 // recip *= vrecpsq_f32(yf, recip); 6421 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6422 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6423 N1); 6424 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6425 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6426 N1, N2); 6427 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6428 // Because short has a smaller range than ushort, we can actually get away 6429 // with only a single newton step. This requires that we use a weird bias 6430 // of 0x89, however (again, this has been exhaustively tested). 6431 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 6432 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 6433 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 6434 N1 = DAG.getConstant(0x89, dl, MVT::i32); 6435 N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); 6436 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 6437 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 6438 // Convert back to integer and return.
6439 // return vmovn_s32(vcvt_s32_f32(result)); 6440 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 6441 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 6442 return N0; 6443 } 6444 6445 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 6446 EVT VT = Op.getValueType(); 6447 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 6448 "unexpected type for custom-lowering ISD::SDIV"); 6449 6450 SDLoc dl(Op); 6451 SDValue N0 = Op.getOperand(0); 6452 SDValue N1 = Op.getOperand(1); 6453 SDValue N2, N3; 6454 6455 if (VT == MVT::v8i8) { 6456 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 6457 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 6458 6459 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6460 DAG.getIntPtrConstant(4, dl)); 6461 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6462 DAG.getIntPtrConstant(4, dl)); 6463 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6464 DAG.getIntPtrConstant(0, dl)); 6465 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6466 DAG.getIntPtrConstant(0, dl)); 6467 6468 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 6469 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 6470 6471 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 6472 N0 = LowerCONCAT_VECTORS(N0, DAG); 6473 6474 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 6475 return N0; 6476 } 6477 return LowerSDIV_v4i16(N0, N1, dl, DAG); 6478 } 6479 6480 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 6481 // TODO: Should this propagate fast-math-flags? 6482 EVT VT = Op.getValueType(); 6483 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 6484 "unexpected type for custom-lowering ISD::UDIV"); 6485 6486 SDLoc dl(Op); 6487 SDValue N0 = Op.getOperand(0); 6488 SDValue N1 = Op.getOperand(1); 6489 SDValue N2, N3; 6490 6491 if (VT == MVT::v8i8) { 6492 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 6493 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 6494 6495 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6496 DAG.getIntPtrConstant(4, dl)); 6497 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6498 DAG.getIntPtrConstant(4, dl)); 6499 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 6500 DAG.getIntPtrConstant(0, dl)); 6501 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 6502 DAG.getIntPtrConstant(0, dl)); 6503 6504 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 6505 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 6506 6507 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 6508 N0 = LowerCONCAT_VECTORS(N0, DAG); 6509 6510 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 6511 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 6512 MVT::i32), 6513 N0); 6514 return N0; 6515 } 6516 6517 // v4i16 udiv ... Convert to float. 6518 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 6519 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 6520 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 6521 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 6522 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 6523 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 6524 6525 // Use reciprocal estimate and two refinement steps.
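// Each vrecps(d, x) computes 2 - d*x, so multiplying it back into x performs
// one Newton-Raphson iteration x' = x*(2 - d*x) towards 1/d; two such steps
// give enough accuracy for the 16-bit quotients produced here.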
6526 // float4 recip = vrecpeq_f32(yf); 6527 // recip *= vrecpsq_f32(yf, recip); 6528 // recip *= vrecpsq_f32(yf, recip); 6529 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6530 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 6531 BN1); 6532 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6533 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6534 BN1, N2); 6535 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6536 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 6537 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 6538 BN1, N2); 6539 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 6540 // Simply multiplying by the reciprocal estimate can leave us a few ulps 6541 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 6542 // and that it will never cause us to return an answer too large). 6543 // float4 result = as_float4(as_int4(xf*recip) + 2); 6544 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 6545 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 6546 N1 = DAG.getConstant(2, dl, MVT::i32); 6547 N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); 6548 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 6549 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 6550 // Convert back to integer and return. 6551 // return vmovn_u32(vcvt_s32_f32(result)); 6552 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 6553 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 6554 return N0; 6555 } 6556 6557 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 6558 EVT VT = Op.getNode()->getValueType(0); 6559 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 6560 6561 unsigned Opc; 6562 bool ExtraOp = false; 6563 switch (Op.getOpcode()) { 6564 default: llvm_unreachable("Invalid code"); 6565 case ISD::ADDC: Opc = ARMISD::ADDC; break; 6566 case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; 6567 case ISD::SUBC: Opc = ARMISD::SUBC; break; 6568 case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; 6569 } 6570 6571 if (!ExtraOp) 6572 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 6573 Op.getOperand(1)); 6574 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 6575 Op.getOperand(1), Op.getOperand(2)); 6576 } 6577 6578 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 6579 assert(Subtarget->isTargetDarwin()); 6580 6581 // For iOS, we want to call an alternative entry point: __sincos_stret, 6582 // return values are passed via sret. 6583 SDLoc dl(Op); 6584 SDValue Arg = Op.getOperand(0); 6585 EVT ArgVT = Arg.getValueType(); 6586 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 6587 auto PtrVT = getPointerTy(DAG.getDataLayout()); 6588 6589 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 6590 6591 // Pair of floats / doubles used to pass the result. 6592 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 6593 6594 // Create stack object for sret. 
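// The callee writes a { sin, cos } pair through this pointer; the loads below
// pick up sin at offset 0 and cos one element further on.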
6595 auto &DL = DAG.getDataLayout(); 6596 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 6597 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 6598 int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); 6599 SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); 6600 6601 ArgListTy Args; 6602 ArgListEntry Entry; 6603 6604 Entry.Node = SRet; 6605 Entry.Ty = RetTy->getPointerTo(); 6606 Entry.isSExt = false; 6607 Entry.isZExt = false; 6608 Entry.isSRet = true; 6609 Args.push_back(Entry); 6610 6611 Entry.Node = Arg; 6612 Entry.Ty = ArgTy; 6613 Entry.isSExt = false; 6614 Entry.isZExt = false; 6615 Args.push_back(Entry); 6616 6617 const char *LibcallName = 6618 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 6619 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 6620 6621 TargetLowering::CallLoweringInfo CLI(DAG); 6622 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 6623 .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, 6624 std::move(Args), 0) 6625 .setDiscardResult(); 6626 6627 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 6628 6629 SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, 6630 MachinePointerInfo(), false, false, false, 0); 6631 6632 // Address of cos field. 6633 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 6634 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 6635 SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, 6636 MachinePointerInfo(), false, false, false, 0); 6637 6638 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 6639 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 6640 LoadSin.getValue(0), LoadCos.getValue(0)); 6641 } 6642 6643 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 6644 bool Signed, 6645 SDValue &Chain) const { 6646 EVT VT = Op.getValueType(); 6647 assert((VT == MVT::i32 || VT == MVT::i64) && 6648 "unexpected type for custom lowering DIV"); 6649 SDLoc dl(Op); 6650 6651 const auto &DL = DAG.getDataLayout(); 6652 const auto &TLI = DAG.getTargetLoweringInfo(); 6653 6654 const char *Name = nullptr; 6655 if (Signed) 6656 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 6657 else 6658 Name = (VT == MVT::i32) ? 
"__rt_udiv" : "__rt_udiv64"; 6659 6660 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 6661 6662 ARMTargetLowering::ArgListTy Args; 6663 6664 for (auto AI : {1, 0}) { 6665 ArgListEntry Arg; 6666 Arg.Node = Op.getOperand(AI); 6667 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 6668 Args.push_back(Arg); 6669 } 6670 6671 CallLoweringInfo CLI(DAG); 6672 CLI.setDebugLoc(dl) 6673 .setChain(Chain) 6674 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 6675 ES, std::move(Args), 0); 6676 6677 return LowerCallTo(CLI).first; 6678 } 6679 6680 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 6681 bool Signed) const { 6682 assert(Op.getValueType() == MVT::i32 && 6683 "unexpected type for custom lowering DIV"); 6684 SDLoc dl(Op); 6685 6686 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 6687 DAG.getEntryNode(), Op.getOperand(1)); 6688 6689 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 6690 } 6691 6692 void ARMTargetLowering::ExpandDIV_Windows( 6693 SDValue Op, SelectionDAG &DAG, bool Signed, 6694 SmallVectorImpl<SDValue> &Results) const { 6695 const auto &DL = DAG.getDataLayout(); 6696 const auto &TLI = DAG.getTargetLoweringInfo(); 6697 6698 assert(Op.getValueType() == MVT::i64 && 6699 "unexpected type for custom lowering DIV"); 6700 SDLoc dl(Op); 6701 6702 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), 6703 DAG.getConstant(0, dl, MVT::i32)); 6704 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), 6705 DAG.getConstant(1, dl, MVT::i32)); 6706 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); 6707 6708 SDValue DBZCHK = 6709 DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); 6710 6711 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 6712 6713 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 6714 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 6715 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 6716 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 6717 6718 Results.push_back(Lower); 6719 Results.push_back(Upper); 6720 } 6721 6722 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 6723 // Monotonic load/store is legal for all targets 6724 if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic) 6725 return Op; 6726 6727 // Acquire/Release load/store is not legal for targets without a 6728 // dmb or equivalent available. 
6729 return SDValue(); 6730 } 6731 6732 static void ReplaceREADCYCLECOUNTER(SDNode *N, 6733 SmallVectorImpl<SDValue> &Results, 6734 SelectionDAG &DAG, 6735 const ARMSubtarget *Subtarget) { 6736 SDLoc DL(N); 6737 // Under Power Management extensions, the cycle-count is: 6738 // mrc p15, #0, <Rt>, c9, c13, #0 6739 SDValue Ops[] = { N->getOperand(0), // Chain 6740 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 6741 DAG.getConstant(15, DL, MVT::i32), 6742 DAG.getConstant(0, DL, MVT::i32), 6743 DAG.getConstant(9, DL, MVT::i32), 6744 DAG.getConstant(13, DL, MVT::i32), 6745 DAG.getConstant(0, DL, MVT::i32) 6746 }; 6747 6748 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 6749 DAG.getVTList(MVT::i32, MVT::Other), Ops); 6750 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 6751 DAG.getConstant(0, DL, MVT::i32))); 6752 Results.push_back(Cycles32.getValue(1)); 6753 } 6754 6755 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 6756 switch (Op.getOpcode()) { 6757 default: llvm_unreachable("Don't know how to custom lower this!"); 6758 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 6759 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6760 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 6761 case ISD::GlobalAddress: 6762 switch (Subtarget->getTargetTriple().getObjectFormat()) { 6763 default: llvm_unreachable("unknown object format"); 6764 case Triple::COFF: 6765 return LowerGlobalAddressWindows(Op, DAG); 6766 case Triple::ELF: 6767 return LowerGlobalAddressELF(Op, DAG); 6768 case Triple::MachO: 6769 return LowerGlobalAddressDarwin(Op, DAG); 6770 } 6771 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6772 case ISD::SELECT: return LowerSELECT(Op, DAG); 6773 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 6774 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 6775 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 6776 case ISD::VASTART: return LowerVASTART(Op, DAG); 6777 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 6778 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 6779 case ISD::SINT_TO_FP: 6780 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 6781 case ISD::FP_TO_SINT: 6782 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 6783 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6784 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6785 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6786 case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); 6787 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 6788 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 6789 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 6790 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 6791 Subtarget); 6792 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); 6793 case ISD::SHL: 6794 case ISD::SRL: 6795 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 6796 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 6797 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 6798 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 6799 case ISD::SRL_PARTS: 6800 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 6801 case ISD::CTTZ: 6802 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 6803 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 6804 case ISD::SETCC: return LowerVSETCC(Op, DAG); 6805 
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 6806 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 6807 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6808 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6809 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6810 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 6811 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6812 case ISD::MUL: return LowerMUL(Op, DAG); 6813 case ISD::SDIV: 6814 if (Subtarget->isTargetWindows()) 6815 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 6816 return LowerSDIV(Op, DAG); 6817 case ISD::UDIV: 6818 if (Subtarget->isTargetWindows()) 6819 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 6820 return LowerUDIV(Op, DAG); 6821 case ISD::ADDC: 6822 case ISD::ADDE: 6823 case ISD::SUBC: 6824 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 6825 case ISD::SADDO: 6826 case ISD::UADDO: 6827 case ISD::SSUBO: 6828 case ISD::USUBO: 6829 return LowerXALUO(Op, DAG); 6830 case ISD::ATOMIC_LOAD: 6831 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 6832 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 6833 case ISD::SDIVREM: 6834 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 6835 case ISD::DYNAMIC_STACKALLOC: 6836 if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) 6837 return LowerDYNAMIC_STACKALLOC(Op, DAG); 6838 llvm_unreachable("Don't know how to custom lower this!"); 6839 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 6840 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 6841 case ARMISD::WIN__DBZCHK: return SDValue(); 6842 } 6843 } 6844 6845 /// ReplaceNodeResults - Replace the results of node with an illegal result 6846 /// type with new values built out of custom code. 6847 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 6848 SmallVectorImpl<SDValue> &Results, 6849 SelectionDAG &DAG) const { 6850 SDValue Res; 6851 switch (N->getOpcode()) { 6852 default: 6853 llvm_unreachable("Don't know how to custom expand this!"); 6854 case ISD::READ_REGISTER: 6855 ExpandREAD_REGISTER(N, Results, DAG); 6856 break; 6857 case ISD::BITCAST: 6858 Res = ExpandBITCAST(N, DAG); 6859 break; 6860 case ISD::SRL: 6861 case ISD::SRA: 6862 Res = Expand64BitShift(N, DAG, Subtarget); 6863 break; 6864 case ISD::SREM: 6865 case ISD::UREM: 6866 Res = LowerREM(N, DAG); 6867 break; 6868 case ISD::READCYCLECOUNTER: 6869 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 6870 return; 6871 case ISD::UDIV: 6872 case ISD::SDIV: 6873 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 6874 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 6875 Results); 6876 } 6877 if (Res.getNode()) 6878 Results.push_back(Res); 6879 } 6880 6881 //===----------------------------------------------------------------------===// 6882 // ARM Scheduler Hooks 6883 //===----------------------------------------------------------------------===// 6884 6885 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 6886 /// registers the function context. 
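/// Concretely, the sequences emitted below materialize the address of
/// DispatchBB (with the low bit set when targeting Thumb) and store it into
/// the jump-buffer slot at offset 36 ('&jbuf[1] :: pc'), so that a resume
/// through the function context lands in the dispatch block.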
6887 void ARMTargetLowering:: 6888 SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, 6889 MachineBasicBlock *DispatchBB, int FI) const { 6890 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 6891 DebugLoc dl = MI->getDebugLoc(); 6892 MachineFunction *MF = MBB->getParent(); 6893 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6894 MachineConstantPool *MCP = MF->getConstantPool(); 6895 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 6896 const Function *F = MF->getFunction(); 6897 6898 bool isThumb = Subtarget->isThumb(); 6899 bool isThumb2 = Subtarget->isThumb2(); 6900 6901 unsigned PCLabelId = AFI->createPICLabelUId(); 6902 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 6903 ARMConstantPoolValue *CPV = 6904 ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); 6905 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 6906 6907 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 6908 : &ARM::GPRRegClass; 6909 6910 // Grab constant pool and fixed stack memory operands. 6911 MachineMemOperand *CPMMO = 6912 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 6913 MachineMemOperand::MOLoad, 4, 4); 6914 6915 MachineMemOperand *FIMMOSt = 6916 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 6917 MachineMemOperand::MOStore, 4, 4); 6918 6919 // Load the address of the dispatch MBB into the jump buffer. 6920 if (isThumb2) { 6921 // Incoming value: jbuf 6922 // ldr.n r5, LCPI1_1 6923 // orr r5, r5, #1 6924 // add r5, pc 6925 // str r5, [$jbuf, #+4] ; &jbuf[1] 6926 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6927 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 6928 .addConstantPoolIndex(CPI) 6929 .addMemOperand(CPMMO)); 6930 // Set the low bit because of thumb mode. 6931 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 6932 AddDefaultCC( 6933 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 6934 .addReg(NewVReg1, RegState::Kill) 6935 .addImm(0x01))); 6936 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 6937 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 6938 .addReg(NewVReg2, RegState::Kill) 6939 .addImm(PCLabelId); 6940 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 6941 .addReg(NewVReg3, RegState::Kill) 6942 .addFrameIndex(FI) 6943 .addImm(36) // &jbuf[1] :: pc 6944 .addMemOperand(FIMMOSt)); 6945 } else if (isThumb) { 6946 // Incoming value: jbuf 6947 // ldr.n r1, LCPI1_4 6948 // add r1, pc 6949 // mov r2, #1 6950 // orrs r1, r2 6951 // add r2, $jbuf, #+4 ; &jbuf[1] 6952 // str r1, [r2] 6953 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6954 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 6955 .addConstantPoolIndex(CPI) 6956 .addMemOperand(CPMMO)); 6957 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 6958 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 6959 .addReg(NewVReg1, RegState::Kill) 6960 .addImm(PCLabelId); 6961 // Set the low bit because of thumb mode. 
6962 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 6963 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 6964 .addReg(ARM::CPSR, RegState::Define) 6965 .addImm(1)); 6966 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 6967 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 6968 .addReg(ARM::CPSR, RegState::Define) 6969 .addReg(NewVReg2, RegState::Kill) 6970 .addReg(NewVReg3, RegState::Kill)); 6971 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 6972 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 6973 .addFrameIndex(FI) 6974 .addImm(36); // &jbuf[1] :: pc 6975 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 6976 .addReg(NewVReg4, RegState::Kill) 6977 .addReg(NewVReg5, RegState::Kill) 6978 .addImm(0) 6979 .addMemOperand(FIMMOSt)); 6980 } else { 6981 // Incoming value: jbuf 6982 // ldr r1, LCPI1_1 6983 // add r1, pc, r1 6984 // str r1, [$jbuf, #+4] ; &jbuf[1] 6985 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6986 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 6987 .addConstantPoolIndex(CPI) 6988 .addImm(0) 6989 .addMemOperand(CPMMO)); 6990 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 6991 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 6992 .addReg(NewVReg1, RegState::Kill) 6993 .addImm(PCLabelId)); 6994 AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 6995 .addReg(NewVReg2, RegState::Kill) 6996 .addFrameIndex(FI) 6997 .addImm(36) // &jbuf[1] :: pc 6998 .addMemOperand(FIMMOSt)); 6999 } 7000 } 7001 7002 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, 7003 MachineBasicBlock *MBB) const { 7004 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7005 DebugLoc dl = MI->getDebugLoc(); 7006 MachineFunction *MF = MBB->getParent(); 7007 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7008 MachineFrameInfo *MFI = MF->getFrameInfo(); 7009 int FI = MFI->getFunctionContextIndex(); 7010 7011 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 7012 : &ARM::GPRnopcRegClass; 7013 7014 // Get a mapping of the call site numbers to all of the landing pads they're 7015 // associated with. 7016 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; 7017 unsigned MaxCSNum = 0; 7018 MachineModuleInfo &MMI = MF->getMMI(); 7019 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 7020 ++BB) { 7021 if (!BB->isEHPad()) continue; 7022 7023 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 7024 // pad. 7025 for (MachineBasicBlock::iterator 7026 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 7027 if (!II->isEHLabel()) continue; 7028 7029 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 7030 if (!MMI.hasCallSiteLandingPad(Sym)) continue; 7031 7032 SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); 7033 for (SmallVectorImpl<unsigned>::iterator 7034 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 7035 CSI != CSE; ++CSI) { 7036 CallSiteNumToLPad[*CSI].push_back(&*BB); 7037 MaxCSNum = std::max(MaxCSNum, *CSI); 7038 } 7039 break; 7040 } 7041 } 7042 7043 // Get an ordered list of the machine basic blocks for the jump table. 
7044 std::vector<MachineBasicBlock*> LPadList; 7045 SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs; 7046 LPadList.reserve(CallSiteNumToLPad.size()); 7047 for (unsigned I = 1; I <= MaxCSNum; ++I) { 7048 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 7049 for (SmallVectorImpl<MachineBasicBlock*>::iterator 7050 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 7051 LPadList.push_back(*II); 7052 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 7053 } 7054 } 7055 7056 assert(!LPadList.empty() && 7057 "No landing pad destinations for the dispatch jump table!"); 7058 7059 // Create the jump table and associated information. 7060 MachineJumpTableInfo *JTI = 7061 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 7062 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 7063 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 7064 7065 // Create the MBBs for the dispatch code. 7066 7067 // Shove the dispatch's address into the return slot in the function context. 7068 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 7069 DispatchBB->setIsEHPad(); 7070 7071 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 7072 unsigned trap_opcode; 7073 if (Subtarget->isThumb()) 7074 trap_opcode = ARM::tTRAP; 7075 else 7076 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 7077 7078 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 7079 DispatchBB->addSuccessor(TrapBB); 7080 7081 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 7082 DispatchBB->addSuccessor(DispContBB); 7083 7084 // Insert the new MBBs at the end of the function. 7085 MF->insert(MF->end(), DispatchBB); 7086 MF->insert(MF->end(), DispContBB); 7087 MF->insert(MF->end(), TrapBB); 7088 7089 // Insert code into the entry block that creates and registers the function 7090 // context. 7091 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 7092 7093 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 7094 MachinePointerInfo::getFixedStack(*MF, FI), 7095 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 7096 7097 MachineInstrBuilder MIB; 7098 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 7099 7100 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 7101 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 7102 7103 // Add a register mask with no preserved registers. This results in all 7104 // registers being marked as clobbered.
7105 MIB.addRegMask(RI.getNoPreservedMask()); 7106 7107 unsigned NumLPads = LPadList.size(); 7108 if (Subtarget->isThumb2()) { 7109 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7110 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 7111 .addFrameIndex(FI) 7112 .addImm(4) 7113 .addMemOperand(FIMMOLd)); 7114 7115 if (NumLPads < 256) { 7116 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 7117 .addReg(NewVReg1) 7118 .addImm(LPadList.size())); 7119 } else { 7120 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7121 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 7122 .addImm(NumLPads & 0xFFFF)); 7123 7124 unsigned VReg2 = VReg1; 7125 if ((NumLPads & 0xFFFF0000) != 0) { 7126 VReg2 = MRI->createVirtualRegister(TRC); 7127 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 7128 .addReg(VReg1) 7129 .addImm(NumLPads >> 16)); 7130 } 7131 7132 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 7133 .addReg(NewVReg1) 7134 .addReg(VReg2)); 7135 } 7136 7137 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 7138 .addMBB(TrapBB) 7139 .addImm(ARMCC::HI) 7140 .addReg(ARM::CPSR); 7141 7142 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7143 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) 7144 .addJumpTableIndex(MJTI)); 7145 7146 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7147 AddDefaultCC( 7148 AddDefaultPred( 7149 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 7150 .addReg(NewVReg3, RegState::Kill) 7151 .addReg(NewVReg1) 7152 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 7153 7154 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 7155 .addReg(NewVReg4, RegState::Kill) 7156 .addReg(NewVReg1) 7157 .addJumpTableIndex(MJTI); 7158 } else if (Subtarget->isThumb()) { 7159 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7160 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 7161 .addFrameIndex(FI) 7162 .addImm(1) 7163 .addMemOperand(FIMMOLd)); 7164 7165 if (NumLPads < 256) { 7166 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 7167 .addReg(NewVReg1) 7168 .addImm(NumLPads)); 7169 } else { 7170 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7171 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7172 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 7173 7174 // MachineConstantPool wants an explicit alignment. 
7175 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 7176 if (Align == 0) 7177 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 7178 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 7179 7180 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7181 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 7182 .addReg(VReg1, RegState::Define) 7183 .addConstantPoolIndex(Idx)); 7184 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 7185 .addReg(NewVReg1) 7186 .addReg(VReg1)); 7187 } 7188 7189 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 7190 .addMBB(TrapBB) 7191 .addImm(ARMCC::HI) 7192 .addReg(ARM::CPSR); 7193 7194 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7195 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 7196 .addReg(ARM::CPSR, RegState::Define) 7197 .addReg(NewVReg1) 7198 .addImm(2)); 7199 7200 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7201 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 7202 .addJumpTableIndex(MJTI)); 7203 7204 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7205 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 7206 .addReg(ARM::CPSR, RegState::Define) 7207 .addReg(NewVReg2, RegState::Kill) 7208 .addReg(NewVReg3)); 7209 7210 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 7211 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 7212 7213 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7214 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 7215 .addReg(NewVReg4, RegState::Kill) 7216 .addImm(0) 7217 .addMemOperand(JTMMOLd)); 7218 7219 unsigned NewVReg6 = NewVReg5; 7220 if (RelocM == Reloc::PIC_) { 7221 NewVReg6 = MRI->createVirtualRegister(TRC); 7222 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 7223 .addReg(ARM::CPSR, RegState::Define) 7224 .addReg(NewVReg5, RegState::Kill) 7225 .addReg(NewVReg3)); 7226 } 7227 7228 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 7229 .addReg(NewVReg6, RegState::Kill) 7230 .addJumpTableIndex(MJTI); 7231 } else { 7232 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7233 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 7234 .addFrameIndex(FI) 7235 .addImm(4) 7236 .addMemOperand(FIMMOLd)); 7237 7238 if (NumLPads < 256) { 7239 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 7240 .addReg(NewVReg1) 7241 .addImm(NumLPads)); 7242 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 7243 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7244 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 7245 .addImm(NumLPads & 0xFFFF)); 7246 7247 unsigned VReg2 = VReg1; 7248 if ((NumLPads & 0xFFFF0000) != 0) { 7249 VReg2 = MRI->createVirtualRegister(TRC); 7250 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 7251 .addReg(VReg1) 7252 .addImm(NumLPads >> 16)); 7253 } 7254 7255 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 7256 .addReg(NewVReg1) 7257 .addReg(VReg2)); 7258 } else { 7259 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7260 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7261 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 7262 7263 // MachineConstantPool wants an explicit alignment. 
7264 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 7265 if (Align == 0) 7266 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 7267 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 7268 7269 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7270 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 7271 .addReg(VReg1, RegState::Define) 7272 .addConstantPoolIndex(Idx) 7273 .addImm(0)); 7274 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 7275 .addReg(NewVReg1) 7276 .addReg(VReg1, RegState::Kill)); 7277 } 7278 7279 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 7280 .addMBB(TrapBB) 7281 .addImm(ARMCC::HI) 7282 .addReg(ARM::CPSR); 7283 7284 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7285 AddDefaultCC( 7286 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 7287 .addReg(NewVReg1) 7288 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 7289 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7290 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 7291 .addJumpTableIndex(MJTI)); 7292 7293 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 7294 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 7295 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7296 AddDefaultPred( 7297 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 7298 .addReg(NewVReg3, RegState::Kill) 7299 .addReg(NewVReg4) 7300 .addImm(0) 7301 .addMemOperand(JTMMOLd)); 7302 7303 if (RelocM == Reloc::PIC_) { 7304 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 7305 .addReg(NewVReg5, RegState::Kill) 7306 .addReg(NewVReg4) 7307 .addJumpTableIndex(MJTI); 7308 } else { 7309 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 7310 .addReg(NewVReg5, RegState::Kill) 7311 .addJumpTableIndex(MJTI); 7312 } 7313 } 7314 7315 // Add the jump table entries as successors to the MBB. 7316 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 7317 for (std::vector<MachineBasicBlock*>::iterator 7318 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 7319 MachineBasicBlock *CurMBB = *I; 7320 if (SeenMBBs.insert(CurMBB).second) 7321 DispContBB->addSuccessor(CurMBB); 7322 } 7323 7324 // N.B. the order the invoke BBs are processed in doesn't matter here. 7325 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 7326 SmallVector<MachineBasicBlock*, 64> MBBLPads; 7327 for (MachineBasicBlock *BB : InvokeBBs) { 7328 7329 // Remove the landing pad successor from the invoke block and replace it 7330 // with the new dispatch block. 7331 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 7332 BB->succ_end()); 7333 while (!Successors.empty()) { 7334 MachineBasicBlock *SMBB = Successors.pop_back_val(); 7335 if (SMBB->isEHPad()) { 7336 BB->removeSuccessor(SMBB); 7337 MBBLPads.push_back(SMBB); 7338 } 7339 } 7340 7341 BB->addSuccessor(DispatchBB); 7342 7343 // Find the invoke call and mark all of the callee-saved registers as 7344 // 'implicit defined' so that they're spilled. This prevents code from 7345 // moving instructions to before the EH block, where they will never be 7346 // executed. 
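// Only registers usable in the current mode are added below: tGPR/hGPR for
// Thumb2, tGPR for Thumb1, and GPR otherwise; registers the call already
// defines are left alone.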
7347 for (MachineBasicBlock::reverse_iterator 7348 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 7349 if (!II->isCall()) continue; 7350 7351 DenseMap<unsigned, bool> DefRegs; 7352 for (MachineInstr::mop_iterator 7353 OI = II->operands_begin(), OE = II->operands_end(); 7354 OI != OE; ++OI) { 7355 if (!OI->isReg()) continue; 7356 DefRegs[OI->getReg()] = true; 7357 } 7358 7359 MachineInstrBuilder MIB(*MF, &*II); 7360 7361 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 7362 unsigned Reg = SavedRegs[i]; 7363 if (Subtarget->isThumb2() && 7364 !ARM::tGPRRegClass.contains(Reg) && 7365 !ARM::hGPRRegClass.contains(Reg)) 7366 continue; 7367 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 7368 continue; 7369 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 7370 continue; 7371 if (!DefRegs[Reg]) 7372 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 7373 } 7374 7375 break; 7376 } 7377 } 7378 7379 // Mark all former landing pads as non-landing pads. The dispatch is the only 7380 // landing pad now. 7381 for (SmallVectorImpl<MachineBasicBlock*>::iterator 7382 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 7383 (*I)->setIsEHPad(false); 7384 7385 // The instruction is gone now. 7386 MI->eraseFromParent(); 7387 } 7388 7389 static 7390 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 7391 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 7392 E = MBB->succ_end(); I != E; ++I) 7393 if (*I != Succ) 7394 return *I; 7395 llvm_unreachable("Expecting a BB with two successors!"); 7396 } 7397 7398 /// Return the load opcode for a given load size. If load size >= 8, 7399 /// neon opcode will be returned. 7400 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 7401 if (LdSize >= 8) 7402 return LdSize == 16 ? ARM::VLD1q32wb_fixed 7403 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 7404 if (IsThumb1) 7405 return LdSize == 4 ? ARM::tLDRi 7406 : LdSize == 2 ? ARM::tLDRHi 7407 : LdSize == 1 ? ARM::tLDRBi : 0; 7408 if (IsThumb2) 7409 return LdSize == 4 ? ARM::t2LDR_POST 7410 : LdSize == 2 ? ARM::t2LDRH_POST 7411 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 7412 return LdSize == 4 ? ARM::LDR_POST_IMM 7413 : LdSize == 2 ? ARM::LDRH_POST 7414 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 7415 } 7416 7417 /// Return the store opcode for a given store size. If store size >= 8, 7418 /// neon opcode will be returned. 7419 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 7420 if (StSize >= 8) 7421 return StSize == 16 ? ARM::VST1q32wb_fixed 7422 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 7423 if (IsThumb1) 7424 return StSize == 4 ? ARM::tSTRi 7425 : StSize == 2 ? ARM::tSTRHi 7426 : StSize == 1 ? ARM::tSTRBi : 0; 7427 if (IsThumb2) 7428 return StSize == 4 ? ARM::t2STR_POST 7429 : StSize == 2 ? ARM::t2STRH_POST 7430 : StSize == 1 ? ARM::t2STRB_POST : 0; 7431 return StSize == 4 ? ARM::STR_POST_IMM 7432 : StSize == 2 ? ARM::STRH_POST 7433 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 7434 } 7435 7436 /// Emit a post-increment load operation with given size. The instructions 7437 /// will be added to BB at Pos. 
7438 static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
7439 const TargetInstrInfo *TII, DebugLoc dl,
7440 unsigned LdSize, unsigned Data, unsigned AddrIn,
7441 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
7442 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
7443 assert(LdOpc != 0 && "Should have a load opcode");
7444 if (LdSize >= 8) {
7445 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7446 .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7447 .addImm(0));
7448 } else if (IsThumb1) {
7449 // load + update AddrIn
7450 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7451 .addReg(AddrIn).addImm(0));
7452 MachineInstrBuilder MIB =
7453 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
7454 MIB = AddDefaultT1CC(MIB);
7455 MIB.addReg(AddrIn).addImm(LdSize);
7456 AddDefaultPred(MIB);
7457 } else if (IsThumb2) {
7458 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7459 .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7460 .addImm(LdSize));
7461 } else { // arm
7462 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
7463 .addReg(AddrOut, RegState::Define).addReg(AddrIn)
7464 .addReg(0).addImm(LdSize));
7465 }
7466 }
7467
7468 /// Emit a post-increment store operation with given size. The instructions
7469 /// will be added to BB at Pos.
7470 static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
7471 const TargetInstrInfo *TII, DebugLoc dl,
7472 unsigned StSize, unsigned Data, unsigned AddrIn,
7473 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
7474 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
7475 assert(StOpc != 0 && "Should have a store opcode");
7476 if (StSize >= 8) {
7477 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7478 .addReg(AddrIn).addImm(0).addReg(Data));
7479 } else if (IsThumb1) {
7480 // store + update AddrIn
7481 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
7482 .addReg(AddrIn).addImm(0));
7483 MachineInstrBuilder MIB =
7484 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
7485 MIB = AddDefaultT1CC(MIB);
7486 MIB.addReg(AddrIn).addImm(StSize);
7487 AddDefaultPred(MIB);
7488 } else if (IsThumb2) {
7489 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7490 .addReg(Data).addReg(AddrIn).addImm(StSize));
7491 } else { // arm
7492 AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
7493 .addReg(Data).addReg(AddrIn).addReg(0)
7494 .addImm(StSize));
7495 }
7496 }
7497
7498 MachineBasicBlock *
7499 ARMTargetLowering::EmitStructByval(MachineInstr *MI,
7500 MachineBasicBlock *BB) const {
7501 // This pseudo instruction has 4 operands: dst, src, size, alignment.
7502 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
7503 // Otherwise, we will generate unrolled scalar copies.
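// For example, a 7-byte copy with 4-byte alignment yields UnitSize = 4,
// LoopSize = 4, and BytesLeft = 3: one word-sized load/store pair followed
// by three byte-sized copies.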
7504 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7505 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7506 MachineFunction::iterator It = ++BB->getIterator(); 7507 7508 unsigned dest = MI->getOperand(0).getReg(); 7509 unsigned src = MI->getOperand(1).getReg(); 7510 unsigned SizeVal = MI->getOperand(2).getImm(); 7511 unsigned Align = MI->getOperand(3).getImm(); 7512 DebugLoc dl = MI->getDebugLoc(); 7513 7514 MachineFunction *MF = BB->getParent(); 7515 MachineRegisterInfo &MRI = MF->getRegInfo(); 7516 unsigned UnitSize = 0; 7517 const TargetRegisterClass *TRC = nullptr; 7518 const TargetRegisterClass *VecTRC = nullptr; 7519 7520 bool IsThumb1 = Subtarget->isThumb1Only(); 7521 bool IsThumb2 = Subtarget->isThumb2(); 7522 7523 if (Align & 1) { 7524 UnitSize = 1; 7525 } else if (Align & 2) { 7526 UnitSize = 2; 7527 } else { 7528 // Check whether we can use NEON instructions. 7529 if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && 7530 Subtarget->hasNEON()) { 7531 if ((Align % 16 == 0) && SizeVal >= 16) 7532 UnitSize = 16; 7533 else if ((Align % 8 == 0) && SizeVal >= 8) 7534 UnitSize = 8; 7535 } 7536 // Can't use NEON instructions. 7537 if (UnitSize == 0) 7538 UnitSize = 4; 7539 } 7540 7541 // Select the correct opcode and register class for unit size load/store 7542 bool IsNeon = UnitSize >= 8; 7543 TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 7544 if (IsNeon) 7545 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 7546 : UnitSize == 8 ? &ARM::DPRRegClass 7547 : nullptr; 7548 7549 unsigned BytesLeft = SizeVal % UnitSize; 7550 unsigned LoopSize = SizeVal - BytesLeft; 7551 7552 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 7553 // Use LDR and STR to copy. 7554 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 7555 // [destOut] = STR_POST(scratch, destIn, UnitSize) 7556 unsigned srcIn = src; 7557 unsigned destIn = dest; 7558 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 7559 unsigned srcOut = MRI.createVirtualRegister(TRC); 7560 unsigned destOut = MRI.createVirtualRegister(TRC); 7561 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 7562 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 7563 IsThumb1, IsThumb2); 7564 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 7565 IsThumb1, IsThumb2); 7566 srcIn = srcOut; 7567 destIn = destOut; 7568 } 7569 7570 // Handle the leftover bytes with LDRB and STRB. 7571 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 7572 // [destOut] = STRB_POST(scratch, destIn, 1) 7573 for (unsigned i = 0; i < BytesLeft; i++) { 7574 unsigned srcOut = MRI.createVirtualRegister(TRC); 7575 unsigned destOut = MRI.createVirtualRegister(TRC); 7576 unsigned scratch = MRI.createVirtualRegister(TRC); 7577 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 7578 IsThumb1, IsThumb2); 7579 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 7580 IsThumb1, IsThumb2); 7581 srcIn = srcOut; 7582 destIn = destOut; 7583 } 7584 MI->eraseFromParent(); // The instruction is gone now. 7585 return BB; 7586 } 7587 7588 // Expand the pseudo op to a loop. 7589 // thisMBB: 7590 // ... 
7591 // movw varEnd, # --> with thumb2 7592 // movt varEnd, # 7593 // ldrcp varEnd, idx --> without thumb2 7594 // fallthrough --> loopMBB 7595 // loopMBB: 7596 // PHI varPhi, varEnd, varLoop 7597 // PHI srcPhi, src, srcLoop 7598 // PHI destPhi, dst, destLoop 7599 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 7600 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 7601 // subs varLoop, varPhi, #UnitSize 7602 // bne loopMBB 7603 // fallthrough --> exitMBB 7604 // exitMBB: 7605 // epilogue to handle left-over bytes 7606 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 7607 // [destOut] = STRB_POST(scratch, destLoop, 1) 7608 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 7609 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 7610 MF->insert(It, loopMBB); 7611 MF->insert(It, exitMBB); 7612 7613 // Transfer the remainder of BB and its successor edges to exitMBB. 7614 exitMBB->splice(exitMBB->begin(), BB, 7615 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 7616 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 7617 7618 // Load an immediate to varEnd. 7619 unsigned varEnd = MRI.createVirtualRegister(TRC); 7620 if (Subtarget->useMovt(*MF)) { 7621 unsigned Vtmp = varEnd; 7622 if ((LoopSize & 0xFFFF0000) != 0) 7623 Vtmp = MRI.createVirtualRegister(TRC); 7624 AddDefaultPred(BuildMI(BB, dl, 7625 TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), 7626 Vtmp).addImm(LoopSize & 0xFFFF)); 7627 7628 if ((LoopSize & 0xFFFF0000) != 0) 7629 AddDefaultPred(BuildMI(BB, dl, 7630 TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16), 7631 varEnd) 7632 .addReg(Vtmp) 7633 .addImm(LoopSize >> 16)); 7634 } else { 7635 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7636 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7637 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 7638 7639 // MachineConstantPool wants an explicit alignment. 
7640 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
7641 if (Align == 0)
7642 Align = MF->getDataLayout().getTypeAllocSize(C->getType());
7643 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7644
7645 if (IsThumb1)
7646 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
7647 varEnd, RegState::Define).addConstantPoolIndex(Idx));
7648 else
7649 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
7650 varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
7651 }
7652 BB->addSuccessor(loopMBB);
7653
7654 // Generate the loop body:
7655 // varPhi = PHI(varLoop, varEnd)
7656 // srcPhi = PHI(srcLoop, src)
7657 // destPhi = PHI(destLoop, dst)
7658 MachineBasicBlock *entryBB = BB;
7659 BB = loopMBB;
7660 unsigned varLoop = MRI.createVirtualRegister(TRC);
7661 unsigned varPhi = MRI.createVirtualRegister(TRC);
7662 unsigned srcLoop = MRI.createVirtualRegister(TRC);
7663 unsigned srcPhi = MRI.createVirtualRegister(TRC);
7664 unsigned destLoop = MRI.createVirtualRegister(TRC);
7665 unsigned destPhi = MRI.createVirtualRegister(TRC);
7666
7667 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
7668 .addReg(varLoop).addMBB(loopMBB)
7669 .addReg(varEnd).addMBB(entryBB);
7670 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
7671 .addReg(srcLoop).addMBB(loopMBB)
7672 .addReg(src).addMBB(entryBB);
7673 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
7674 .addReg(destLoop).addMBB(loopMBB)
7675 .addReg(dest).addMBB(entryBB);
7676
7677 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7678 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7679 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
7680 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
7681 IsThumb1, IsThumb2);
7682 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
7683 IsThumb1, IsThumb2);
7684
7685 // Decrement loop variable by UnitSize.
7686 if (IsThumb1) {
7687 MachineInstrBuilder MIB =
7688 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
7689 MIB = AddDefaultT1CC(MIB);
7690 MIB.addReg(varPhi).addImm(UnitSize);
7691 AddDefaultPred(MIB);
7692 } else {
7693 MachineInstrBuilder MIB =
7694 BuildMI(*BB, BB->end(), dl,
7695 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
7696 AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
7697 MIB->getOperand(5).setReg(ARM::CPSR);
7698 MIB->getOperand(5).setIsDef(true);
7699 }
7700 BuildMI(*BB, BB->end(), dl,
7701 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
7702 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
7703
7704 // loopMBB can loop back to loopMBB or fall through to exitMBB.
7705 BB->addSuccessor(loopMBB);
7706 BB->addSuccessor(exitMBB);
7707
7708 // Add epilogue to handle BytesLeft.
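// At most UnitSize - 1 bytes remain here, since BytesLeft = SizeVal % UnitSize.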
7709 BB = exitMBB;
7710 MachineInstr *StartOfExit = exitMBB->begin();
7711
7712 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7713 // [destOut] = STRB_POST(scratch, destLoop, 1)
7714 unsigned srcIn = srcLoop;
7715 unsigned destIn = destLoop;
7716 for (unsigned i = 0; i < BytesLeft; i++) {
7717 unsigned srcOut = MRI.createVirtualRegister(TRC);
7718 unsigned destOut = MRI.createVirtualRegister(TRC);
7719 unsigned scratch = MRI.createVirtualRegister(TRC);
7720 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
7721 IsThumb1, IsThumb2);
7722 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
7723 IsThumb1, IsThumb2);
7724 srcIn = srcOut;
7725 destIn = destOut;
7726 }
7727
7728 MI->eraseFromParent(); // The instruction is gone now.
7729 return BB;
7730 }
7731
7732 MachineBasicBlock *
7733 ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
7734 MachineBasicBlock *MBB) const {
7735 const TargetMachine &TM = getTargetMachine();
7736 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
7737 DebugLoc DL = MI->getDebugLoc();
7738
7739 assert(Subtarget->isTargetWindows() &&
7740 "__chkstk is only supported on Windows");
7741 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
7742
7743 // __chkstk takes the number of words to allocate on the stack in R4, and
7744 // returns the stack adjustment in number of bytes in R4. This will not
7745 // clobber any other registers (other than the obvious lr).
7746 //
7747 // Although, technically, IP should be considered a register which may be
7748 // clobbered, the call itself will not touch it. Windows on ARM is a pure
7749 // Thumb-2 environment, so there is no interworking required. As a result, we
7750 // do not expect a veneer to be emitted by the linker, clobbering IP.
7751 //
7752 // Each module receives its own copy of __chkstk, so no import thunk is
7753 // required, again, ensuring that IP is not clobbered.
7754 //
7755 // Finally, although some linkers may theoretically provide a trampoline for
7756 // out of range calls (which is quite common due to a 32M range limitation of
7757 // branches for Thumb), we can generate the long-call version via
7758 // -mcmodel=large, alleviating the need for the trampoline which may clobber
7759 // IP.
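// For the small/medium/default/kernel code models a direct BL to __chkstk
// suffices; for the large/JIT code models the address is first materialized
// with t2MOVi32imm and the call is made through BLX.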
7760 7761 switch (TM.getCodeModel()) { 7762 case CodeModel::Small: 7763 case CodeModel::Medium: 7764 case CodeModel::Default: 7765 case CodeModel::Kernel: 7766 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 7767 .addImm((unsigned)ARMCC::AL).addReg(0) 7768 .addExternalSymbol("__chkstk") 7769 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 7770 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 7771 .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); 7772 break; 7773 case CodeModel::Large: 7774 case CodeModel::JITDefault: { 7775 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 7776 unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 7777 7778 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 7779 .addExternalSymbol("__chkstk"); 7780 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 7781 .addImm((unsigned)ARMCC::AL).addReg(0) 7782 .addReg(Reg, RegState::Kill) 7783 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 7784 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 7785 .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead); 7786 break; 7787 } 7788 } 7789 7790 AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), 7791 ARM::SP) 7792 .addReg(ARM::SP).addReg(ARM::R4))); 7793 7794 MI->eraseFromParent(); 7795 return MBB; 7796 } 7797 7798 MachineBasicBlock * 7799 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, 7800 MachineBasicBlock *MBB) const { 7801 DebugLoc DL = MI->getDebugLoc(); 7802 MachineFunction *MF = MBB->getParent(); 7803 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7804 7805 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 7806 MF->push_back(ContBB); 7807 ContBB->splice(ContBB->begin(), MBB, 7808 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 7809 MBB->addSuccessor(ContBB); 7810 7811 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 7812 MF->push_back(TrapBB); 7813 BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); 7814 MBB->addSuccessor(TrapBB); 7815 7816 BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) 7817 .addReg(MI->getOperand(0).getReg()) 7818 .addMBB(TrapBB); 7819 7820 MI->eraseFromParent(); 7821 return ContBB; 7822 } 7823 7824 MachineBasicBlock * 7825 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7826 MachineBasicBlock *BB) const { 7827 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7828 DebugLoc dl = MI->getDebugLoc(); 7829 bool isThumb2 = Subtarget->isThumb2(); 7830 switch (MI->getOpcode()) { 7831 default: { 7832 MI->dump(); 7833 llvm_unreachable("Unexpected instr type to insert"); 7834 } 7835 // The Thumb2 pre-indexed stores have the same MI operands, they just 7836 // define them differently in the .td files from the isel patterns, so 7837 // they need pseudos. 7838 case ARM::t2STR_preidx: 7839 MI->setDesc(TII->get(ARM::t2STR_PRE)); 7840 return BB; 7841 case ARM::t2STRB_preidx: 7842 MI->setDesc(TII->get(ARM::t2STRB_PRE)); 7843 return BB; 7844 case ARM::t2STRH_preidx: 7845 MI->setDesc(TII->get(ARM::t2STRH_PRE)); 7846 return BB; 7847 7848 case ARM::STRi_preidx: 7849 case ARM::STRBi_preidx: { 7850 unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? 7851 ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; 7852 // Decode the offset. 
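// The AM2 offset operand packs the add/sub flag together with the immediate;
// extract both and negate the immediate when the addressing mode subtracts.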
7853 unsigned Offset = MI->getOperand(4).getImm(); 7854 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 7855 Offset = ARM_AM::getAM2Offset(Offset); 7856 if (isSub) 7857 Offset = -Offset; 7858 7859 MachineMemOperand *MMO = *MI->memoperands_begin(); 7860 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 7861 .addOperand(MI->getOperand(0)) // Rn_wb 7862 .addOperand(MI->getOperand(1)) // Rt 7863 .addOperand(MI->getOperand(2)) // Rn 7864 .addImm(Offset) // offset (skip GPR==zero_reg) 7865 .addOperand(MI->getOperand(5)) // pred 7866 .addOperand(MI->getOperand(6)) 7867 .addMemOperand(MMO); 7868 MI->eraseFromParent(); 7869 return BB; 7870 } 7871 case ARM::STRr_preidx: 7872 case ARM::STRBr_preidx: 7873 case ARM::STRH_preidx: { 7874 unsigned NewOpc; 7875 switch (MI->getOpcode()) { 7876 default: llvm_unreachable("unexpected opcode!"); 7877 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 7878 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 7879 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 7880 } 7881 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 7882 for (unsigned i = 0; i < MI->getNumOperands(); ++i) 7883 MIB.addOperand(MI->getOperand(i)); 7884 MI->eraseFromParent(); 7885 return BB; 7886 } 7887 7888 case ARM::tMOVCCr_pseudo: { 7889 // To "insert" a SELECT_CC instruction, we actually have to insert the 7890 // diamond control-flow pattern. The incoming instruction knows the 7891 // destination vreg to set, the condition code register to branch on, the 7892 // true/false values to select between, and a branch opcode to use. 7893 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7894 MachineFunction::iterator It = ++BB->getIterator(); 7895 7896 // thisMBB: 7897 // ... 7898 // TrueVal = ... 7899 // cmpTY ccX, r1, r2 7900 // bCC copy1MBB 7901 // fallthrough --> copy0MBB 7902 MachineBasicBlock *thisMBB = BB; 7903 MachineFunction *F = BB->getParent(); 7904 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7905 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7906 F->insert(It, copy0MBB); 7907 F->insert(It, sinkMBB); 7908 7909 // Transfer the remainder of BB and its successor edges to sinkMBB. 7910 sinkMBB->splice(sinkMBB->begin(), BB, 7911 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 7912 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 7913 7914 BB->addSuccessor(copy0MBB); 7915 BB->addSuccessor(sinkMBB); 7916 7917 BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) 7918 .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); 7919 7920 // copy0MBB: 7921 // %FalseValue = ... 7922 // # fallthrough to sinkMBB 7923 BB = copy0MBB; 7924 7925 // Update machine-CFG edges 7926 BB->addSuccessor(sinkMBB); 7927 7928 // sinkMBB: 7929 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7930 // ... 7931 BB = sinkMBB; 7932 BuildMI(*BB, BB->begin(), dl, 7933 TII->get(ARM::PHI), MI->getOperand(0).getReg()) 7934 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7935 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7936 7937 MI->eraseFromParent(); // The pseudo instruction is gone now. 7938 return BB; 7939 } 7940 7941 case ARM::BCCi64: 7942 case ARM::BCCZi64: { 7943 // If there is an unconditional branch to the other successor, remove it. 7944 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 7945 7946 // Compare both parts that make up the double comparison separately for 7947 // equality. 
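// The second CMP is predicated on EQ from the first, so CPSR ends up
// reflecting equality of the full 64-bit value only when both halves
// compare equal.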
7948 bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; 7949 7950 unsigned LHS1 = MI->getOperand(1).getReg(); 7951 unsigned LHS2 = MI->getOperand(2).getReg(); 7952 if (RHSisZero) { 7953 AddDefaultPred(BuildMI(BB, dl, 7954 TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 7955 .addReg(LHS1).addImm(0)); 7956 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 7957 .addReg(LHS2).addImm(0) 7958 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 7959 } else { 7960 unsigned RHS1 = MI->getOperand(3).getReg(); 7961 unsigned RHS2 = MI->getOperand(4).getReg(); 7962 AddDefaultPred(BuildMI(BB, dl, 7963 TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 7964 .addReg(LHS1).addReg(RHS1)); 7965 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 7966 .addReg(LHS2).addReg(RHS2) 7967 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 7968 } 7969 7970 MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); 7971 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 7972 if (MI->getOperand(0).getImm() == ARMCC::NE) 7973 std::swap(destMBB, exitMBB); 7974 7975 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 7976 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 7977 if (isThumb2) 7978 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); 7979 else 7980 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 7981 7982 MI->eraseFromParent(); // The pseudo instruction is gone now. 7983 return BB; 7984 } 7985 7986 case ARM::Int_eh_sjlj_setjmp: 7987 case ARM::Int_eh_sjlj_setjmp_nofp: 7988 case ARM::tInt_eh_sjlj_setjmp: 7989 case ARM::t2Int_eh_sjlj_setjmp: 7990 case ARM::t2Int_eh_sjlj_setjmp_nofp: 7991 return BB; 7992 7993 case ARM::Int_eh_sjlj_setup_dispatch: 7994 EmitSjLjDispatchBlock(MI, BB); 7995 return BB; 7996 7997 case ARM::ABS: 7998 case ARM::t2ABS: { 7999 // To insert an ABS instruction, we have to insert the 8000 // diamond control-flow pattern. The incoming instruction knows the 8001 // source vreg to test against 0, the destination vreg to set, 8002 // the condition code register to branch on, the 8003 // true/false values to select between, and a branch opcode to use. 8004 // It transforms 8005 // V1 = ABS V0 8006 // into 8007 // V2 = MOVS V0 8008 // BCC (branch to SinkBB if V0 >= 0) 8009 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 8010 // SinkBB: V1 = PHI(V2, V3) 8011 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8012 MachineFunction::iterator BBI = ++BB->getIterator(); 8013 MachineFunction *Fn = BB->getParent(); 8014 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8015 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8016 Fn->insert(BBI, RSBBB); 8017 Fn->insert(BBI, SinkBB); 8018 8019 unsigned int ABSSrcReg = MI->getOperand(1).getReg(); 8020 unsigned int ABSDstReg = MI->getOperand(0).getReg(); 8021 bool ABSSrcKIll = MI->getOperand(1).isKill(); 8022 bool isThumb2 = Subtarget->isThumb2(); 8023 MachineRegisterInfo &MRI = Fn->getRegInfo(); 8024 // In Thumb mode S must not be specified if source register is the SP or 8025 // PC and if destination register is the SP, so restrict register class 8026 unsigned NewRsbDstReg = 8027 MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 8028 8029 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8030 SinkBB->splice(SinkBB->begin(), BB, 8031 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8032 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 8033 8034 BB->addSuccessor(RSBBB); 8035 BB->addSuccessor(SinkBB); 8036 8037 // fall through to SinkMBB 8038 RSBBB->addSuccessor(SinkBB); 8039 8040 // insert a cmp at the end of BB 8041 AddDefaultPred(BuildMI(BB, dl, 8042 TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8043 .addReg(ABSSrcReg).addImm(0)); 8044 8045 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 8046 BuildMI(BB, dl, 8047 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 8048 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 8049 8050 // insert rsbri in RSBBB 8051 // Note: BCC and rsbri will be converted into predicated rsbmi 8052 // by if-conversion pass 8053 BuildMI(*RSBBB, RSBBB->begin(), dl, 8054 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 8055 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 8056 .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); 8057 8058 // insert PHI in SinkBB, 8059 // reuse ABSDstReg to not change uses of ABS instruction 8060 BuildMI(*SinkBB, SinkBB->begin(), dl, 8061 TII->get(ARM::PHI), ABSDstReg) 8062 .addReg(NewRsbDstReg).addMBB(RSBBB) 8063 .addReg(ABSSrcReg).addMBB(BB); 8064 8065 // remove ABS instruction 8066 MI->eraseFromParent(); 8067 8068 // return last added BB 8069 return SinkBB; 8070 } 8071 case ARM::COPY_STRUCT_BYVAL_I32: 8072 ++NumLoopByVals; 8073 return EmitStructByval(MI, BB); 8074 case ARM::WIN__CHKSTK: 8075 return EmitLowered__chkstk(MI, BB); 8076 case ARM::WIN__DBZCHK: 8077 return EmitLowered__dbzchk(MI, BB); 8078 } 8079 } 8080 8081 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers 8082 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 8083 /// instead of as a custom inserter because we need the use list from the SDNode. 8084 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 8085 MachineInstr *MI, const SDNode *Node) { 8086 bool isThumb1 = Subtarget->isThumb1Only(); 8087 8088 DebugLoc DL = MI->getDebugLoc(); 8089 MachineFunction *MF = MI->getParent()->getParent(); 8090 MachineRegisterInfo &MRI = MF->getRegInfo(); 8091 MachineInstrBuilder MIB(*MF, MI); 8092 8093 // If the new dst/src is unused mark it as dead. 8094 if (!Node->hasAnyUseOfValue(0)) { 8095 MI->getOperand(0).setIsDead(true); 8096 } 8097 if (!Node->hasAnyUseOfValue(1)) { 8098 MI->getOperand(1).setIsDead(true); 8099 } 8100 8101 // The MEMCPY both defines and kills the scratch registers. 8102 for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { 8103 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 8104 : &ARM::GPRRegClass); 8105 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 8106 } 8107 } 8108 8109 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 8110 SDNode *Node) const { 8111 if (MI->getOpcode() == ARM::MEMCPY) { 8112 attachMEMCPYScratchRegs(Subtarget, MI, Node); 8113 return; 8114 } 8115 8116 const MCInstrDesc *MCID = &MI->getDesc(); 8117 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 8118 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 8119 // operand is still set to noreg. If needed, set the optional operand's 8120 // register to CPSR, and remove the redundant implicit def. 8121 // 8122 // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>). 8123 8124 // Rename pseudo opcodes. 
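// convertAddSubFlagsOpcode returns 0 when this is not one of those
// flag-setting pseudos, in which case the opcode is left alone and only the
// cc_out handling below applies.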
8125 unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); 8126 if (NewOpc) { 8127 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 8128 MCID = &TII->get(NewOpc); 8129 8130 assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && 8131 "converted opcode should be the same except for cc_out"); 8132 8133 MI->setDesc(*MCID); 8134 8135 // Add the optional cc_out operand 8136 MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 8137 } 8138 unsigned ccOutIdx = MCID->getNumOperands() - 1; 8139 8140 // Any ARM instruction that sets the 's' bit should specify an optional 8141 // "cc_out" operand in the last operand position. 8142 if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 8143 assert(!NewOpc && "Optional cc_out operand required"); 8144 return; 8145 } 8146 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 8147 // since we already have an optional CPSR def. 8148 bool definesCPSR = false; 8149 bool deadCPSR = false; 8150 for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands(); 8151 i != e; ++i) { 8152 const MachineOperand &MO = MI->getOperand(i); 8153 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 8154 definesCPSR = true; 8155 if (MO.isDead()) 8156 deadCPSR = true; 8157 MI->RemoveOperand(i); 8158 break; 8159 } 8160 } 8161 if (!definesCPSR) { 8162 assert(!NewOpc && "Optional cc_out operand required"); 8163 return; 8164 } 8165 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 8166 if (deadCPSR) { 8167 assert(!MI->getOperand(ccOutIdx).getReg() && 8168 "expect uninitialized optional cc_out operand"); 8169 return; 8170 } 8171 8172 // If this instruction was defined with an optional CPSR def and its dag node 8173 // had a live implicit CPSR def, then activate the optional CPSR def. 8174 MachineOperand &MO = MI->getOperand(ccOutIdx); 8175 MO.setReg(ARM::CPSR); 8176 MO.setIsDef(true); 8177 } 8178 8179 //===----------------------------------------------------------------------===// 8180 // ARM Optimization Hooks 8181 //===----------------------------------------------------------------------===// 8182 8183 // Helper function that checks if N is a null or all ones constant. 8184 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 8185 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); 8186 if (!C) 8187 return false; 8188 return AllOnes ? C->isAllOnesValue() : C->isNullValue(); 8189 } 8190 8191 // Return true if N is conditionally 0 or all ones. 8192 // Detects these expressions where cc is an i1 value: 8193 // 8194 // (select cc 0, y) [AllOnes=0] 8195 // (select cc y, 0) [AllOnes=0] 8196 // (zext cc) [AllOnes=0] 8197 // (sext cc) [AllOnes=0/1] 8198 // (select cc -1, y) [AllOnes=1] 8199 // (select cc y, -1) [AllOnes=1] 8200 // 8201 // Invert is set when N is the null/all ones constant when CC is false. 8202 // OtherOp is set to the alternative value of N. 
8203 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
8204 SDValue &CC, bool &Invert,
8205 SDValue &OtherOp,
8206 SelectionDAG &DAG) {
8207 switch (N->getOpcode()) {
8208 default: return false;
8209 case ISD::SELECT: {
8210 CC = N->getOperand(0);
8211 SDValue N1 = N->getOperand(1);
8212 SDValue N2 = N->getOperand(2);
8213 if (isZeroOrAllOnes(N1, AllOnes)) {
8214 Invert = false;
8215 OtherOp = N2;
8216 return true;
8217 }
8218 if (isZeroOrAllOnes(N2, AllOnes)) {
8219 Invert = true;
8220 OtherOp = N1;
8221 return true;
8222 }
8223 return false;
8224 }
8225 case ISD::ZERO_EXTEND:
8226 // (zext cc) can never be the all ones value.
8227 if (AllOnes)
8228 return false;
8229 // Fall through.
8230 case ISD::SIGN_EXTEND: {
8231 SDLoc dl(N);
8232 EVT VT = N->getValueType(0);
8233 CC = N->getOperand(0);
8234 if (CC.getValueType() != MVT::i1)
8235 return false;
8236 Invert = !AllOnes;
8237 if (AllOnes)
8238 // When looking for an AllOnes constant, N is an sext, and the 'other'
8239 // value is 0.
8240 OtherOp = DAG.getConstant(0, dl, VT);
8241 else if (N->getOpcode() == ISD::ZERO_EXTEND)
8242 // When looking for a 0 constant, N can be zext or sext.
8243 OtherOp = DAG.getConstant(1, dl, VT);
8244 else
8245 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
8246 VT);
8247 return true;
8248 }
8249 }
8250 }
8251
8252 // Combine a constant select operand into its use:
8253 //
8254 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
8255 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
8256 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
8257 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
8258 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
8259 //
8260 // The transform is rejected if the select doesn't have a constant operand that
8261 // is null, or all ones when AllOnes is set.
8262 //
8263 // Also recognize sext/zext from i1:
8264 //
8265 // (add (zext cc), x) -> (select cc (add x, 1), x)
8266 // (add (sext cc), x) -> (select cc (add x, -1), x)
8267 //
8268 // These transformations eventually create predicated instructions.
8269 //
8270 // @param N The node to transform.
8271 // @param Slct The N operand that is a select.
8272 // @param OtherOp The other N operand (x above).
8273 // @param DCI Context.
8274 // @param AllOnes Require the select constant to be all ones instead of null.
8275 // @returns The new node, or SDValue() on failure.
8276 static
8277 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
8278 TargetLowering::DAGCombinerInfo &DCI,
8279 bool AllOnes = false) {
8280 SelectionDAG &DAG = DCI.DAG;
8281 EVT VT = N->getValueType(0);
8282 SDValue NonConstantVal;
8283 SDValue CCOp;
8284 bool SwapSelectOps;
8285 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
8286 NonConstantVal, DAG))
8287 return SDValue();
8288
8289 // Slct is now known to be the desired identity constant when CC is true.
8290 SDValue TrueVal = OtherOp;
8291 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
8292 OtherOp, NonConstantVal);
8293 // Unless SwapSelectOps says CC should be false.
8294 if (SwapSelectOps)
8295 std::swap(TrueVal, FalseVal);
8296
8297 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
8298 CCOp, TrueVal, FalseVal);
8299 }
8300
8301 // Attempt combineSelectAndUse on each operand of a commutative operator N.
8302 static
8303 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
8304 TargetLowering::DAGCombinerInfo &DCI) {
8305 SDValue N0 = N->getOperand(0);
8306 SDValue N1 = N->getOperand(1);
8307 if (N0.getNode()->hasOneUse()) {
8308 SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
8309 if (Result.getNode())
8310 return Result;
8311 }
8312 if (N1.getNode()->hasOneUse()) {
8313 SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
8314 if (Result.getNode())
8315 return Result;
8316 }
8317 return SDValue();
8318 }
8319
8320 // AddCombineToVPADDL - For pair-wise add on NEON, use the vpaddl instruction
8321 // (only after legalization).
8322 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
8323 TargetLowering::DAGCombinerInfo &DCI,
8324 const ARMSubtarget *Subtarget) {
8325
8326 // Only perform optimization if after legalize, and if NEON is available. We
8327 // also expect both operands to be BUILD_VECTORs.
8328 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
8329 || N0.getOpcode() != ISD::BUILD_VECTOR
8330 || N1.getOpcode() != ISD::BUILD_VECTOR)
8331 return SDValue();
8332
8333 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
8334 EVT VT = N->getValueType(0);
8335 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
8336 return SDValue();
8337
8338 // Check that the vector operands are of the right form.
8339 // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR
8340 // operands, where N is the size of the formed vector.
8341 // Each EXTRACT_VECTOR should have the same input vector and odd or even
8342 // index such that we have a pair-wise add pattern.
8343
8344 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
8345 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8346 return SDValue();
8347 SDValue Vec = N0->getOperand(0)->getOperand(0);
8348 SDNode *V = Vec.getNode();
8349 unsigned nextIndex = 0;
8350
8351 // For each of the operands to the ADD which are BUILD_VECTORs,
8352 // check to see if each of their operands is an EXTRACT_VECTOR with
8353 // the same vector and appropriate index.
8354 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
8355 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
8356 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8357
8358 SDValue ExtVec0 = N0->getOperand(i);
8359 SDValue ExtVec1 = N1->getOperand(i);
8360
8361 // First operand is the vector, verify it's the same.
8362 if (V != ExtVec0->getOperand(0).getNode() ||
8363 V != ExtVec1->getOperand(0).getNode())
8364 return SDValue();
8365
8366 // Second is the constant, verify it's correct.
8367 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
8368 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
8369
8370 // For the constant, we want to see all the even or all the odd.
8371 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
8372 || C1->getZExtValue() != nextIndex+1)
8373 return SDValue();
8374
8375 // Increment index.
8376 nextIndex+=2;
8377 } else
8378 return SDValue();
8379 }
8380
8381 // Create VPADDL node.
8382 SelectionDAG &DAG = DCI.DAG;
8383 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8384
8385 SDLoc dl(N);
8386
8387 // Build operand list.
8388 SmallVector<SDValue, 8> Ops;
8389 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
8390 TLI.getPointerTy(DAG.getDataLayout())));
8391
8392 // Input is the vector.
8393 Ops.push_back(Vec);
8394
8395 // Get widened type and narrowed type.
8396 MVT widenType;
8397 unsigned numElem = VT.getVectorNumElements();
8398
8399 EVT inputLaneType = Vec.getValueType().getVectorElementType();
8400 switch (inputLaneType.getSimpleVT().SimpleTy) {
8401 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
8402 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
8403 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
8404 default:
8405 llvm_unreachable("Invalid vector element type for padd optimization.");
8406 }
8407
8408 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
8409 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
8410 return DAG.getNode(ExtOp, dl, VT, tmp);
8411 }
8412
8413 static SDValue findMUL_LOHI(SDValue V) {
8414 if (V->getOpcode() == ISD::UMUL_LOHI ||
8415 V->getOpcode() == ISD::SMUL_LOHI)
8416 return V;
8417 return SDValue();
8418 }
8419
8420 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
8421 TargetLowering::DAGCombinerInfo &DCI,
8422 const ARMSubtarget *Subtarget) {
8423
8424 if (Subtarget->isThumb1Only()) return SDValue();
8425
8426 // Only perform the checks after legalize when the pattern is available.
8427 if (DCI.isBeforeLegalize()) return SDValue();
8428
8429 // Look for multiply add opportunities.
8430 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
8431 // each add node consumes a value from ISD::UMUL_LOHI and there is
8432 // a glue link from the first add to the second add.
8433 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
8434 // an S/UMLAL instruction.
8435 //              UMUL_LOHI
8436 //             / :lo    \ :hi
8437 //            /          \          [no multiline comment]
8438 //  loAdd -> ADDE          |
8439 //             \ :glue    /
8440 //              \        /
8441 //               ADDC   <- hiAdd
8442 //
8443 assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
8444 SDValue AddcOp0 = AddcNode->getOperand(0);
8445 SDValue AddcOp1 = AddcNode->getOperand(1);
8446
8447 // Check if the two operands are from the same mul_lohi node.
8448 if (AddcOp0.getNode() == AddcOp1.getNode())
8449 return SDValue();
8450
8451 assert(AddcNode->getNumValues() == 2 &&
8452 AddcNode->getValueType(0) == MVT::i32 &&
8453 "Expect ADDC with two result values. First: i32");
8454
8455 // Check that we have a glued ADDC node.
8456 if (AddcNode->getValueType(1) != MVT::Glue)
8457 return SDValue();
8458
8459 // Check that the ADDC adds the low result of the S/UMUL_LOHI.
8460 if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
8461 AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
8462 AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
8463 AddcOp1->getOpcode() != ISD::SMUL_LOHI)
8464 return SDValue();
8465
8466 // Look for the glued ADDE.
8467 SDNode* AddeNode = AddcNode->getGluedUser();
8468 if (!AddeNode)
8469 return SDValue();
8470
8471 // Make sure it is really an ADDE.
8472 if (AddeNode->getOpcode() != ISD::ADDE)
8473 return SDValue();
8474
8475 assert(AddeNode->getNumOperands() == 3 &&
8476 AddeNode->getOperand(2).getValueType() == MVT::Glue &&
8477 "ADDE node has the wrong inputs");
8478
8479 // Check for the triangle shape.
8480 SDValue AddeOp0 = AddeNode->getOperand(0);
8481 SDValue AddeOp1 = AddeNode->getOperand(1);
8482
8483 // Make sure that the ADDE operands are not coming from the same node.
8484 if (AddeOp0.getNode() == AddeOp1.getNode())
8485 return SDValue();
8486
8487 // Find the MUL_LOHI node walking up ADDE's operands.
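// Exactly one ADDE operand is expected to be the high half of the multiply;
// the other operand becomes HiAdd.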
8488 bool IsLeftOperandMUL = false; 8489 SDValue MULOp = findMUL_LOHI(AddeOp0); 8490 if (MULOp == SDValue()) 8491 MULOp = findMUL_LOHI(AddeOp1); 8492 else 8493 IsLeftOperandMUL = true; 8494 if (MULOp == SDValue()) 8495 return SDValue(); 8496 8497 // Figure out the right opcode. 8498 unsigned Opc = MULOp->getOpcode(); 8499 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 8500 8501 // Figure out the high and low input values to the MLAL node. 8502 SDValue* HiAdd = nullptr; 8503 SDValue* LoMul = nullptr; 8504 SDValue* LowAdd = nullptr; 8505 8506 // Ensure that ADDE is from high result of ISD::SMUL_LOHI. 8507 if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) 8508 return SDValue(); 8509 8510 if (IsLeftOperandMUL) 8511 HiAdd = &AddeOp1; 8512 else 8513 HiAdd = &AddeOp0; 8514 8515 8516 // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node 8517 // whose low result is fed to the ADDC we are checking. 8518 8519 if (AddcOp0 == MULOp.getValue(0)) { 8520 LoMul = &AddcOp0; 8521 LowAdd = &AddcOp1; 8522 } 8523 if (AddcOp1 == MULOp.getValue(0)) { 8524 LoMul = &AddcOp1; 8525 LowAdd = &AddcOp0; 8526 } 8527 8528 if (!LoMul) 8529 return SDValue(); 8530 8531 // Create the merged node. 8532 SelectionDAG &DAG = DCI.DAG; 8533 8534 // Build operand list. 8535 SmallVector<SDValue, 8> Ops; 8536 Ops.push_back(LoMul->getOperand(0)); 8537 Ops.push_back(LoMul->getOperand(1)); 8538 Ops.push_back(*LowAdd); 8539 Ops.push_back(*HiAdd); 8540 8541 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), 8542 DAG.getVTList(MVT::i32, MVT::i32), Ops); 8543 8544 // Replace the ADDs' nodes uses by the MLA node's values. 8545 SDValue HiMLALResult(MLALNode.getNode(), 1); 8546 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 8547 8548 SDValue LoMLALResult(MLALNode.getNode(), 0); 8549 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 8550 8551 // Return original node to notify the driver to stop replacing. 8552 SDValue resNode(AddcNode, 0); 8553 return resNode; 8554 } 8555 8556 /// PerformADDCCombine - Target-specific dag combine transform from 8557 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. 8558 static SDValue PerformADDCCombine(SDNode *N, 8559 TargetLowering::DAGCombinerInfo &DCI, 8560 const ARMSubtarget *Subtarget) { 8561 8562 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 8563 8564 } 8565 8566 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 8567 /// operands N0 and N1. This is a helper for PerformADDCombine that is 8568 /// called with the default operands, and if that fails, with commuted 8569 /// operands. 8570 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 8571 TargetLowering::DAGCombinerInfo &DCI, 8572 const ARMSubtarget *Subtarget){ 8573 8574 // Attempt to create vpaddl for this add. 8575 SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget); 8576 if (Result.getNode()) 8577 return Result; 8578 8579 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 8580 if (N0.getNode()->hasOneUse()) { 8581 SDValue Result = combineSelectAndUse(N, N0, N1, DCI); 8582 if (Result.getNode()) return Result; 8583 } 8584 return SDValue(); 8585 } 8586 8587 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
8588 /// 8589 static SDValue PerformADDCombine(SDNode *N, 8590 TargetLowering::DAGCombinerInfo &DCI, 8591 const ARMSubtarget *Subtarget) { 8592 SDValue N0 = N->getOperand(0); 8593 SDValue N1 = N->getOperand(1); 8594 8595 // First try with the default operand order. 8596 SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget); 8597 if (Result.getNode()) 8598 return Result; 8599 8600 // If that didn't work, try again with the operands commuted. 8601 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 8602 } 8603 8604 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 8605 /// 8606 static SDValue PerformSUBCombine(SDNode *N, 8607 TargetLowering::DAGCombinerInfo &DCI) { 8608 SDValue N0 = N->getOperand(0); 8609 SDValue N1 = N->getOperand(1); 8610 8611 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 8612 if (N1.getNode()->hasOneUse()) { 8613 SDValue Result = combineSelectAndUse(N, N1, N0, DCI); 8614 if (Result.getNode()) return Result; 8615 } 8616 8617 return SDValue(); 8618 } 8619 8620 /// PerformVMULCombine 8621 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 8622 /// special multiplier accumulator forwarding. 8623 /// vmul d3, d0, d2 8624 /// vmla d3, d1, d2 8625 /// is faster than 8626 /// vadd d3, d0, d1 8627 /// vmul d3, d3, d2 8628 // However, for (A + B) * (A + B), 8629 // vadd d2, d0, d1 8630 // vmul d3, d0, d2 8631 // vmla d3, d1, d2 8632 // is slower than 8633 // vadd d2, d0, d1 8634 // vmul d3, d2, d2 8635 static SDValue PerformVMULCombine(SDNode *N, 8636 TargetLowering::DAGCombinerInfo &DCI, 8637 const ARMSubtarget *Subtarget) { 8638 if (!Subtarget->hasVMLxForwarding()) 8639 return SDValue(); 8640 8641 SelectionDAG &DAG = DCI.DAG; 8642 SDValue N0 = N->getOperand(0); 8643 SDValue N1 = N->getOperand(1); 8644 unsigned Opcode = N0.getOpcode(); 8645 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 8646 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 8647 Opcode = N1.getOpcode(); 8648 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 8649 Opcode != ISD::FADD && Opcode != ISD::FSUB) 8650 return SDValue(); 8651 std::swap(N0, N1); 8652 } 8653 8654 if (N0 == N1) 8655 return SDValue(); 8656 8657 EVT VT = N->getValueType(0); 8658 SDLoc DL(N); 8659 SDValue N00 = N0->getOperand(0); 8660 SDValue N01 = N0->getOperand(1); 8661 return DAG.getNode(Opcode, DL, VT, 8662 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 8663 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 8664 } 8665 8666 static SDValue PerformMULCombine(SDNode *N, 8667 TargetLowering::DAGCombinerInfo &DCI, 8668 const ARMSubtarget *Subtarget) { 8669 SelectionDAG &DAG = DCI.DAG; 8670 8671 if (Subtarget->isThumb1Only()) 8672 return SDValue(); 8673 8674 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 8675 return SDValue(); 8676 8677 EVT VT = N->getValueType(0); 8678 if (VT.is64BitVector() || VT.is128BitVector()) 8679 return PerformVMULCombine(N, DCI, Subtarget); 8680 if (VT != MVT::i32) 8681 return SDValue(); 8682 8683 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 8684 if (!C) 8685 return SDValue(); 8686 8687 int64_t MulAmt = C->getSExtValue(); 8688 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 8689 8690 ShiftAmt = ShiftAmt & (32 - 1); 8691 SDValue V = N->getOperand(0); 8692 SDLoc DL(N); 8693 8694 SDValue Res; 8695 MulAmt >>= ShiftAmt; 8696 8697 if (MulAmt >= 0) { 8698 if (isPowerOf2_32(MulAmt - 1)) { 8699 // (mul x, 2^N + 1) => (add (shl x, N), x) 8700 Res = DAG.getNode(ISD::ADD, DL, VT, 8701 V, 8702 DAG.getNode(ISD::SHL, DL, 
VT, 8703 V, 8704 DAG.getConstant(Log2_32(MulAmt - 1), DL, 8705 MVT::i32))); 8706 } else if (isPowerOf2_32(MulAmt + 1)) { 8707 // (mul x, 2^N - 1) => (sub (shl x, N), x) 8708 Res = DAG.getNode(ISD::SUB, DL, VT, 8709 DAG.getNode(ISD::SHL, DL, VT, 8710 V, 8711 DAG.getConstant(Log2_32(MulAmt + 1), DL, 8712 MVT::i32)), 8713 V); 8714 } else 8715 return SDValue(); 8716 } else { 8717 uint64_t MulAmtAbs = -MulAmt; 8718 if (isPowerOf2_32(MulAmtAbs + 1)) { 8719 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 8720 Res = DAG.getNode(ISD::SUB, DL, VT, 8721 V, 8722 DAG.getNode(ISD::SHL, DL, VT, 8723 V, 8724 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 8725 MVT::i32))); 8726 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 8727 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 8728 Res = DAG.getNode(ISD::ADD, DL, VT, 8729 V, 8730 DAG.getNode(ISD::SHL, DL, VT, 8731 V, 8732 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 8733 MVT::i32))); 8734 Res = DAG.getNode(ISD::SUB, DL, VT, 8735 DAG.getConstant(0, DL, MVT::i32), Res); 8736 8737 } else 8738 return SDValue(); 8739 } 8740 8741 if (ShiftAmt != 0) 8742 Res = DAG.getNode(ISD::SHL, DL, VT, 8743 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 8744 8745 // Do not add new nodes to DAG combiner worklist. 8746 DCI.CombineTo(N, Res, false); 8747 return SDValue(); 8748 } 8749 8750 static SDValue PerformANDCombine(SDNode *N, 8751 TargetLowering::DAGCombinerInfo &DCI, 8752 const ARMSubtarget *Subtarget) { 8753 8754 // Attempt to use immediate-form VBIC 8755 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 8756 SDLoc dl(N); 8757 EVT VT = N->getValueType(0); 8758 SelectionDAG &DAG = DCI.DAG; 8759 8760 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 8761 return SDValue(); 8762 8763 APInt SplatBits, SplatUndef; 8764 unsigned SplatBitSize; 8765 bool HasAnyUndefs; 8766 if (BVN && 8767 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 8768 if (SplatBitSize <= 64) { 8769 EVT VbicVT; 8770 SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), 8771 SplatUndef.getZExtValue(), SplatBitSize, 8772 DAG, dl, VbicVT, VT.is128BitVector(), 8773 OtherModImm); 8774 if (Val.getNode()) { 8775 SDValue Input = 8776 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 8777 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 8778 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 8779 } 8780 } 8781 } 8782 8783 if (!Subtarget->isThumb1Only()) { 8784 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 8785 SDValue Result = combineSelectAndUseCommutative(N, true, DCI); 8786 if (Result.getNode()) 8787 return Result; 8788 } 8789 8790 return SDValue(); 8791 } 8792 8793 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 8794 static SDValue PerformORCombine(SDNode *N, 8795 TargetLowering::DAGCombinerInfo &DCI, 8796 const ARMSubtarget *Subtarget) { 8797 // Attempt to use immediate-form VORR 8798 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 8799 SDLoc dl(N); 8800 EVT VT = N->getValueType(0); 8801 SelectionDAG &DAG = DCI.DAG; 8802 8803 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 8804 return SDValue(); 8805 8806 APInt SplatBits, SplatUndef; 8807 unsigned SplatBitSize; 8808 bool HasAnyUndefs; 8809 if (BVN && Subtarget->hasNEON() && 8810 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 8811 if (SplatBitSize <= 64) { 8812 EVT VorrVT; 8813 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 8814 SplatUndef.getZExtValue(), SplatBitSize, 8815 DAG, dl, VorrVT, 
VT.is128BitVector(), 8816 OtherModImm); 8817 if (Val.getNode()) { 8818 SDValue Input = 8819 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 8820 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 8821 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 8822 } 8823 } 8824 } 8825 8826 if (!Subtarget->isThumb1Only()) { 8827 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 8828 SDValue Result = combineSelectAndUseCommutative(N, false, DCI); 8829 if (Result.getNode()) 8830 return Result; 8831 } 8832 8833 // The code below optimizes (or (and X, Y), Z). 8834 // The AND operand needs to have a single user to make these optimizations 8835 // profitable. 8836 SDValue N0 = N->getOperand(0); 8837 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 8838 return SDValue(); 8839 SDValue N1 = N->getOperand(1); 8840 8841 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 8842 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 8843 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 8844 APInt SplatUndef; 8845 unsigned SplatBitSize; 8846 bool HasAnyUndefs; 8847 8848 APInt SplatBits0, SplatBits1; 8849 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 8850 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 8851 // Ensure that the second operand of both ands are constants 8852 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 8853 HasAnyUndefs) && !HasAnyUndefs) { 8854 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 8855 HasAnyUndefs) && !HasAnyUndefs) { 8856 // Ensure that the bit width of the constants are the same and that 8857 // the splat arguments are logical inverses as per the pattern we 8858 // are trying to simplify. 8859 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 8860 SplatBits0 == ~SplatBits1) { 8861 // Canonicalize the vector type to make instruction selection 8862 // simpler. 8863 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 8864 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 8865 N0->getOperand(1), 8866 N0->getOperand(0), 8867 N1->getOperand(0)); 8868 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 8869 } 8870 } 8871 } 8872 } 8873 8874 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 8875 // reasonable. 8876 8877 // BFI is only available on V6T2+ 8878 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 8879 return SDValue(); 8880 8881 SDLoc DL(N); 8882 // 1) or (and A, mask), val => ARMbfi A, val, mask 8883 // iff (val & mask) == val 8884 // 8885 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 8886 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 8887 // && mask == ~mask2 8888 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 8889 // && ~mask == mask2 8890 // (i.e., copy a bitfield value into another bitfield of the same width) 8891 8892 if (VT != MVT::i32) 8893 return SDValue(); 8894 8895 SDValue N00 = N0.getOperand(0); 8896 8897 // The value and the mask need to be constants so we can verify this is 8898 // actually a bitfield set. If the mask is 0xffff, we can do better 8899 // via a movt instruction, so don't use BFI in that case. 
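  // Worked example for case (1): (or (and A, 0xffff00ff), 0x00004200).
  // Mask = 0xffff00ff clears bits [15:8]; Val = 0x00004200 lies entirely in
  // the cleared field, so this becomes (ARMbfi A, 0x42, 0xffff00ff), i.e. a
  // BFI that inserts 0x42 at lsb 8 with width 8.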
8900 SDValue MaskOp = N0.getOperand(1); 8901 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 8902 if (!MaskC) 8903 return SDValue(); 8904 unsigned Mask = MaskC->getZExtValue(); 8905 if (Mask == 0xffff) 8906 return SDValue(); 8907 SDValue Res; 8908 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 8909 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 8910 if (N1C) { 8911 unsigned Val = N1C->getZExtValue(); 8912 if ((Val & ~Mask) != Val) 8913 return SDValue(); 8914 8915 if (ARM::isBitFieldInvertedMask(Mask)) { 8916 Val >>= countTrailingZeros(~Mask); 8917 8918 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 8919 DAG.getConstant(Val, DL, MVT::i32), 8920 DAG.getConstant(Mask, DL, MVT::i32)); 8921 8922 // Do not add new nodes to DAG combiner worklist. 8923 DCI.CombineTo(N, Res, false); 8924 return SDValue(); 8925 } 8926 } else if (N1.getOpcode() == ISD::AND) { 8927 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 8928 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 8929 if (!N11C) 8930 return SDValue(); 8931 unsigned Mask2 = N11C->getZExtValue(); 8932 8933 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 8934 // as is to match. 8935 if (ARM::isBitFieldInvertedMask(Mask) && 8936 (Mask == ~Mask2)) { 8937 // The pack halfword instruction works better for masks that fit it, 8938 // so use that when it's available. 8939 if (Subtarget->hasT2ExtractPack() && 8940 (Mask == 0xffff || Mask == 0xffff0000)) 8941 return SDValue(); 8942 // 2a 8943 unsigned amt = countTrailingZeros(Mask2); 8944 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 8945 DAG.getConstant(amt, DL, MVT::i32)); 8946 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 8947 DAG.getConstant(Mask, DL, MVT::i32)); 8948 // Do not add new nodes to DAG combiner worklist. 8949 DCI.CombineTo(N, Res, false); 8950 return SDValue(); 8951 } else if (ARM::isBitFieldInvertedMask(~Mask) && 8952 (~Mask == Mask2)) { 8953 // The pack halfword instruction works better for masks that fit it, 8954 // so use that when it's available. 8955 if (Subtarget->hasT2ExtractPack() && 8956 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 8957 return SDValue(); 8958 // 2b 8959 unsigned lsb = countTrailingZeros(Mask); 8960 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 8961 DAG.getConstant(lsb, DL, MVT::i32)); 8962 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 8963 DAG.getConstant(Mask2, DL, MVT::i32)); 8964 // Do not add new nodes to DAG combiner worklist. 8965 DCI.CombineTo(N, Res, false); 8966 return SDValue(); 8967 } 8968 } 8969 8970 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 8971 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 8972 ARM::isBitFieldInvertedMask(~Mask)) { 8973 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 8974 // where lsb(mask) == #shamt and masked bits of B are known zero. 8975 SDValue ShAmt = N00.getOperand(1); 8976 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 8977 unsigned LSB = countTrailingZeros(Mask); 8978 if (ShAmtC != LSB) 8979 return SDValue(); 8980 8981 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 8982 DAG.getConstant(~Mask, DL, MVT::i32)); 8983 8984 // Do not add new nodes to DAG combiner worklist. 
8985 DCI.CombineTo(N, Res, false); 8986 } 8987 8988 return SDValue(); 8989 } 8990 8991 static SDValue PerformXORCombine(SDNode *N, 8992 TargetLowering::DAGCombinerInfo &DCI, 8993 const ARMSubtarget *Subtarget) { 8994 EVT VT = N->getValueType(0); 8995 SelectionDAG &DAG = DCI.DAG; 8996 8997 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 8998 return SDValue(); 8999 9000 if (!Subtarget->isThumb1Only()) { 9001 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9002 SDValue Result = combineSelectAndUseCommutative(N, false, DCI); 9003 if (Result.getNode()) 9004 return Result; 9005 } 9006 9007 return SDValue(); 9008 } 9009 9010 /// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 9011 /// the bits being cleared by the AND are not demanded by the BFI. 9012 static SDValue PerformBFICombine(SDNode *N, 9013 TargetLowering::DAGCombinerInfo &DCI) { 9014 SDValue N1 = N->getOperand(1); 9015 if (N1.getOpcode() == ISD::AND) { 9016 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 9017 if (!N11C) 9018 return SDValue(); 9019 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 9020 unsigned LSB = countTrailingZeros(~InvMask); 9021 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 9022 assert(Width < 9023 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 9024 "undefined behavior"); 9025 unsigned Mask = (1u << Width) - 1; 9026 unsigned Mask2 = N11C->getZExtValue(); 9027 if ((Mask & (~Mask2)) == 0) 9028 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 9029 N->getOperand(0), N1.getOperand(0), 9030 N->getOperand(2)); 9031 } 9032 return SDValue(); 9033 } 9034 9035 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 9036 /// ARMISD::VMOVRRD. 9037 static SDValue PerformVMOVRRDCombine(SDNode *N, 9038 TargetLowering::DAGCombinerInfo &DCI, 9039 const ARMSubtarget *Subtarget) { 9040 // vmovrrd(vmovdrr x, y) -> x,y 9041 SDValue InDouble = N->getOperand(0); 9042 if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) 9043 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 9044 9045 // vmovrrd(load f64) -> (load i32), (load i32) 9046 SDNode *InNode = InDouble.getNode(); 9047 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 9048 InNode->getValueType(0) == MVT::f64 && 9049 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 9050 !cast<LoadSDNode>(InNode)->isVolatile()) { 9051 // TODO: Should this be done for non-FrameIndex operands? 
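    // The single f64 load is split into two i32 loads from [base] and
    // [base + 4]; on big-endian targets the two results are swapped below so
    // they still correspond to the (lo, hi) results of VMOVRRD.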
9052 LoadSDNode *LD = cast<LoadSDNode>(InNode); 9053 9054 SelectionDAG &DAG = DCI.DAG; 9055 SDLoc DL(LD); 9056 SDValue BasePtr = LD->getBasePtr(); 9057 SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, 9058 LD->getPointerInfo(), LD->isVolatile(), 9059 LD->isNonTemporal(), LD->isInvariant(), 9060 LD->getAlignment()); 9061 9062 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 9063 DAG.getConstant(4, DL, MVT::i32)); 9064 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, 9065 LD->getPointerInfo(), LD->isVolatile(), 9066 LD->isNonTemporal(), LD->isInvariant(), 9067 std::min(4U, LD->getAlignment() / 2)); 9068 9069 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 9070 if (DCI.DAG.getDataLayout().isBigEndian()) 9071 std::swap (NewLD1, NewLD2); 9072 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 9073 return Result; 9074 } 9075 9076 return SDValue(); 9077 } 9078 9079 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 9080 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 9081 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 9082 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 9083 SDValue Op0 = N->getOperand(0); 9084 SDValue Op1 = N->getOperand(1); 9085 if (Op0.getOpcode() == ISD::BITCAST) 9086 Op0 = Op0.getOperand(0); 9087 if (Op1.getOpcode() == ISD::BITCAST) 9088 Op1 = Op1.getOperand(0); 9089 if (Op0.getOpcode() == ARMISD::VMOVRRD && 9090 Op0.getNode() == Op1.getNode() && 9091 Op0.getResNo() == 0 && Op1.getResNo() == 1) 9092 return DAG.getNode(ISD::BITCAST, SDLoc(N), 9093 N->getValueType(0), Op0.getOperand(0)); 9094 return SDValue(); 9095 } 9096 9097 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 9098 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 9099 /// i64 vector to have f64 elements, since the value can then be loaded 9100 /// directly into a VFP register. 9101 static bool hasNormalLoadOperand(SDNode *N) { 9102 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 9103 for (unsigned i = 0; i < NumElts; ++i) { 9104 SDNode *Elt = N->getOperand(i).getNode(); 9105 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 9106 return true; 9107 } 9108 return false; 9109 } 9110 9111 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 9112 /// ISD::BUILD_VECTOR. 9113 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 9114 TargetLowering::DAGCombinerInfo &DCI, 9115 const ARMSubtarget *Subtarget) { 9116 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 9117 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 9118 // into a pair of GPRs, which is fine when the value is used as a scalar, 9119 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 9120 SelectionDAG &DAG = DCI.DAG; 9121 if (N->getNumOperands() == 2) { 9122 SDValue RV = PerformVMOVDRRCombine(N, DAG); 9123 if (RV.getNode()) 9124 return RV; 9125 } 9126 9127 // Load i64 elements as f64 values so that type legalization does not split 9128 // them up into i32 values. 
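  // For example, a v2i64 build_vector of two loaded i64 values becomes a
  // v2f64 build_vector of the same loads bitcast to f64, followed by a
  // single bitcast of the whole vector back to v2i64.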
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., something that
  // does not force the use of floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64 bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // The model is: if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
9211 // => BITCAST INSERT_VECTOR_ELT 9212 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 9213 // (BITCAST EN), N. 9214 SDValue Vec = DAG.getUNDEF(VecVT); 9215 SDLoc dl(N); 9216 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 9217 SDValue V = N->getOperand(Idx); 9218 if (V.getOpcode() == ISD::UNDEF) 9219 continue; 9220 if (V.getOpcode() == ISD::BITCAST && 9221 V->getOperand(0).getValueType() == MVT::i32) 9222 // Fold obvious case. 9223 V = V.getOperand(0); 9224 else { 9225 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 9226 // Make the DAGCombiner fold the bitcasts. 9227 DCI.AddToWorklist(V.getNode()); 9228 } 9229 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 9230 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 9231 } 9232 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 9233 // Make the DAGCombiner fold the bitcasts. 9234 DCI.AddToWorklist(Vec.getNode()); 9235 return Vec; 9236 } 9237 9238 /// PerformInsertEltCombine - Target-specific dag combine xforms for 9239 /// ISD::INSERT_VECTOR_ELT. 9240 static SDValue PerformInsertEltCombine(SDNode *N, 9241 TargetLowering::DAGCombinerInfo &DCI) { 9242 // Bitcast an i64 load inserted into a vector to f64. 9243 // Otherwise, the i64 value will be legalized to a pair of i32 values. 9244 EVT VT = N->getValueType(0); 9245 SDNode *Elt = N->getOperand(1).getNode(); 9246 if (VT.getVectorElementType() != MVT::i64 || 9247 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 9248 return SDValue(); 9249 9250 SelectionDAG &DAG = DCI.DAG; 9251 SDLoc dl(N); 9252 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 9253 VT.getVectorNumElements()); 9254 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 9255 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 9256 // Make the DAGCombiner fold the bitcasts. 9257 DCI.AddToWorklist(Vec.getNode()); 9258 DCI.AddToWorklist(V.getNode()); 9259 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 9260 Vec, V, N->getOperand(2)); 9261 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 9262 } 9263 9264 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 9265 /// ISD::VECTOR_SHUFFLE. 9266 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 9267 // The LLVM shufflevector instruction does not require the shuffle mask 9268 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 9269 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 9270 // operands do not match the mask length, they are extended by concatenating 9271 // them with undef vectors. That is probably the right thing for other 9272 // targets, but for NEON it is better to concatenate two double-register 9273 // size vector operands into a single quad-register size vector. Do that 9274 // transformation here: 9275 // shuffle(concat(v1, undef), concat(v2, undef)) -> 9276 // shuffle(concat(v1, v2), undef) 9277 SDValue Op0 = N->getOperand(0); 9278 SDValue Op1 = N->getOperand(1); 9279 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 9280 Op1.getOpcode() != ISD::CONCAT_VECTORS || 9281 Op0.getNumOperands() != 2 || 9282 Op1.getNumOperands() != 2) 9283 return SDValue(); 9284 SDValue Concat0Op1 = Op0.getOperand(1); 9285 SDValue Concat1Op1 = Op1.getOperand(1); 9286 if (Concat0Op1.getOpcode() != ISD::UNDEF || 9287 Concat1Op1.getOpcode() != ISD::UNDEF) 9288 return SDValue(); 9289 // Skip the transformation if any of the types are illegal. 
9290 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9291 EVT VT = N->getValueType(0); 9292 if (!TLI.isTypeLegal(VT) || 9293 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 9294 !TLI.isTypeLegal(Concat1Op1.getValueType())) 9295 return SDValue(); 9296 9297 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 9298 Op0.getOperand(0), Op1.getOperand(0)); 9299 // Translate the shuffle mask. 9300 SmallVector<int, 16> NewMask; 9301 unsigned NumElts = VT.getVectorNumElements(); 9302 unsigned HalfElts = NumElts/2; 9303 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9304 for (unsigned n = 0; n < NumElts; ++n) { 9305 int MaskElt = SVN->getMaskElt(n); 9306 int NewElt = -1; 9307 if (MaskElt < (int)HalfElts) 9308 NewElt = MaskElt; 9309 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 9310 NewElt = HalfElts + MaskElt - NumElts; 9311 NewMask.push_back(NewElt); 9312 } 9313 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 9314 DAG.getUNDEF(VT), NewMask.data()); 9315 } 9316 9317 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 9318 /// NEON load/store intrinsics, and generic vector load/stores, to merge 9319 /// base address updates. 9320 /// For generic load/stores, the memory type is assumed to be a vector. 9321 /// The caller is assumed to have checked legality. 9322 static SDValue CombineBaseUpdate(SDNode *N, 9323 TargetLowering::DAGCombinerInfo &DCI) { 9324 SelectionDAG &DAG = DCI.DAG; 9325 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 9326 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 9327 const bool isStore = N->getOpcode() == ISD::STORE; 9328 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 9329 SDValue Addr = N->getOperand(AddrOpIdx); 9330 MemSDNode *MemN = cast<MemSDNode>(N); 9331 SDLoc dl(N); 9332 9333 // Search for a use of the address operand that is an increment. 9334 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 9335 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 9336 SDNode *User = *UI; 9337 if (User->getOpcode() != ISD::ADD || 9338 UI.getUse().getResNo() != Addr.getResNo()) 9339 continue; 9340 9341 // Check that the add is independent of the load/store. Otherwise, folding 9342 // it would create a cycle. 9343 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 9344 continue; 9345 9346 // Find the new opcode for the updating load/store. 
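    // Each operation maps to its post-indexed "_UPD" form, e.g.
    // arm_neon_vld2 -> ARMISD::VLD2_UPD and a generic vector STORE ->
    // ARMISD::VST1_UPD. NumVecs records how many vectors are transferred so
    // the constant increment can be checked against the memory size below.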
9347 bool isLoadOp = true; 9348 bool isLaneOp = false; 9349 unsigned NewOpc = 0; 9350 unsigned NumVecs = 0; 9351 if (isIntrinsic) { 9352 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 9353 switch (IntNo) { 9354 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 9355 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 9356 NumVecs = 1; break; 9357 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 9358 NumVecs = 2; break; 9359 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 9360 NumVecs = 3; break; 9361 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 9362 NumVecs = 4; break; 9363 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 9364 NumVecs = 2; isLaneOp = true; break; 9365 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 9366 NumVecs = 3; isLaneOp = true; break; 9367 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 9368 NumVecs = 4; isLaneOp = true; break; 9369 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 9370 NumVecs = 1; isLoadOp = false; break; 9371 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 9372 NumVecs = 2; isLoadOp = false; break; 9373 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 9374 NumVecs = 3; isLoadOp = false; break; 9375 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 9376 NumVecs = 4; isLoadOp = false; break; 9377 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 9378 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 9379 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 9380 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 9381 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 9382 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 9383 } 9384 } else { 9385 isLaneOp = true; 9386 switch (N->getOpcode()) { 9387 default: llvm_unreachable("unexpected opcode for Neon base update"); 9388 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 9389 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 9390 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 9391 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 9392 NumVecs = 1; isLaneOp = false; break; 9393 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 9394 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 9395 } 9396 } 9397 9398 // Find the size of memory referenced by the load/store. 9399 EVT VecTy; 9400 if (isLoadOp) { 9401 VecTy = N->getValueType(0); 9402 } else if (isIntrinsic) { 9403 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 9404 } else { 9405 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 9406 VecTy = N->getOperand(1).getValueType(); 9407 } 9408 9409 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 9410 if (isLaneOp) 9411 NumBytes /= VecTy.getVectorNumElements(); 9412 9413 // If the increment is a constant, it must match the memory ref size. 9414 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 9415 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 9416 uint64_t IncVal = CInc->getZExtValue(); 9417 if (IncVal != NumBytes) 9418 continue; 9419 } else if (NumBytes >= 3 * 16) { 9420 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 9421 // separate instructions that make it harder to use a non-constant update. 9422 continue; 9423 } 9424 9425 // OK, we found an ADD we can fold into the base update. 
9426 // Now, create a _UPD node, taking care of not breaking alignment. 9427 9428 EVT AlignedVecTy = VecTy; 9429 unsigned Alignment = MemN->getAlignment(); 9430 9431 // If this is a less-than-standard-aligned load/store, change the type to 9432 // match the standard alignment. 9433 // The alignment is overlooked when selecting _UPD variants; and it's 9434 // easier to introduce bitcasts here than fix that. 9435 // There are 3 ways to get to this base-update combine: 9436 // - intrinsics: they are assumed to be properly aligned (to the standard 9437 // alignment of the memory type), so we don't need to do anything. 9438 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 9439 // intrinsics, so, likewise, there's nothing to do. 9440 // - generic load/store instructions: the alignment is specified as an 9441 // explicit operand, rather than implicitly as the standard alignment 9442 // of the memory type (like the intrisics). We need to change the 9443 // memory type to match the explicit alignment. That way, we don't 9444 // generate non-standard-aligned ARMISD::VLDx nodes. 9445 if (isa<LSBaseSDNode>(N)) { 9446 if (Alignment == 0) 9447 Alignment = 1; 9448 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 9449 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 9450 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 9451 assert(!isLaneOp && "Unexpected generic load/store lane."); 9452 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 9453 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 9454 } 9455 // Don't set an explicit alignment on regular load/stores that we want 9456 // to transform to VLD/VST 1_UPD nodes. 9457 // This matches the behavior of regular load/stores, which only get an 9458 // explicit alignment if the MMO alignment is larger than the standard 9459 // alignment of the memory type. 9460 // Intrinsics, however, always get an explicit alignment, set to the 9461 // alignment of the MMO. 9462 Alignment = 1; 9463 } 9464 9465 // Create the new updating load/store node. 9466 // First, create an SDVTList for the new updating node's results. 9467 EVT Tys[6]; 9468 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 9469 unsigned n; 9470 for (n = 0; n < NumResultVecs; ++n) 9471 Tys[n] = AlignedVecTy; 9472 Tys[n++] = MVT::i32; 9473 Tys[n] = MVT::Other; 9474 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 9475 9476 // Then, gather the new node's operands. 9477 SmallVector<SDValue, 8> Ops; 9478 Ops.push_back(N->getOperand(0)); // incoming chain 9479 Ops.push_back(N->getOperand(AddrOpIdx)); 9480 Ops.push_back(Inc); 9481 9482 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 9483 // Try to match the intrinsic's signature 9484 Ops.push_back(StN->getValue()); 9485 } else { 9486 // Loads (and of course intrinsics) match the intrinsics' signature, 9487 // so just add all but the alignment operand. 9488 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 9489 Ops.push_back(N->getOperand(i)); 9490 } 9491 9492 // For all node types, the alignment operand is always the last one. 9493 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 9494 9495 // If this is a non-standard-aligned STORE, the penultimate operand is the 9496 // stored value. Bitcast it to the aligned type. 
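    // For example, a v4i32 store that is only 2-byte aligned is rebuilt as a
    // v8i16 VST1_UPD, so the v4i32 value being stored is bitcast to v8i16
    // here.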
9497 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 9498 SDValue &StVal = Ops[Ops.size()-2]; 9499 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 9500 } 9501 9502 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, 9503 Ops, AlignedVecTy, 9504 MemN->getMemOperand()); 9505 9506 // Update the uses. 9507 SmallVector<SDValue, 5> NewResults; 9508 for (unsigned i = 0; i < NumResultVecs; ++i) 9509 NewResults.push_back(SDValue(UpdN.getNode(), i)); 9510 9511 // If this is an non-standard-aligned LOAD, the first result is the loaded 9512 // value. Bitcast it to the expected result type. 9513 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 9514 SDValue &LdVal = NewResults[0]; 9515 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 9516 } 9517 9518 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 9519 DCI.CombineTo(N, NewResults); 9520 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 9521 9522 break; 9523 } 9524 return SDValue(); 9525 } 9526 9527 static SDValue PerformVLDCombine(SDNode *N, 9528 TargetLowering::DAGCombinerInfo &DCI) { 9529 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9530 return SDValue(); 9531 9532 return CombineBaseUpdate(N, DCI); 9533 } 9534 9535 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 9536 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 9537 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 9538 /// return true. 9539 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 9540 SelectionDAG &DAG = DCI.DAG; 9541 EVT VT = N->getValueType(0); 9542 // vldN-dup instructions only support 64-bit vectors for N > 1. 9543 if (!VT.is64BitVector()) 9544 return false; 9545 9546 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 9547 SDNode *VLD = N->getOperand(0).getNode(); 9548 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 9549 return false; 9550 unsigned NumVecs = 0; 9551 unsigned NewOpc = 0; 9552 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 9553 if (IntNo == Intrinsic::arm_neon_vld2lane) { 9554 NumVecs = 2; 9555 NewOpc = ARMISD::VLD2DUP; 9556 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 9557 NumVecs = 3; 9558 NewOpc = ARMISD::VLD3DUP; 9559 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 9560 NumVecs = 4; 9561 NewOpc = ARMISD::VLD4DUP; 9562 } else { 9563 return false; 9564 } 9565 9566 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 9567 // numbers match the load. 9568 unsigned VLDLaneNo = 9569 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 9570 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 9571 UI != UE; ++UI) { 9572 // Ignore uses of the chain result. 9573 if (UI.getUse().getResNo() == NumVecs) 9574 continue; 9575 SDNode *User = *UI; 9576 if (User->getOpcode() != ARMISD::VDUPLANE || 9577 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 9578 return false; 9579 } 9580 9581 // Create the vldN-dup node. 
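  // For example, a vld3lane whose three vector results all feed VDUPLANEs
  // using the load's lane number becomes a single VLD3DUP producing three
  // 64-bit vectors plus a chain.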
9582 EVT Tys[5]; 9583 unsigned n; 9584 for (n = 0; n < NumVecs; ++n) 9585 Tys[n] = VT; 9586 Tys[n] = MVT::Other; 9587 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 9588 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 9589 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 9590 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 9591 Ops, VLDMemInt->getMemoryVT(), 9592 VLDMemInt->getMemOperand()); 9593 9594 // Update the uses. 9595 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 9596 UI != UE; ++UI) { 9597 unsigned ResNo = UI.getUse().getResNo(); 9598 // Ignore uses of the chain result. 9599 if (ResNo == NumVecs) 9600 continue; 9601 SDNode *User = *UI; 9602 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 9603 } 9604 9605 // Now the vldN-lane intrinsic is dead except for its chain result. 9606 // Update uses of the chain. 9607 std::vector<SDValue> VLDDupResults; 9608 for (unsigned n = 0; n < NumVecs; ++n) 9609 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 9610 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 9611 DCI.CombineTo(VLD, VLDDupResults); 9612 9613 return true; 9614 } 9615 9616 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 9617 /// ARMISD::VDUPLANE. 9618 static SDValue PerformVDUPLANECombine(SDNode *N, 9619 TargetLowering::DAGCombinerInfo &DCI) { 9620 SDValue Op = N->getOperand(0); 9621 9622 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 9623 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 9624 if (CombineVLDDUP(N, DCI)) 9625 return SDValue(N, 0); 9626 9627 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 9628 // redundant. Ignore bit_converts for now; element sizes are checked below. 9629 while (Op.getOpcode() == ISD::BITCAST) 9630 Op = Op.getOperand(0); 9631 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 9632 return SDValue(); 9633 9634 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 9635 unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); 9636 // The canonical VMOV for a zero vector uses a 32-bit element size. 9637 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9638 unsigned EltBits; 9639 if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) 9640 EltSize = 8; 9641 EVT VT = N->getValueType(0); 9642 if (EltSize > VT.getVectorElementType().getSizeInBits()) 9643 return SDValue(); 9644 9645 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 9646 } 9647 9648 static SDValue PerformLOADCombine(SDNode *N, 9649 TargetLowering::DAGCombinerInfo &DCI) { 9650 EVT VT = N->getValueType(0); 9651 9652 // If this is a legal vector load, try to combine it into a VLD1_UPD. 9653 if (ISD::isNormalLoad(N) && VT.isVector() && 9654 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9655 return CombineBaseUpdate(N, DCI); 9656 9657 return SDValue(); 9658 } 9659 9660 /// PerformSTORECombine - Target-specific dag combine xforms for 9661 /// ISD::STORE. 9662 static SDValue PerformSTORECombine(SDNode *N, 9663 TargetLowering::DAGCombinerInfo &DCI) { 9664 StoreSDNode *St = cast<StoreSDNode>(N); 9665 if (St->isVolatile()) 9666 return SDValue(); 9667 9668 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 9669 // pack all of the elements in one place. Next, store to memory in fewer 9670 // chunks. 
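  // For example, a truncating store of a v4i32 value as v4i8 is rewritten as:
  // bitcast the value to v16i8, shuffle the least-significant byte of each
  // lane into the low four byte positions, then store those 32 bits with a
  // single i32 store.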
9671 SDValue StVal = St->getValue(); 9672 EVT VT = StVal.getValueType(); 9673 if (St->isTruncatingStore() && VT.isVector()) { 9674 SelectionDAG &DAG = DCI.DAG; 9675 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9676 EVT StVT = St->getMemoryVT(); 9677 unsigned NumElems = VT.getVectorNumElements(); 9678 assert(StVT != VT && "Cannot truncate to the same type"); 9679 unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); 9680 unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); 9681 9682 // From, To sizes and ElemCount must be pow of two 9683 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); 9684 9685 // We are going to use the original vector elt for storing. 9686 // Accumulated smaller vector elements must be a multiple of the store size. 9687 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); 9688 9689 unsigned SizeRatio = FromEltSz / ToEltSz; 9690 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 9691 9692 // Create a type on which we perform the shuffle. 9693 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 9694 NumElems*SizeRatio); 9695 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 9696 9697 SDLoc DL(St); 9698 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 9699 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 9700 for (unsigned i = 0; i < NumElems; ++i) 9701 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() 9702 ? (i + 1) * SizeRatio - 1 9703 : i * SizeRatio; 9704 9705 // Can't shuffle using an illegal type. 9706 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 9707 9708 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, 9709 DAG.getUNDEF(WideVec.getValueType()), 9710 ShuffleVec.data()); 9711 // At this point all of the data is stored at the bottom of the 9712 // register. We now need to save it to mem. 9713 9714 // Find the largest store unit 9715 MVT StoreType = MVT::i8; 9716 for (MVT Tp : MVT::integer_valuetypes()) { 9717 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 9718 StoreType = Tp; 9719 } 9720 // Didn't find a legal store type. 9721 if (!TLI.isTypeLegal(StoreType)) 9722 return SDValue(); 9723 9724 // Bitcast the original vector into a vector of store-size units 9725 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 9726 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 9727 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 9728 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 9729 SmallVector<SDValue, 8> Chains; 9730 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 9731 TLI.getPointerTy(DAG.getDataLayout())); 9732 SDValue BasePtr = St->getBasePtr(); 9733 9734 // Perform one or more big stores into memory. 
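    // The number of stores is (packed bits) / (store-unit bits); the base
    // pointer is advanced by the store size after each chunk.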
9735 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); 9736 for (unsigned I = 0; I < E; I++) { 9737 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 9738 StoreType, ShuffWide, 9739 DAG.getIntPtrConstant(I, DL)); 9740 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, 9741 St->getPointerInfo(), St->isVolatile(), 9742 St->isNonTemporal(), St->getAlignment()); 9743 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, 9744 Increment); 9745 Chains.push_back(Ch); 9746 } 9747 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 9748 } 9749 9750 if (!ISD::isNormalStore(St)) 9751 return SDValue(); 9752 9753 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 9754 // ARM stores of arguments in the same cache line. 9755 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 9756 StVal.getNode()->hasOneUse()) { 9757 SelectionDAG &DAG = DCI.DAG; 9758 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9759 SDLoc DL(St); 9760 SDValue BasePtr = St->getBasePtr(); 9761 SDValue NewST1 = DAG.getStore(St->getChain(), DL, 9762 StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), 9763 BasePtr, St->getPointerInfo(), St->isVolatile(), 9764 St->isNonTemporal(), St->getAlignment()); 9765 9766 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 9767 DAG.getConstant(4, DL, MVT::i32)); 9768 return DAG.getStore(NewST1.getValue(0), DL, 9769 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 9770 OffsetPtr, St->getPointerInfo(), St->isVolatile(), 9771 St->isNonTemporal(), 9772 std::min(4U, St->getAlignment() / 2)); 9773 } 9774 9775 if (StVal.getValueType() == MVT::i64 && 9776 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 9777 9778 // Bitcast an i64 store extracted from a vector to f64. 9779 // Otherwise, the i64 value will be legalized to a pair of i32 values. 9780 SelectionDAG &DAG = DCI.DAG; 9781 SDLoc dl(StVal); 9782 SDValue IntVec = StVal.getOperand(0); 9783 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 9784 IntVec.getValueType().getVectorNumElements()); 9785 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 9786 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 9787 Vec, StVal.getOperand(1)); 9788 dl = SDLoc(N); 9789 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 9790 // Make the DAGCombiner fold the bitcasts. 9791 DCI.AddToWorklist(Vec.getNode()); 9792 DCI.AddToWorklist(ExtElt.getNode()); 9793 DCI.AddToWorklist(V.getNode()); 9794 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 9795 St->getPointerInfo(), St->isVolatile(), 9796 St->isNonTemporal(), St->getAlignment(), 9797 St->getAAInfo()); 9798 } 9799 9800 // If this is a legal vector store, try to combine it into a VST1_UPD. 9801 if (ISD::isNormalStore(N) && VT.isVector() && 9802 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9803 return CombineBaseUpdate(N, DCI); 9804 9805 return SDValue(); 9806 } 9807 9808 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 9809 /// can replace combinations of VMUL and VCVT (floating-point to integer) 9810 /// when the VMUL has a constant operand that is a power of 2. 
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                      HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
  if (! getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
  if (! getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (!isIntrinsic)
    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
  if (Cnt >= -(isNarrow ?
ElementBits/2 : ElementBits) && Cnt <= -1) { 9969 Cnt = -Cnt; 9970 return true; 9971 } 9972 return false; 9973 } 9974 9975 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 9976 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 9977 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9978 switch (IntNo) { 9979 default: 9980 // Don't do anything for most intrinsics. 9981 break; 9982 9983 case Intrinsic::arm_neon_vabds: 9984 if (!N->getValueType(0).isInteger()) 9985 return SDValue(); 9986 return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0), 9987 N->getOperand(1), N->getOperand(2)); 9988 case Intrinsic::arm_neon_vabdu: 9989 return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0), 9990 N->getOperand(1), N->getOperand(2)); 9991 9992 // Vector shifts: check for immediate versions and lower them. 9993 // Note: This is done during DAG combining instead of DAG legalizing because 9994 // the build_vectors for 64-bit vector element shift counts are generally 9995 // not legal, and it is hard to see their values after they get legalized to 9996 // loads from a constant pool. 9997 case Intrinsic::arm_neon_vshifts: 9998 case Intrinsic::arm_neon_vshiftu: 9999 case Intrinsic::arm_neon_vrshifts: 10000 case Intrinsic::arm_neon_vrshiftu: 10001 case Intrinsic::arm_neon_vrshiftn: 10002 case Intrinsic::arm_neon_vqshifts: 10003 case Intrinsic::arm_neon_vqshiftu: 10004 case Intrinsic::arm_neon_vqshiftsu: 10005 case Intrinsic::arm_neon_vqshiftns: 10006 case Intrinsic::arm_neon_vqshiftnu: 10007 case Intrinsic::arm_neon_vqshiftnsu: 10008 case Intrinsic::arm_neon_vqrshiftns: 10009 case Intrinsic::arm_neon_vqrshiftnu: 10010 case Intrinsic::arm_neon_vqrshiftnsu: { 10011 EVT VT = N->getOperand(1).getValueType(); 10012 int64_t Cnt; 10013 unsigned VShiftOpc = 0; 10014 10015 switch (IntNo) { 10016 case Intrinsic::arm_neon_vshifts: 10017 case Intrinsic::arm_neon_vshiftu: 10018 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 10019 VShiftOpc = ARMISD::VSHL; 10020 break; 10021 } 10022 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 10023 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 10024 ARMISD::VSHRs : ARMISD::VSHRu); 10025 break; 10026 } 10027 return SDValue(); 10028 10029 case Intrinsic::arm_neon_vrshifts: 10030 case Intrinsic::arm_neon_vrshiftu: 10031 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 10032 break; 10033 return SDValue(); 10034 10035 case Intrinsic::arm_neon_vqshifts: 10036 case Intrinsic::arm_neon_vqshiftu: 10037 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 10038 break; 10039 return SDValue(); 10040 10041 case Intrinsic::arm_neon_vqshiftsu: 10042 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 10043 break; 10044 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 10045 10046 case Intrinsic::arm_neon_vrshiftn: 10047 case Intrinsic::arm_neon_vqshiftns: 10048 case Intrinsic::arm_neon_vqshiftnu: 10049 case Intrinsic::arm_neon_vqshiftnsu: 10050 case Intrinsic::arm_neon_vqrshiftns: 10051 case Intrinsic::arm_neon_vqrshiftnu: 10052 case Intrinsic::arm_neon_vqrshiftnsu: 10053 // Narrowing shifts require an immediate right shift. 
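      // e.g. for a vqshrn from 32-bit elements the immediate must be in
      // [1, 16] (half the element width); isVShiftRImm enforces this via its
      // isNarrow flag.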
10054 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 10055 break; 10056 llvm_unreachable("invalid shift count for narrowing vector shift " 10057 "intrinsic"); 10058 10059 default: 10060 llvm_unreachable("unhandled vector shift"); 10061 } 10062 10063 switch (IntNo) { 10064 case Intrinsic::arm_neon_vshifts: 10065 case Intrinsic::arm_neon_vshiftu: 10066 // Opcode already set above. 10067 break; 10068 case Intrinsic::arm_neon_vrshifts: 10069 VShiftOpc = ARMISD::VRSHRs; break; 10070 case Intrinsic::arm_neon_vrshiftu: 10071 VShiftOpc = ARMISD::VRSHRu; break; 10072 case Intrinsic::arm_neon_vrshiftn: 10073 VShiftOpc = ARMISD::VRSHRN; break; 10074 case Intrinsic::arm_neon_vqshifts: 10075 VShiftOpc = ARMISD::VQSHLs; break; 10076 case Intrinsic::arm_neon_vqshiftu: 10077 VShiftOpc = ARMISD::VQSHLu; break; 10078 case Intrinsic::arm_neon_vqshiftsu: 10079 VShiftOpc = ARMISD::VQSHLsu; break; 10080 case Intrinsic::arm_neon_vqshiftns: 10081 VShiftOpc = ARMISD::VQSHRNs; break; 10082 case Intrinsic::arm_neon_vqshiftnu: 10083 VShiftOpc = ARMISD::VQSHRNu; break; 10084 case Intrinsic::arm_neon_vqshiftnsu: 10085 VShiftOpc = ARMISD::VQSHRNsu; break; 10086 case Intrinsic::arm_neon_vqrshiftns: 10087 VShiftOpc = ARMISD::VQRSHRNs; break; 10088 case Intrinsic::arm_neon_vqrshiftnu: 10089 VShiftOpc = ARMISD::VQRSHRNu; break; 10090 case Intrinsic::arm_neon_vqrshiftnsu: 10091 VShiftOpc = ARMISD::VQRSHRNsu; break; 10092 } 10093 10094 SDLoc dl(N); 10095 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 10096 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 10097 } 10098 10099 case Intrinsic::arm_neon_vshiftins: { 10100 EVT VT = N->getOperand(1).getValueType(); 10101 int64_t Cnt; 10102 unsigned VShiftOpc = 0; 10103 10104 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 10105 VShiftOpc = ARMISD::VSLI; 10106 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 10107 VShiftOpc = ARMISD::VSRI; 10108 else { 10109 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 10110 } 10111 10112 SDLoc dl(N); 10113 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 10114 N->getOperand(1), N->getOperand(2), 10115 DAG.getConstant(Cnt, dl, MVT::i32)); 10116 } 10117 10118 case Intrinsic::arm_neon_vqrshifts: 10119 case Intrinsic::arm_neon_vqrshiftu: 10120 // No immediate versions of these to check for. 10121 break; 10122 } 10123 10124 return SDValue(); 10125 } 10126 10127 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 10128 /// lowers them. As with the vector shift intrinsics, this is done during DAG 10129 /// combining instead of DAG legalizing because the build_vectors for 64-bit 10130 /// vector element shift counts are generally not legal, and it is hard to see 10131 /// their values after they get legalized to loads from a constant pool. 10132 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, 10133 const ARMSubtarget *ST) { 10134 EVT VT = N->getValueType(0); 10135 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 10136 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 10137 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 
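    // REV16 reverses the bytes within each halfword, which is exactly
    // (rotr (bswap x), 16); when the upper 16 bits of x are zero, the rotate
    // produces the same value as the shift, so the rotr form can be selected
    // as a single rev16.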
10138 SDValue N1 = N->getOperand(1); 10139 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 10140 SDValue N0 = N->getOperand(0); 10141 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 10142 DAG.MaskedValueIsZero(N0.getOperand(0), 10143 APInt::getHighBitsSet(32, 16))) 10144 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 10145 } 10146 } 10147 10148 // Nothing to be done for scalar shifts. 10149 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10150 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 10151 return SDValue(); 10152 10153 assert(ST->hasNEON() && "unexpected vector shift"); 10154 int64_t Cnt; 10155 10156 switch (N->getOpcode()) { 10157 default: llvm_unreachable("unexpected shift opcode"); 10158 10159 case ISD::SHL: 10160 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 10161 SDLoc dl(N); 10162 return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), 10163 DAG.getConstant(Cnt, dl, MVT::i32)); 10164 } 10165 break; 10166 10167 case ISD::SRA: 10168 case ISD::SRL: 10169 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 10170 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? 10171 ARMISD::VSHRs : ARMISD::VSHRu); 10172 SDLoc dl(N); 10173 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 10174 DAG.getConstant(Cnt, dl, MVT::i32)); 10175 } 10176 } 10177 return SDValue(); 10178 } 10179 10180 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 10181 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 10182 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 10183 const ARMSubtarget *ST) { 10184 SDValue N0 = N->getOperand(0); 10185 10186 // Check for sign- and zero-extensions of vector extract operations of 8- 10187 // and 16-bit vector elements. NEON supports these directly. They are 10188 // handled during DAG combining because type legalization will promote them 10189 // to 32-bit types and it is messy to recognize the operations after that. 10190 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 10191 SDValue Vec = N0.getOperand(0); 10192 SDValue Lane = N0.getOperand(1); 10193 EVT VT = N->getValueType(0); 10194 EVT EltVT = N0.getValueType(); 10195 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10196 10197 if (VT == MVT::i32 && 10198 (EltVT == MVT::i8 || EltVT == MVT::i16) && 10199 TLI.isTypeLegal(Vec.getValueType()) && 10200 isa<ConstantSDNode>(Lane)) { 10201 10202 unsigned Opc = 0; 10203 switch (N->getOpcode()) { 10204 default: llvm_unreachable("unexpected opcode"); 10205 case ISD::SIGN_EXTEND: 10206 Opc = ARMISD::VGETLANEs; 10207 break; 10208 case ISD::ZERO_EXTEND: 10209 case ISD::ANY_EXTEND: 10210 Opc = ARMISD::VGETLANEu; 10211 break; 10212 } 10213 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 10214 } 10215 } 10216 10217 return SDValue(); 10218 } 10219 10220 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 10221 SDValue 10222 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 10223 SDValue Cmp = N->getOperand(4); 10224 if (Cmp.getOpcode() != ARMISD::CMPZ) 10225 // Only looking at EQ and NE cases. 
10226 return SDValue(); 10227 10228 EVT VT = N->getValueType(0); 10229 SDLoc dl(N); 10230 SDValue LHS = Cmp.getOperand(0); 10231 SDValue RHS = Cmp.getOperand(1); 10232 SDValue FalseVal = N->getOperand(0); 10233 SDValue TrueVal = N->getOperand(1); 10234 SDValue ARMcc = N->getOperand(2); 10235 ARMCC::CondCodes CC = 10236 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 10237 10238 // Simplify 10239 // mov r1, r0 10240 // cmp r1, x 10241 // mov r0, y 10242 // moveq r0, x 10243 // to 10244 // cmp r0, x 10245 // movne r0, y 10246 // 10247 // mov r1, r0 10248 // cmp r1, x 10249 // mov r0, x 10250 // movne r0, y 10251 // to 10252 // cmp r0, x 10253 // movne r0, y 10254 /// FIXME: Turn this into a target neutral optimization? 10255 SDValue Res; 10256 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 10257 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 10258 N->getOperand(3), Cmp); 10259 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 10260 SDValue ARMcc; 10261 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 10262 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 10263 N->getOperand(3), NewCmp); 10264 } 10265 10266 if (Res.getNode()) { 10267 APInt KnownZero, KnownOne; 10268 DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); 10269 // Capture demanded bits information that would be otherwise lost. 10270 if (KnownZero == 0xfffffffe) 10271 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 10272 DAG.getValueType(MVT::i1)); 10273 else if (KnownZero == 0xffffff00) 10274 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 10275 DAG.getValueType(MVT::i8)); 10276 else if (KnownZero == 0xffff0000) 10277 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 10278 DAG.getValueType(MVT::i16)); 10279 } 10280 10281 return Res; 10282 } 10283 10284 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 10285 DAGCombinerInfo &DCI) const { 10286 switch (N->getOpcode()) { 10287 default: break; 10288 case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); 10289 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 10290 case ISD::SUB: return PerformSUBCombine(N, DCI); 10291 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 10292 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 10293 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 10294 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 10295 case ARMISD::BFI: return PerformBFICombine(N, DCI); 10296 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 10297 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 10298 case ISD::STORE: return PerformSTORECombine(N, DCI); 10299 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 10300 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 10301 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 10302 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 10303 case ISD::FP_TO_SINT: 10304 case ISD::FP_TO_UINT: 10305 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 10306 case ISD::FDIV: 10307 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 10308 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 10309 case ISD::SHL: 10310 case ISD::SRA: 10311 case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); 10312 case ISD::SIGN_EXTEND: 10313 case ISD::ZERO_EXTEND: 10314 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 10315 case ARMISD::CMOV: 
return PerformCMOVCombine(N, DCI.DAG);
10316 case ISD::LOAD: return PerformLOADCombine(N, DCI);
10317 case ARMISD::VLD2DUP:
10318 case ARMISD::VLD3DUP:
10319 case ARMISD::VLD4DUP:
10320 return PerformVLDCombine(N, DCI);
10321 case ARMISD::BUILD_VECTOR:
10322 return PerformARMBUILD_VECTORCombine(N, DCI);
10323 case ISD::INTRINSIC_VOID:
10324 case ISD::INTRINSIC_W_CHAIN:
10325 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
10326 case Intrinsic::arm_neon_vld1:
10327 case Intrinsic::arm_neon_vld2:
10328 case Intrinsic::arm_neon_vld3:
10329 case Intrinsic::arm_neon_vld4:
10330 case Intrinsic::arm_neon_vld2lane:
10331 case Intrinsic::arm_neon_vld3lane:
10332 case Intrinsic::arm_neon_vld4lane:
10333 case Intrinsic::arm_neon_vst1:
10334 case Intrinsic::arm_neon_vst2:
10335 case Intrinsic::arm_neon_vst3:
10336 case Intrinsic::arm_neon_vst4:
10337 case Intrinsic::arm_neon_vst2lane:
10338 case Intrinsic::arm_neon_vst3lane:
10339 case Intrinsic::arm_neon_vst4lane:
10340 return PerformVLDCombine(N, DCI);
10341 default: break;
10342 }
10343 break;
10344 }
10345 return SDValue();
10346 }
10347
10348 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
10349 EVT VT) const {
10350 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
10351 }
10352
10353 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
10354 unsigned,
10355 unsigned,
10356 bool *Fast) const {
10357 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
10358 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
10359
10360 switch (VT.getSimpleVT().SimpleTy) {
10361 default:
10362 return false;
10363 case MVT::i8:
10364 case MVT::i16:
10365 case MVT::i32: {
10366 // Unaligned accesses can use (for example) LDRB, LDRH, LDR.
10367 if (AllowsUnaligned) {
10368 if (Fast)
10369 *Fast = Subtarget->hasV7Ops();
10370 return true;
10371 }
10372 return false;
10373 }
10374 case MVT::f64:
10375 case MVT::v2f64: {
10376 // For any little-endian targets with NEON, we can support unaligned ld/st
10377 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
10378 // A big-endian target may also explicitly support unaligned accesses.
10379 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
10380 if (Fast)
10381 *Fast = true;
10382 return true;
10383 }
10384 return false;
10385 }
10386 }
10387 }
10388
10389 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
10390 unsigned AlignCheck) {
10391 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
10392 (DstAlign == 0 || DstAlign % AlignCheck == 0));
10393 }
10394
10395 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
10396 unsigned DstAlign, unsigned SrcAlign,
10397 bool IsMemset, bool ZeroMemset,
10398 bool MemcpyStrSrc,
10399 MachineFunction &MF) const {
10400 const Function *F = MF.getFunction();
10401
10402 // See if we can use NEON instructions for this...
10403 if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
10404 !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10405 bool Fast;
10406 if (Size >= 16 &&
10407 (memOpAlign(SrcAlign, DstAlign, 16) ||
10408 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
10409 return MVT::v2f64;
10410 } else if (Size >= 8 &&
10411 (memOpAlign(SrcAlign, DstAlign, 8) ||
10412 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
10413 Fast))) {
10414 return MVT::f64;
10415 }
10416 }
10417
10418 // Lower to i32/i16 if the size permits.
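  // The generic memcpy/memset expansion then issues as many accesses of the
  // returned type as needed to cover Size, using narrower types for any tail.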
10419 if (Size >= 4) 10420 return MVT::i32; 10421 else if (Size >= 2) 10422 return MVT::i16; 10423 10424 // Let the target-independent logic figure it out. 10425 return MVT::Other; 10426 } 10427 10428 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 10429 if (Val.getOpcode() != ISD::LOAD) 10430 return false; 10431 10432 EVT VT1 = Val.getValueType(); 10433 if (!VT1.isSimple() || !VT1.isInteger() || 10434 !VT2.isSimple() || !VT2.isInteger()) 10435 return false; 10436 10437 switch (VT1.getSimpleVT().SimpleTy) { 10438 default: break; 10439 case MVT::i1: 10440 case MVT::i8: 10441 case MVT::i16: 10442 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 10443 return true; 10444 } 10445 10446 return false; 10447 } 10448 10449 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 10450 EVT VT = ExtVal.getValueType(); 10451 10452 if (!isTypeLegal(VT)) 10453 return false; 10454 10455 // Don't create a loadext if we can fold the extension into a wide/long 10456 // instruction. 10457 // If there's more than one user instruction, the loadext is desirable no 10458 // matter what. There can be two uses by the same instruction. 10459 if (ExtVal->use_empty() || 10460 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 10461 return true; 10462 10463 SDNode *U = *ExtVal->use_begin(); 10464 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 10465 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 10466 return false; 10467 10468 return true; 10469 } 10470 10471 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 10472 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 10473 return false; 10474 10475 if (!isTypeLegal(EVT::getEVT(Ty1))) 10476 return false; 10477 10478 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 10479 10480 // Assuming the caller doesn't have a zeroext or signext return parameter, 10481 // truncation all the way down to i1 is valid. 10482 return true; 10483 } 10484 10485 10486 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 10487 if (V < 0) 10488 return false; 10489 10490 unsigned Scale = 1; 10491 switch (VT.getSimpleVT().SimpleTy) { 10492 default: return false; 10493 case MVT::i1: 10494 case MVT::i8: 10495 // Scale == 1; 10496 break; 10497 case MVT::i16: 10498 // Scale == 2; 10499 Scale = 2; 10500 break; 10501 case MVT::i32: 10502 // Scale == 4; 10503 Scale = 4; 10504 break; 10505 } 10506 10507 if ((V & (Scale - 1)) != 0) 10508 return false; 10509 V /= Scale; 10510 return V == (V & ((1LL << 5) - 1)); 10511 } 10512 10513 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 10514 const ARMSubtarget *Subtarget) { 10515 bool isNeg = false; 10516 if (V < 0) { 10517 isNeg = true; 10518 V = - V; 10519 } 10520 10521 switch (VT.getSimpleVT().SimpleTy) { 10522 default: return false; 10523 case MVT::i1: 10524 case MVT::i8: 10525 case MVT::i16: 10526 case MVT::i32: 10527 // + imm12 or - imm8 10528 if (isNeg) 10529 return V == (V & ((1LL << 8) - 1)); 10530 return V == (V & ((1LL << 12) - 1)); 10531 case MVT::f32: 10532 case MVT::f64: 10533 // Same as ARM mode. FIXME: NEON? 10534 if (!Subtarget->hasVFP2()) 10535 return false; 10536 if ((V & 3) != 0) 10537 return false; 10538 V >>= 2; 10539 return V == (V & ((1LL << 8) - 1)); 10540 } 10541 } 10542 10543 /// isLegalAddressImmediate - Return true if the integer value can be used 10544 /// as the offset of the target addressing mode for load / store of the 10545 /// given type. 
10546 static bool isLegalAddressImmediate(int64_t V, EVT VT, 10547 const ARMSubtarget *Subtarget) { 10548 if (V == 0) 10549 return true; 10550 10551 if (!VT.isSimple()) 10552 return false; 10553 10554 if (Subtarget->isThumb1Only()) 10555 return isLegalT1AddressImmediate(V, VT); 10556 else if (Subtarget->isThumb2()) 10557 return isLegalT2AddressImmediate(V, VT, Subtarget); 10558 10559 // ARM mode. 10560 if (V < 0) 10561 V = - V; 10562 switch (VT.getSimpleVT().SimpleTy) { 10563 default: return false; 10564 case MVT::i1: 10565 case MVT::i8: 10566 case MVT::i32: 10567 // +- imm12 10568 return V == (V & ((1LL << 12) - 1)); 10569 case MVT::i16: 10570 // +- imm8 10571 return V == (V & ((1LL << 8) - 1)); 10572 case MVT::f32: 10573 case MVT::f64: 10574 if (!Subtarget->hasVFP2()) // FIXME: NEON? 10575 return false; 10576 if ((V & 3) != 0) 10577 return false; 10578 V >>= 2; 10579 return V == (V & ((1LL << 8) - 1)); 10580 } 10581 } 10582 10583 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 10584 EVT VT) const { 10585 int Scale = AM.Scale; 10586 if (Scale < 0) 10587 return false; 10588 10589 switch (VT.getSimpleVT().SimpleTy) { 10590 default: return false; 10591 case MVT::i1: 10592 case MVT::i8: 10593 case MVT::i16: 10594 case MVT::i32: 10595 if (Scale == 1) 10596 return true; 10597 // r + r << imm 10598 Scale = Scale & ~1; 10599 return Scale == 2 || Scale == 4 || Scale == 8; 10600 case MVT::i64: 10601 // r + r 10602 if (((unsigned)AM.HasBaseReg + Scale) <= 2) 10603 return true; 10604 return false; 10605 case MVT::isVoid: 10606 // Note, we allow "void" uses (basically, uses that aren't loads or 10607 // stores), because arm allows folding a scale into many arithmetic 10608 // operations. This should be made more precise and revisited later. 10609 10610 // Allow r << imm, but the imm has to be a multiple of two. 10611 if (Scale & 1) return false; 10612 return isPowerOf2_32(Scale); 10613 } 10614 } 10615 10616 /// isLegalAddressingMode - Return true if the addressing mode represented 10617 /// by AM is legal for this target, for a load/store of the specified type. 10618 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 10619 const AddrMode &AM, Type *Ty, 10620 unsigned AS) const { 10621 EVT VT = getValueType(DL, Ty, true); 10622 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 10623 return false; 10624 10625 // Can never fold addr of global into load/store. 10626 if (AM.BaseGV) 10627 return false; 10628 10629 switch (AM.Scale) { 10630 case 0: // no scale reg, must be "r+i" or "r", or "i". 10631 break; 10632 case 1: 10633 if (Subtarget->isThumb1Only()) 10634 return false; 10635 // FALL THROUGH. 10636 default: 10637 // ARM doesn't support any R+R*scale+imm addr modes. 
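      // e.g. ldr r0, [r1, r2, lsl #2] is fine, but there is no form that also
      // folds in a nonzero immediate offset.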
10638 if (AM.BaseOffs) 10639 return false; 10640 10641 if (!VT.isSimple()) 10642 return false; 10643 10644 if (Subtarget->isThumb2()) 10645 return isLegalT2ScaledAddressingMode(AM, VT); 10646 10647 int Scale = AM.Scale; 10648 switch (VT.getSimpleVT().SimpleTy) { 10649 default: return false; 10650 case MVT::i1: 10651 case MVT::i8: 10652 case MVT::i32: 10653 if (Scale < 0) Scale = -Scale; 10654 if (Scale == 1) 10655 return true; 10656 // r + r << imm 10657 return isPowerOf2_32(Scale & ~1); 10658 case MVT::i16: 10659 case MVT::i64: 10660 // r + r 10661 if (((unsigned)AM.HasBaseReg + Scale) <= 2) 10662 return true; 10663 return false; 10664 10665 case MVT::isVoid: 10666 // Note, we allow "void" uses (basically, uses that aren't loads or 10667 // stores), because arm allows folding a scale into many arithmetic 10668 // operations. This should be made more precise and revisited later. 10669 10670 // Allow r << imm, but the imm has to be a multiple of two. 10671 if (Scale & 1) return false; 10672 return isPowerOf2_32(Scale); 10673 } 10674 } 10675 return true; 10676 } 10677 10678 /// isLegalICmpImmediate - Return true if the specified immediate is legal 10679 /// icmp immediate, that is the target has icmp instructions which can compare 10680 /// a register against the immediate without having to materialize the 10681 /// immediate into a register. 10682 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 10683 // Thumb2 and ARM modes can use cmn for negative immediates. 10684 if (!Subtarget->isThumb()) 10685 return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; 10686 if (Subtarget->isThumb2()) 10687 return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; 10688 // Thumb1 doesn't have cmn, and only 8-bit immediates. 10689 return Imm >= 0 && Imm <= 255; 10690 } 10691 10692 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 10693 /// *or sub* immediate, that is the target has add or sub instructions which can 10694 /// add a register with the immediate without having to materialize the 10695 /// immediate into a register. 10696 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 10697 // Same encoding for add/sub, just flip the sign. 10698 int64_t AbsImm = std::abs(Imm); 10699 if (!Subtarget->isThumb()) 10700 return ARM_AM::getSOImmVal(AbsImm) != -1; 10701 if (Subtarget->isThumb2()) 10702 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 10703 // Thumb1 only has 8-bit unsigned immediate. 
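  // (i.e. the adds/subs #imm8 encodings, so the magnitude must fit in 0-255).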
10704 return AbsImm >= 0 && AbsImm <= 255; 10705 } 10706 10707 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 10708 bool isSEXTLoad, SDValue &Base, 10709 SDValue &Offset, bool &isInc, 10710 SelectionDAG &DAG) { 10711 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 10712 return false; 10713 10714 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 10715 // AddressingMode 3 10716 Base = Ptr->getOperand(0); 10717 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 10718 int RHSC = (int)RHS->getZExtValue(); 10719 if (RHSC < 0 && RHSC > -256) { 10720 assert(Ptr->getOpcode() == ISD::ADD); 10721 isInc = false; 10722 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 10723 return true; 10724 } 10725 } 10726 isInc = (Ptr->getOpcode() == ISD::ADD); 10727 Offset = Ptr->getOperand(1); 10728 return true; 10729 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 10730 // AddressingMode 2 10731 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 10732 int RHSC = (int)RHS->getZExtValue(); 10733 if (RHSC < 0 && RHSC > -0x1000) { 10734 assert(Ptr->getOpcode() == ISD::ADD); 10735 isInc = false; 10736 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 10737 Base = Ptr->getOperand(0); 10738 return true; 10739 } 10740 } 10741 10742 if (Ptr->getOpcode() == ISD::ADD) { 10743 isInc = true; 10744 ARM_AM::ShiftOpc ShOpcVal= 10745 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 10746 if (ShOpcVal != ARM_AM::no_shift) { 10747 Base = Ptr->getOperand(1); 10748 Offset = Ptr->getOperand(0); 10749 } else { 10750 Base = Ptr->getOperand(0); 10751 Offset = Ptr->getOperand(1); 10752 } 10753 return true; 10754 } 10755 10756 isInc = (Ptr->getOpcode() == ISD::ADD); 10757 Base = Ptr->getOperand(0); 10758 Offset = Ptr->getOperand(1); 10759 return true; 10760 } 10761 10762 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 10763 return false; 10764 } 10765 10766 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 10767 bool isSEXTLoad, SDValue &Base, 10768 SDValue &Offset, bool &isInc, 10769 SelectionDAG &DAG) { 10770 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 10771 return false; 10772 10773 Base = Ptr->getOperand(0); 10774 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 10775 int RHSC = (int)RHS->getZExtValue(); 10776 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 10777 assert(Ptr->getOpcode() == ISD::ADD); 10778 isInc = false; 10779 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 10780 return true; 10781 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 10782 isInc = Ptr->getOpcode() == ISD::ADD; 10783 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 10784 return true; 10785 } 10786 } 10787 10788 return false; 10789 } 10790 10791 /// getPreIndexedAddressParts - returns true by value, base pointer and 10792 /// offset pointer and addressing mode by reference if the node's address 10793 /// can be legally represented as pre-indexed load / store address. 
10794 bool 10795 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 10796 SDValue &Offset, 10797 ISD::MemIndexedMode &AM, 10798 SelectionDAG &DAG) const { 10799 if (Subtarget->isThumb1Only()) 10800 return false; 10801 10802 EVT VT; 10803 SDValue Ptr; 10804 bool isSEXTLoad = false; 10805 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 10806 Ptr = LD->getBasePtr(); 10807 VT = LD->getMemoryVT(); 10808 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 10809 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 10810 Ptr = ST->getBasePtr(); 10811 VT = ST->getMemoryVT(); 10812 } else 10813 return false; 10814 10815 bool isInc; 10816 bool isLegal = false; 10817 if (Subtarget->isThumb2()) 10818 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 10819 Offset, isInc, DAG); 10820 else 10821 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 10822 Offset, isInc, DAG); 10823 if (!isLegal) 10824 return false; 10825 10826 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 10827 return true; 10828 } 10829 10830 /// getPostIndexedAddressParts - returns true by value, base pointer and 10831 /// offset pointer and addressing mode by reference if this node can be 10832 /// combined with a load / store to form a post-indexed load / store. 10833 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 10834 SDValue &Base, 10835 SDValue &Offset, 10836 ISD::MemIndexedMode &AM, 10837 SelectionDAG &DAG) const { 10838 if (Subtarget->isThumb1Only()) 10839 return false; 10840 10841 EVT VT; 10842 SDValue Ptr; 10843 bool isSEXTLoad = false; 10844 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 10845 VT = LD->getMemoryVT(); 10846 Ptr = LD->getBasePtr(); 10847 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 10848 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 10849 VT = ST->getMemoryVT(); 10850 Ptr = ST->getBasePtr(); 10851 } else 10852 return false; 10853 10854 bool isInc; 10855 bool isLegal = false; 10856 if (Subtarget->isThumb2()) 10857 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 10858 isInc, DAG); 10859 else 10860 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 10861 isInc, DAG); 10862 if (!isLegal) 10863 return false; 10864 10865 if (Ptr != Base) { 10866 // Swap base ptr and offset to catch more post-index load / store when 10867 // it's legal. In Thumb2 mode, offset must be an immediate. 10868 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 10869 !Subtarget->isThumb2()) 10870 std::swap(Base, Offset); 10871 10872 // Post-indexed load / store update the base pointer. 10873 if (Ptr != Base) 10874 return false; 10875 } 10876 10877 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 10878 return true; 10879 } 10880 10881 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 10882 APInt &KnownZero, 10883 APInt &KnownOne, 10884 const SelectionDAG &DAG, 10885 unsigned Depth) const { 10886 unsigned BitWidth = KnownOne.getBitWidth(); 10887 KnownZero = KnownOne = APInt(BitWidth, 0); 10888 switch (Op.getOpcode()) { 10889 default: break; 10890 case ARMISD::ADDC: 10891 case ARMISD::ADDE: 10892 case ARMISD::SUBC: 10893 case ARMISD::SUBE: 10894 // These nodes' second result is a boolean 10895 if (Op.getResNo() == 0) 10896 break; 10897 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 10898 break; 10899 case ARMISD::CMOV: { 10900 // Bits are known zero/one if known on the LHS and RHS. 
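    // A CMOV selects one of its two inputs, so only the bits on which both
    // inputs agree are known in the result.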
10901 DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); 10902 if (KnownZero == 0 && KnownOne == 0) return; 10903 10904 APInt KnownZeroRHS, KnownOneRHS; 10905 DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); 10906 KnownZero &= KnownZeroRHS; 10907 KnownOne &= KnownOneRHS; 10908 return; 10909 } 10910 case ISD::INTRINSIC_W_CHAIN: { 10911 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 10912 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 10913 switch (IntID) { 10914 default: return; 10915 case Intrinsic::arm_ldaex: 10916 case Intrinsic::arm_ldrex: { 10917 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 10918 unsigned MemBits = VT.getScalarType().getSizeInBits(); 10919 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 10920 return; 10921 } 10922 } 10923 } 10924 } 10925 } 10926 10927 //===----------------------------------------------------------------------===// 10928 // ARM Inline Assembly Support 10929 //===----------------------------------------------------------------------===// 10930 10931 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 10932 // Looking for "rev" which is V6+. 10933 if (!Subtarget->hasV6Ops()) 10934 return false; 10935 10936 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10937 std::string AsmStr = IA->getAsmString(); 10938 SmallVector<StringRef, 4> AsmPieces; 10939 SplitString(AsmStr, AsmPieces, ";\n"); 10940 10941 switch (AsmPieces.size()) { 10942 default: return false; 10943 case 1: 10944 AsmStr = AsmPieces[0]; 10945 AsmPieces.clear(); 10946 SplitString(AsmStr, AsmPieces, " \t,"); 10947 10948 // rev $0, $1 10949 if (AsmPieces.size() == 3 && 10950 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 10951 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 10952 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10953 if (Ty && Ty->getBitWidth() == 32) 10954 return IntrinsicLowering::LowerToByteSwap(CI); 10955 } 10956 break; 10957 } 10958 10959 return false; 10960 } 10961 10962 /// getConstraintType - Given a constraint letter, return the type of 10963 /// constraint it is for this target. 10964 ARMTargetLowering::ConstraintType 10965 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 10966 if (Constraint.size() == 1) { 10967 switch (Constraint[0]) { 10968 default: break; 10969 case 'l': return C_RegisterClass; 10970 case 'w': return C_RegisterClass; 10971 case 'h': return C_RegisterClass; 10972 case 'x': return C_RegisterClass; 10973 case 't': return C_RegisterClass; 10974 case 'j': return C_Other; // Constant for movw. 10975 // An address with a single base register. Due to the way we 10976 // currently handle addresses it is the same as an 'r' memory constraint. 10977 case 'Q': return C_Memory; 10978 } 10979 } else if (Constraint.size() == 2) { 10980 switch (Constraint[0]) { 10981 default: break; 10982 // All 'U+' constraints are addresses. 10983 case 'U': return C_Memory; 10984 } 10985 } 10986 return TargetLowering::getConstraintType(Constraint); 10987 } 10988 10989 /// Examine constraint type and operand type and determine a weight value. 10990 /// This object must already have been set up with the operand type 10991 /// and the current alternative constraint selected. 
10992 TargetLowering::ConstraintWeight 10993 ARMTargetLowering::getSingleConstraintMatchWeight( 10994 AsmOperandInfo &info, const char *constraint) const { 10995 ConstraintWeight weight = CW_Invalid; 10996 Value *CallOperandVal = info.CallOperandVal; 10997 // If we don't have a value, we can't do a match, 10998 // but allow it at the lowest weight. 10999 if (!CallOperandVal) 11000 return CW_Default; 11001 Type *type = CallOperandVal->getType(); 11002 // Look at the constraint type. 11003 switch (*constraint) { 11004 default: 11005 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11006 break; 11007 case 'l': 11008 if (type->isIntegerTy()) { 11009 if (Subtarget->isThumb()) 11010 weight = CW_SpecificReg; 11011 else 11012 weight = CW_Register; 11013 } 11014 break; 11015 case 'w': 11016 if (type->isFloatingPointTy()) 11017 weight = CW_Register; 11018 break; 11019 } 11020 return weight; 11021 } 11022 11023 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; 11024 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 11025 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 11026 if (Constraint.size() == 1) { 11027 // GCC ARM Constraint Letters 11028 switch (Constraint[0]) { 11029 case 'l': // Low regs or general regs. 11030 if (Subtarget->isThumb()) 11031 return RCPair(0U, &ARM::tGPRRegClass); 11032 return RCPair(0U, &ARM::GPRRegClass); 11033 case 'h': // High regs or no regs. 11034 if (Subtarget->isThumb()) 11035 return RCPair(0U, &ARM::hGPRRegClass); 11036 break; 11037 case 'r': 11038 if (Subtarget->isThumb1Only()) 11039 return RCPair(0U, &ARM::tGPRRegClass); 11040 return RCPair(0U, &ARM::GPRRegClass); 11041 case 'w': 11042 if (VT == MVT::Other) 11043 break; 11044 if (VT == MVT::f32) 11045 return RCPair(0U, &ARM::SPRRegClass); 11046 if (VT.getSizeInBits() == 64) 11047 return RCPair(0U, &ARM::DPRRegClass); 11048 if (VT.getSizeInBits() == 128) 11049 return RCPair(0U, &ARM::QPRRegClass); 11050 break; 11051 case 'x': 11052 if (VT == MVT::Other) 11053 break; 11054 if (VT == MVT::f32) 11055 return RCPair(0U, &ARM::SPR_8RegClass); 11056 if (VT.getSizeInBits() == 64) 11057 return RCPair(0U, &ARM::DPR_8RegClass); 11058 if (VT.getSizeInBits() == 128) 11059 return RCPair(0U, &ARM::QPR_8RegClass); 11060 break; 11061 case 't': 11062 if (VT == MVT::f32) 11063 return RCPair(0U, &ARM::SPRRegClass); 11064 break; 11065 } 11066 } 11067 if (StringRef("{cc}").equals_lower(Constraint)) 11068 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 11069 11070 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 11071 } 11072 11073 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 11074 /// vector. If it is invalid, don't add anything to Ops. 11075 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 11076 std::string &Constraint, 11077 std::vector<SDValue>&Ops, 11078 SelectionDAG &DAG) const { 11079 SDValue Result; 11080 11081 // Currently only support length 1 constraints. 11082 if (Constraint.length() != 1) return; 11083 11084 char ConstraintLetter = Constraint[0]; 11085 switch (ConstraintLetter) { 11086 default: break; 11087 case 'j': 11088 case 'I': case 'J': case 'K': case 'L': 11089 case 'M': case 'N': case 'O': 11090 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 11091 if (!C) 11092 return; 11093 11094 int64_t CVal64 = C->getSExtValue(); 11095 int CVal = (int) CVal64; 11096 // None of these constraints allow values larger than 32 bits. Check 11097 // that the value fits in an int. 
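    // Bail out rather than silently truncating an out-of-range constant.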
11098 if (CVal != CVal64) 11099 return; 11100 11101 switch (ConstraintLetter) { 11102 case 'j': 11103 // Constant suitable for movw, must be between 0 and 11104 // 65535. 11105 if (Subtarget->hasV6T2Ops()) 11106 if (CVal >= 0 && CVal <= 65535) 11107 break; 11108 return; 11109 case 'I': 11110 if (Subtarget->isThumb1Only()) { 11111 // This must be a constant between 0 and 255, for ADD 11112 // immediates. 11113 if (CVal >= 0 && CVal <= 255) 11114 break; 11115 } else if (Subtarget->isThumb2()) { 11116 // A constant that can be used as an immediate value in a 11117 // data-processing instruction. 11118 if (ARM_AM::getT2SOImmVal(CVal) != -1) 11119 break; 11120 } else { 11121 // A constant that can be used as an immediate value in a 11122 // data-processing instruction. 11123 if (ARM_AM::getSOImmVal(CVal) != -1) 11124 break; 11125 } 11126 return; 11127 11128 case 'J': 11129 if (Subtarget->isThumb()) { // FIXME thumb2 11130 // This must be a constant between -255 and -1, for negated ADD 11131 // immediates. This can be used in GCC with an "n" modifier that 11132 // prints the negated value, for use with SUB instructions. It is 11133 // not useful otherwise but is implemented for compatibility. 11134 if (CVal >= -255 && CVal <= -1) 11135 break; 11136 } else { 11137 // This must be a constant between -4095 and 4095. It is not clear 11138 // what this constraint is intended for. Implemented for 11139 // compatibility with GCC. 11140 if (CVal >= -4095 && CVal <= 4095) 11141 break; 11142 } 11143 return; 11144 11145 case 'K': 11146 if (Subtarget->isThumb1Only()) { 11147 // A 32-bit value where only one byte has a nonzero value. Exclude 11148 // zero to match GCC. This constraint is used by GCC internally for 11149 // constants that can be loaded with a move/shift combination. 11150 // It is not useful otherwise but is implemented for compatibility. 11151 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 11152 break; 11153 } else if (Subtarget->isThumb2()) { 11154 // A constant whose bitwise inverse can be used as an immediate 11155 // value in a data-processing instruction. This can be used in GCC 11156 // with a "B" modifier that prints the inverted value, for use with 11157 // BIC and MVN instructions. It is not useful otherwise but is 11158 // implemented for compatibility. 11159 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 11160 break; 11161 } else { 11162 // A constant whose bitwise inverse can be used as an immediate 11163 // value in a data-processing instruction. This can be used in GCC 11164 // with a "B" modifier that prints the inverted value, for use with 11165 // BIC and MVN instructions. It is not useful otherwise but is 11166 // implemented for compatibility. 11167 if (ARM_AM::getSOImmVal(~CVal) != -1) 11168 break; 11169 } 11170 return; 11171 11172 case 'L': 11173 if (Subtarget->isThumb1Only()) { 11174 // This must be a constant between -7 and 7, 11175 // for 3-operand ADD/SUB immediate instructions. 11176 if (CVal >= -7 && CVal < 7) 11177 break; 11178 } else if (Subtarget->isThumb2()) { 11179 // A constant whose negation can be used as an immediate value in a 11180 // data-processing instruction. This can be used in GCC with an "n" 11181 // modifier that prints the negated value, for use with SUB 11182 // instructions. It is not useful otherwise but is implemented for 11183 // compatibility. 11184 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 11185 break; 11186 } else { 11187 // A constant whose negation can be used as an immediate value in a 11188 // data-processing instruction. 
This can be used in GCC with an "n" 11189 // modifier that prints the negated value, for use with SUB 11190 // instructions. It is not useful otherwise but is implemented for 11191 // compatibility. 11192 if (ARM_AM::getSOImmVal(-CVal) != -1) 11193 break; 11194 } 11195 return; 11196 11197 case 'M': 11198 if (Subtarget->isThumb()) { // FIXME thumb2 11199 // This must be a multiple of 4 between 0 and 1020, for 11200 // ADD sp + immediate. 11201 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 11202 break; 11203 } else { 11204 // A power of two or a constant between 0 and 32. This is used in 11205 // GCC for the shift amount on shifted register operands, but it is 11206 // useful in general for any shift amounts. 11207 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 11208 break; 11209 } 11210 return; 11211 11212 case 'N': 11213 if (Subtarget->isThumb()) { // FIXME thumb2 11214 // This must be a constant between 0 and 31, for shift amounts. 11215 if (CVal >= 0 && CVal <= 31) 11216 break; 11217 } 11218 return; 11219 11220 case 'O': 11221 if (Subtarget->isThumb()) { // FIXME thumb2 11222 // This must be a multiple of 4 between -508 and 508, for 11223 // ADD/SUB sp = sp + immediate. 11224 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 11225 break; 11226 } 11227 return; 11228 } 11229 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 11230 break; 11231 } 11232 11233 if (Result.getNode()) { 11234 Ops.push_back(Result); 11235 return; 11236 } 11237 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 11238 } 11239 11240 static RTLIB::Libcall getDivRemLibcall( 11241 const SDNode *N, MVT::SimpleValueType SVT) { 11242 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 11243 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 11244 "Unhandled Opcode in getDivRemLibcall"); 11245 bool isSigned = N->getOpcode() == ISD::SDIVREM || 11246 N->getOpcode() == ISD::SREM; 11247 RTLIB::Libcall LC; 11248 switch (SVT) { 11249 default: llvm_unreachable("Unexpected request for libcall!"); 11250 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 11251 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 11252 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 11253 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 11254 } 11255 return LC; 11256 } 11257 11258 static TargetLowering::ArgListTy getDivRemArgList( 11259 const SDNode *N, LLVMContext *Context) { 11260 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 11261 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 11262 "Unhandled Opcode in getDivRemArgList"); 11263 bool isSigned = N->getOpcode() == ISD::SDIVREM || 11264 N->getOpcode() == ISD::SREM; 11265 TargetLowering::ArgListTy Args; 11266 TargetLowering::ArgListEntry Entry; 11267 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 11268 EVT ArgVT = N->getOperand(i).getValueType(); 11269 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 11270 Entry.Node = N->getOperand(i); 11271 Entry.Ty = ArgTy; 11272 Entry.isSExt = isSigned; 11273 Entry.isZExt = !isSigned; 11274 Args.push_back(Entry); 11275 } 11276 return Args; 11277 } 11278 11279 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 11280 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && 11281 "Register-based DivRem lowering only"); 11282 unsigned Opcode = Op->getOpcode(); 11283 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 11284 "Invalid opcode for Div/Rem lowering"); 11285 bool isSigned = (Opcode == ISD::SDIVREM); 11286 EVT VT = Op->getValueType(0); 11287 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 11288 11289 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 11290 VT.getSimpleVT().SimpleTy); 11291 SDValue InChain = DAG.getEntryNode(); 11292 11293 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 11294 DAG.getContext()); 11295 11296 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 11297 getPointerTy(DAG.getDataLayout())); 11298 11299 Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); 11300 11301 SDLoc dl(Op); 11302 TargetLowering::CallLoweringInfo CLI(DAG); 11303 CLI.setDebugLoc(dl).setChain(InChain) 11304 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0) 11305 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 11306 11307 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 11308 return CallInfo.first; 11309 } 11310 11311 // Lowers REM using divmod helpers 11312 // see RTABI section 4.2/4.3 11313 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 11314 // Build return types (div and rem) 11315 std::vector<Type*> RetTyParams; 11316 Type *RetTyElement; 11317 11318 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 11319 default: llvm_unreachable("Unexpected request for libcall!"); 11320 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 11321 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 11322 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 11323 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 11324 } 11325 11326 RetTyParams.push_back(RetTyElement); 11327 RetTyParams.push_back(RetTyElement); 11328 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 11329 Type *RetTy = StructType::get(*DAG.getContext(), ret); 11330 11331 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
11332 SimpleTy); 11333 SDValue InChain = DAG.getEntryNode(); 11334 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); 11335 bool isSigned = N->getOpcode() == ISD::SREM; 11336 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 11337 getPointerTy(DAG.getDataLayout())); 11338 11339 // Lower call 11340 CallLoweringInfo CLI(DAG); 11341 CLI.setChain(InChain) 11342 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) 11343 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 11344 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 11345 11346 // Return second (rem) result operand (first contains div) 11347 SDNode *ResNode = CallResult.first.getNode(); 11348 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 11349 return ResNode->getOperand(1); 11350 } 11351 11352 SDValue 11353 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 11354 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 11355 SDLoc DL(Op); 11356 11357 // Get the inputs. 11358 SDValue Chain = Op.getOperand(0); 11359 SDValue Size = Op.getOperand(1); 11360 11361 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 11362 DAG.getConstant(2, DL, MVT::i32)); 11363 11364 SDValue Flag; 11365 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 11366 Flag = Chain.getValue(1); 11367 11368 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 11369 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 11370 11371 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 11372 Chain = NewSP.getValue(1); 11373 11374 SDValue Ops[2] = { NewSP, Chain }; 11375 return DAG.getMergeValues(Ops, DL); 11376 } 11377 11378 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 11379 assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && 11380 "Unexpected type for custom-lowering FP_EXTEND"); 11381 11382 RTLIB::Libcall LC; 11383 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 11384 11385 SDValue SrcVal = Op.getOperand(0); 11386 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 11387 SDLoc(Op)).first; 11388 } 11389 11390 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 11391 assert(Op.getOperand(0).getValueType() == MVT::f64 && 11392 Subtarget->isFPOnlySP() && 11393 "Unexpected type for custom-lowering FP_ROUND"); 11394 11395 RTLIB::Libcall LC; 11396 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 11397 11398 SDValue SrcVal = Op.getOperand(0); 11399 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 11400 SDLoc(Op)).first; 11401 } 11402 11403 bool 11404 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 11405 // The ARM target isn't yet aware of offsets. 11406 return false; 11407 } 11408 11409 bool ARM::isBitFieldInvertedMask(unsigned v) { 11410 if (v == 0xffffffff) 11411 return false; 11412 11413 // there can be 1's on either or both "outsides", all the "inside" 11414 // bits must be 0's 11415 return isShiftedMask_32(~v); 11416 } 11417 11418 /// isFPImmLegal - Returns true if the target can instruction select the 11419 /// specified FP immediate natively. If false, the legalizer will 11420 /// materialize the FP immediate as a load from a constant pool. 
11421 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 11422 if (!Subtarget->hasVFP3()) 11423 return false; 11424 if (VT == MVT::f32) 11425 return ARM_AM::getFP32Imm(Imm) != -1; 11426 if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) 11427 return ARM_AM::getFP64Imm(Imm) != -1; 11428 return false; 11429 } 11430 11431 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 11432 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 11433 /// specified in the intrinsic calls. 11434 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 11435 const CallInst &I, 11436 unsigned Intrinsic) const { 11437 switch (Intrinsic) { 11438 case Intrinsic::arm_neon_vld1: 11439 case Intrinsic::arm_neon_vld2: 11440 case Intrinsic::arm_neon_vld3: 11441 case Intrinsic::arm_neon_vld4: 11442 case Intrinsic::arm_neon_vld2lane: 11443 case Intrinsic::arm_neon_vld3lane: 11444 case Intrinsic::arm_neon_vld4lane: { 11445 Info.opc = ISD::INTRINSIC_W_CHAIN; 11446 // Conservatively set memVT to the entire set of vectors loaded. 11447 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 11448 uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; 11449 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 11450 Info.ptrVal = I.getArgOperand(0); 11451 Info.offset = 0; 11452 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 11453 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 11454 Info.vol = false; // volatile loads with NEON intrinsics not supported 11455 Info.readMem = true; 11456 Info.writeMem = false; 11457 return true; 11458 } 11459 case Intrinsic::arm_neon_vst1: 11460 case Intrinsic::arm_neon_vst2: 11461 case Intrinsic::arm_neon_vst3: 11462 case Intrinsic::arm_neon_vst4: 11463 case Intrinsic::arm_neon_vst2lane: 11464 case Intrinsic::arm_neon_vst3lane: 11465 case Intrinsic::arm_neon_vst4lane: { 11466 Info.opc = ISD::INTRINSIC_VOID; 11467 // Conservatively set memVT to the entire set of vectors stored. 
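    // The vector data operands follow the pointer operand; the loop below
    // stops at the first trailing scalar operand (lane index / alignment).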
11468 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 11469 unsigned NumElts = 0; 11470 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 11471 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 11472 if (!ArgTy->isVectorTy()) 11473 break; 11474 NumElts += DL.getTypeAllocSize(ArgTy) / 8; 11475 } 11476 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 11477 Info.ptrVal = I.getArgOperand(0); 11478 Info.offset = 0; 11479 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 11480 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 11481 Info.vol = false; // volatile stores with NEON intrinsics not supported 11482 Info.readMem = false; 11483 Info.writeMem = true; 11484 return true; 11485 } 11486 case Intrinsic::arm_ldaex: 11487 case Intrinsic::arm_ldrex: { 11488 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 11489 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 11490 Info.opc = ISD::INTRINSIC_W_CHAIN; 11491 Info.memVT = MVT::getVT(PtrTy->getElementType()); 11492 Info.ptrVal = I.getArgOperand(0); 11493 Info.offset = 0; 11494 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 11495 Info.vol = true; 11496 Info.readMem = true; 11497 Info.writeMem = false; 11498 return true; 11499 } 11500 case Intrinsic::arm_stlex: 11501 case Intrinsic::arm_strex: { 11502 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 11503 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 11504 Info.opc = ISD::INTRINSIC_W_CHAIN; 11505 Info.memVT = MVT::getVT(PtrTy->getElementType()); 11506 Info.ptrVal = I.getArgOperand(1); 11507 Info.offset = 0; 11508 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 11509 Info.vol = true; 11510 Info.readMem = false; 11511 Info.writeMem = true; 11512 return true; 11513 } 11514 case Intrinsic::arm_stlexd: 11515 case Intrinsic::arm_strexd: { 11516 Info.opc = ISD::INTRINSIC_W_CHAIN; 11517 Info.memVT = MVT::i64; 11518 Info.ptrVal = I.getArgOperand(2); 11519 Info.offset = 0; 11520 Info.align = 8; 11521 Info.vol = true; 11522 Info.readMem = false; 11523 Info.writeMem = true; 11524 return true; 11525 } 11526 case Intrinsic::arm_ldaexd: 11527 case Intrinsic::arm_ldrexd: { 11528 Info.opc = ISD::INTRINSIC_W_CHAIN; 11529 Info.memVT = MVT::i64; 11530 Info.ptrVal = I.getArgOperand(0); 11531 Info.offset = 0; 11532 Info.align = 8; 11533 Info.vol = true; 11534 Info.readMem = true; 11535 Info.writeMem = false; 11536 return true; 11537 } 11538 default: 11539 break; 11540 } 11541 11542 return false; 11543 } 11544 11545 /// \brief Returns true if it is beneficial to convert a load of a constant 11546 /// to just the constant itself. 11547 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 11548 Type *Ty) const { 11549 assert(Ty->isIntegerTy()); 11550 11551 unsigned Bits = Ty->getPrimitiveSizeInBits(); 11552 if (Bits == 0 || Bits > 32) 11553 return false; 11554 return true; 11555 } 11556 11557 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 11558 ARM_MB::MemBOpt Domain) const { 11559 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11560 11561 // First, if the target has no DMB, see what fallback we can use. 11562 if (!Subtarget->hasDataBarrier()) { 11563 // Some ARMv6 cpus can support data barriers with an mcr instruction. 11564 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 11565 // here. 
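    // On those ARMv6 cores the barrier is "mcr p15, 0, <Rt>, c7, c10, 5",
    // the CP15 data memory barrier operation, which is what is emitted below.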
11566 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 11567 Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 11568 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 11569 Builder.getInt32(0), Builder.getInt32(7), 11570 Builder.getInt32(10), Builder.getInt32(5)}; 11571 return Builder.CreateCall(MCR, args); 11572 } else { 11573 // Instead of using barriers, atomic accesses on these subtargets use 11574 // libcalls. 11575 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 11576 } 11577 } else { 11578 Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 11579 // Only a full system barrier exists in the M-class architectures. 11580 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 11581 Constant *CDomain = Builder.getInt32(Domain); 11582 return Builder.CreateCall(DMB, CDomain); 11583 } 11584 } 11585 11586 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 11587 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 11588 AtomicOrdering Ord, bool IsStore, 11589 bool IsLoad) const { 11590 if (!getInsertFencesForAtomic()) 11591 return nullptr; 11592 11593 switch (Ord) { 11594 case NotAtomic: 11595 case Unordered: 11596 llvm_unreachable("Invalid fence: unordered/non-atomic"); 11597 case Monotonic: 11598 case Acquire: 11599 return nullptr; // Nothing to do 11600 case SequentiallyConsistent: 11601 if (!IsStore) 11602 return nullptr; // Nothing to do 11603 /*FALLTHROUGH*/ 11604 case Release: 11605 case AcquireRelease: 11606 if (Subtarget->isSwift()) 11607 return makeDMB(Builder, ARM_MB::ISHST); 11608 // FIXME: add a comment with a link to documentation justifying this. 11609 else 11610 return makeDMB(Builder, ARM_MB::ISH); 11611 } 11612 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 11613 } 11614 11615 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 11616 AtomicOrdering Ord, bool IsStore, 11617 bool IsLoad) const { 11618 if (!getInsertFencesForAtomic()) 11619 return nullptr; 11620 11621 switch (Ord) { 11622 case NotAtomic: 11623 case Unordered: 11624 llvm_unreachable("Invalid fence: unordered/not-atomic"); 11625 case Monotonic: 11626 case Release: 11627 return nullptr; // Nothing to do 11628 case Acquire: 11629 case AcquireRelease: 11630 case SequentiallyConsistent: 11631 return makeDMB(Builder, ARM_MB::ISH); 11632 } 11633 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 11634 } 11635 11636 // Loads and stores less than 64-bits are already atomic; ones above that 11637 // are doomed anyway, so defer to the default libcall and blame the OS when 11638 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 11639 // anything for those. 11640 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 11641 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 11642 return (Size == 64) && !Subtarget->isMClass(); 11643 } 11644 11645 // Loads and stores less than 64-bits are already atomic; ones above that 11646 // are doomed anyway, so defer to the default libcall and blame the OS when 11647 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 11648 // anything for those. 11649 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that 11650 // guarantee, see DDI0406C ARM architecture reference manual, 11651 // sections A8.8.72-74 LDRD) 11652 TargetLowering::AtomicExpansionKind 11653 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 11654 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 11655 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC 11656 : AtomicExpansionKind::None; 11657 } 11658 11659 // For the real atomic operations, we have ldrex/strex up to 32 bits, 11660 // and up to 64 bits on the non-M profiles 11661 TargetLowering::AtomicExpansionKind 11662 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 11663 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 11664 return (Size <= (Subtarget->isMClass() ? 32U : 64U)) 11665 ? AtomicExpansionKind::LLSC 11666 : AtomicExpansionKind::None; 11667 } 11668 11669 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( 11670 AtomicCmpXchgInst *AI) const { 11671 return true; 11672 } 11673 11674 // This has so far only been implemented for MachO. 11675 bool ARMTargetLowering::useLoadStackGuardNode() const { 11676 return Subtarget->isTargetMachO(); 11677 } 11678 11679 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 11680 unsigned &Cost) const { 11681 // If we do not have NEON, vector types are not natively supported. 11682 if (!Subtarget->hasNEON()) 11683 return false; 11684 11685 // Floating point values and vector values map to the same register file. 11686 // Therefore, although we could do a store extract of a vector type, this is 11687 // better to leave at float as we have more freedom in the addressing mode for 11688 // those. 11689 if (VectorTy->isFPOrFPVectorTy()) 11690 return false; 11691 11692 // If the index is unknown at compile time, this is very expensive to lower 11693 // and it is not possible to combine the store with the extract. 11694 if (!isa<ConstantInt>(Idx)) 11695 return false; 11696 11697 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 11698 unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); 11699 // We can do a store + vector extract on any vector that fits perfectly in a D 11700 // or Q register. 11701 if (BitWidth == 64 || BitWidth == 128) { 11702 Cost = 0; 11703 return true; 11704 } 11705 return false; 11706 } 11707 11708 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 11709 AtomicOrdering Ord) const { 11710 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11711 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 11712 bool IsAcquire = isAtLeastAcquire(Ord); 11713 11714 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 11715 // intrinsic must return {i32, i32} and we have to recombine them into a 11716 // single i64 here. 11717 if (ValTy->getPrimitiveSizeInBits() == 64) { 11718 Intrinsic::ID Int = 11719 IsAcquire ? 
Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 11720 Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); 11721 11722 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 11723 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 11724 11725 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 11726 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 11727 if (!Subtarget->isLittle()) 11728 std::swap (Lo, Hi); 11729 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 11730 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 11731 return Builder.CreateOr( 11732 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); 11733 } 11734 11735 Type *Tys[] = { Addr->getType() }; 11736 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 11737 Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); 11738 11739 return Builder.CreateTruncOrBitCast( 11740 Builder.CreateCall(Ldrex, Addr), 11741 cast<PointerType>(Addr->getType())->getElementType()); 11742 } 11743 11744 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 11745 IRBuilder<> &Builder) const { 11746 if (!Subtarget->hasV7Ops()) 11747 return; 11748 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11749 Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 11750 } 11751 11752 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 11753 Value *Addr, 11754 AtomicOrdering Ord) const { 11755 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11756 bool IsRelease = isAtLeastRelease(Ord); 11757 11758 // Since the intrinsics must have legal type, the i64 intrinsics take two 11759 // parameters: "i32, i32". We must marshal Val into the appropriate form 11760 // before the call. 11761 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 11762 Intrinsic::ID Int = 11763 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 11764 Function *Strex = Intrinsic::getDeclaration(M, Int); 11765 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 11766 11767 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 11768 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 11769 if (!Subtarget->isLittle()) 11770 std::swap (Lo, Hi); 11771 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 11772 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 11773 } 11774 11775 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; 11776 Type *Tys[] = { Addr->getType() }; 11777 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 11778 11779 return Builder.CreateCall( 11780 Strex, {Builder.CreateZExtOrBitCast( 11781 Val, Strex->getFunctionType()->getParamType(0)), 11782 Addr}); 11783 } 11784 11785 /// \brief Lower an interleaved load into a vldN intrinsic. 11786 /// 11787 /// E.g. 
Lower an interleaved load (Factor = 2): 11788 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 11789 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 11790 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 11791 /// 11792 /// Into: 11793 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 11794 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 11795 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 11796 bool ARMTargetLowering::lowerInterleavedLoad( 11797 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 11798 ArrayRef<unsigned> Indices, unsigned Factor) const { 11799 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 11800 "Invalid interleave factor"); 11801 assert(!Shuffles.empty() && "Empty shufflevector input"); 11802 assert(Shuffles.size() == Indices.size() && 11803 "Unmatched number of shufflevectors and indices"); 11804 11805 VectorType *VecTy = Shuffles[0]->getType(); 11806 Type *EltTy = VecTy->getVectorElementType(); 11807 11808 const DataLayout &DL = LI->getModule()->getDataLayout(); 11809 unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); 11810 bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; 11811 11812 // Skip if we do not have NEON and skip illegal vector types and vector types 11813 // with i64/f64 elements (vldN doesn't support i64/f64 elements). 11814 if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) 11815 return false; 11816 11817 // A pointer vector can not be the return type of the ldN intrinsics. Need to 11818 // load integer vectors first and then convert to pointer vectors. 11819 if (EltTy->isPointerTy()) 11820 VecTy = 11821 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 11822 11823 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 11824 Intrinsic::arm_neon_vld3, 11825 Intrinsic::arm_neon_vld4}; 11826 11827 IRBuilder<> Builder(LI); 11828 SmallVector<Value *, 2> Ops; 11829 11830 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 11831 Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); 11832 Ops.push_back(Builder.getInt32(LI->getAlignment())); 11833 11834 Type *Tys[] = { VecTy, Int8Ptr }; 11835 Function *VldnFunc = 11836 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 11837 CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); 11838 11839 // Replace uses of each shufflevector with the corresponding vector loaded 11840 // by ldN. 11841 for (unsigned i = 0; i < Shuffles.size(); i++) { 11842 ShuffleVectorInst *SV = Shuffles[i]; 11843 unsigned Index = Indices[i]; 11844 11845 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 11846 11847 // Convert the integer vector to pointer vector if the element is pointer. 11848 if (EltTy->isPointerTy()) 11849 SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); 11850 11851 SV->replaceAllUsesWith(SubVec); 11852 } 11853 11854 return true; 11855 } 11856 11857 /// \brief Get a mask consisting of sequential integers starting from \p Start. 11858 /// 11859 /// I.e. 
static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
                                   unsigned NumElts) {
  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    Mask.push_back(Builder.getInt32(Start + i));

  return ConstantVector::get(Mask);
}

/// \brief Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                         <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;

  // Skip if we do not have NEON, if the sub vector type is illegal, or if it
  // has i64/f64 elements (vstN doesn't support i64/f64 elements).
  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
      EltIs64Bits)
    return false;

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // The vstN intrinsics don't support pointer vectors as arguments. Convert
  // pointer vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, NumSubElts);
  }

  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};
  SmallVector<Value *, 6> Ops;

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));

  Type *Tys[] = { Int8Ptr, SubVecTy };
  Function *VstNFunc =
      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

  // Split the shufflevector operands into sub vectors for the new vstN call.
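  // Each sub vector is NumSubElts consecutive elements of the concatenation of
  // Op0 and Op1; for the Factor = 3 example above the masks are <0, 1, 2, 3>,
  // <4, 5, 6, 7> and <8, 9, 10, 11>.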
  for (unsigned i = 0; i < Factor; i++)
    Ops.push_back(Builder.CreateShuffleVector(
        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));

  Ops.push_back(Builder.getInt32(SI->getAlignment()));
  Builder.CreateCall(VstNFunc, Ops);
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

/// \brief Recursively determine whether \p Ty is an AAPCS-VFP homogeneous
/// aggregate: at most four members that all share one base type (float,
/// double, or a 64-bit/128-bit vector). \p Base and \p Members are updated as
/// the type is walked.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}
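
// For example, under AAPCS-VFP a struct of three doubles is a homogeneous
// aggregate (Base == HA_DOUBLE, Members == 3), so the hook above reports that
// it needs consecutive registers; a struct mixing float and double members is
// not an HA and, not being an integer array either, does not.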