//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(true));
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

namespace {
  class ARMCCState : public CCState {
  public:
    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
               ParmContext PC)
        : CCState(CC, isVarArg, MF, locs, C) {
      assert(((PC == Call) || (PC == Prologue)) &&
             "ARMCCState users must specify whether their context is call "
             "or prologue generation.");
      CallOrPrologue = PC;
    }
  };
}

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR,  VT, Promote);
    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
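  // They are marked Expand below; the legalizer is then expected to scalarize
  // them, and on subtargets without a hardware divider the resulting scalar
  // integer divisions typically become libcalls (see the libcall setup in the
  // ARMTargetLowering constructor).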
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    const auto &E = Subtarget->getTargetTriple().getEnvironment();

    bool IsHFTarget = E == Triple::EABIHF || E == Triple::GNUEABIHF ||
                      E == Triple::MuslEABIHF;
    // Windows is a special case. Technically, we will replace all of the "GNU"
    // calls with calls to MSVCRT if appropriate and adjust the calling
    // convention then.
    IsHFTarget = IsHFTarget || Subtarget->isTargetWindows();

    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
        { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
        { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }

    // Set the correct calling convention for ARMv7k WatchOS. It's just
    // AAPCS_VFP for functions as simple as libcalls.
    if (Subtarget->isTargetWatchABI()) {
      for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
        setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
    }
  }

  // These libcalls are not available in 32-bit.
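  // Clearing their names keeps the legalizer from emitting calls to 128-bit
  // shift helpers that the 32-bit ARM runtimes do not provide.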
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetWatchOS() ||
      (Subtarget->isTargetIOS() &&
       !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
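  // For example, on an AEABI target a half-to-float extension should be
  // emitted as a call to __aeabi_h2f rather than the default __gnu_h2f_ieee.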
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32,   "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
  }

  for (MVT VT : MVT::vector_valuetypes()) {
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP support any arithmetic operations on it.
    // The same applies to v4f32, except that vadd, vsub and vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a case where "copysign" appears in the DAG with
    // vector operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);

    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Expand the v2f32 forms of these operations as well.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
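    // For example, a v8i16 multiply whose operands are sign- or zero-extended
    // from v8i8 can be selected as a single vmull.s8/vmull.u8 rather than
    // being widened and multiplied lane by lane.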
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions.
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    // NEON only has FMA instructions as of VFP4.
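    // Without VFP4 there is no vfma/vfms, so the vector FMA nodes are marked
    // Expand and legalized like any other unsupported vector operation.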
    if (!Subtarget->hasVFP4()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  // ARM and Thumb2 support UMLAL/SMLAL.
  if (!Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::ADDC);

  if (Subtarget->isFPOnlySP()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOWI, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  }

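  // Derive value-type legality and register-class information from the
  // register classes added above.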
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im,  MVT::i1,  Legal);
      setIndexedLoadAction(im,  MVT::i8,  Legal);
      setIndexedLoadAction(im,  MVT::i16, Legal);
      setIndexedLoadAction(im,  MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1,  Legal);
      setIndexedStoreAction(im, MVT::i8,  Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);

  if (!Subtarget->isThumb1Only()) {
    // FIXME: We should do this for Thumb1 as well.
    setOperationAction(ISD::ADDC, MVT::i32, Custom);
    setOperationAction(ISD::ADDE, MVT::i32, Custom);
    setOperationAction(ISD::SUBC, MVT::i32, Custom);
    setOperationAction(ISD::SUBE, MVT::i32, Custom);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the CPU doesn't have a HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8,  "__rt_sdiv",   CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv",   CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv",   CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8,  "__rt_udiv",   CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv",   CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv",   CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8,  "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8,  "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->isTargetWindows() &&
      Subtarget->getTargetTriple().isOSMSVCRT())
    for (auto &VT : {MVT::f32, MVT::f64})
      setOperationAction(ISD::FPOWI, VT, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If the target has DMB in Thumb mode, fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
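  // On pre-v6 targets a sign_extend_inreg is instead expanded to a left shift
  // followed by an arithmetic right shift.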
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD and i64->f64 into VMOVDRR
    // iff the target supports VFP2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // Thumb-1 cannot currently select ARMISD::SUBE.
  if (!Subtarget->isThumb1Only())
    setOperationAction(ISD::SETCCE, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow.
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
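    // Without it, these conversions go through the soft-float libcalls
    // configured above (__gnu_h2f_ieee/__gnu_f2h_ieee, or __aeabi_h2f/
    // __aeabi_f2h on AEABI targets).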
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget->hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget->isTargetWatchABI()) {
      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
    }
    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
      // For iOS, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    if (!Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we use
    // a NEON instruction with an undef lane instead.
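    // That is, a scalar f32 fminnan/fmaxnan is selected as a d-register
    // vmin.f32/vmax.f32 whose unused lanes are simply left undefined.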
    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
  }

  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);

  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(4);

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER:  break;
  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL:          return "ARMISD::CALL";
  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
  case ARMISD::CMP:           return "ARMISD::CMP";
  case ARMISD::CMN:           return "ARMISD::CMN";
  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";

  case ARMISD::CMOV:          return "ARMISD::CMOV";

  case ARMISD::SSAT:          return "ARMISD::SSAT";

  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:           return "ARMISD::RRX";

  case ARMISD::ADDC:          return "ARMISD::ADDC";
  case ARMISD::ADDE:          return "ARMISD::ADDE";
  case ARMISD::SUBC:          return "ARMISD::SUBC";
  case ARMISD::SUBE:          return "ARMISD::SUBE";

  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";

  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
  case ARMISD::VCGE:          return "ARMISD::VCGE";
  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
  case ARMISD::VCGT:          return "ARMISD::VCGT";
  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
  case ARMISD::VTST:          return "ARMISD::VTST";

  case ARMISD::VSHL:          return "ARMISD::VSHL";
  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
  case ARMISD::VSLI:          return "ARMISD::VSLI";
  case ARMISD::VSRI:          return "ARMISD::VSRI";
  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP:          return "ARMISD::VDUP";
  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
  case ARMISD::VEXT:          return "ARMISD::VEXT";
  case ARMISD::VREV64:        return "ARMISD::VREV64";
  case ARMISD::VREV32:        return "ARMISD::VREV32";
  case ARMISD::VREV16:        return "ARMISD::VREV16";
  case ARMISD::VZIP:          return "ARMISD::VZIP";
  case ARMISD::VUZP:          return "ARMISD::VUZP";
  case ARMISD::VTRN:          return "ARMISD::VTRN";
  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
  case ARMISD::UMAAL:         return "ARMISD::UMAAL";
  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
  case ARMISD::BFI:           return "ARMISD::BFI";
  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
  case ARMISD::VBSL:          return "ARMISD::VBSL";
  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
  }
1337 return nullptr;
1338 }
1339
1340 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1341 EVT VT) const {
1342 if (!VT.isVector())
1343 return getPointerTy(DL);
1344 return VT.changeVectorElementTypeToInteger();
1345 }
1346
1347 /// getRegClassFor - Return the register class that should be used for the
1348 /// specified value type.
1349 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1350 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1351 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1352 // load / store 4 to 8 consecutive D registers.
1353 if (Subtarget->hasNEON()) {
1354 if (VT == MVT::v4i64)
1355 return &ARM::QQPRRegClass;
1356 if (VT == MVT::v8i64)
1357 return &ARM::QQQQPRRegClass;
1358 }
1359 return TargetLowering::getRegClassFor(VT);
1360 }
1361
1362 // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1363 // source/dest is aligned and the copy size is large enough. We therefore want
1364 // to align such objects passed to memory intrinsics.
1365 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1366 unsigned &PrefAlign) const {
1367 if (!isa<MemIntrinsic>(CI))
1368 return false;
1369 MinSize = 8;
1370 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1371 // cycle faster than 4-byte aligned LDM.
1372 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1373 return true;
1374 }
1375
1376 // Create a fast isel object.
1377 FastISel *
1378 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1379 const TargetLibraryInfo *libInfo) const {
1380 return ARM::createFastISel(funcInfo, libInfo);
1381 }
1382
1383 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1384 unsigned NumVals = N->getNumValues();
1385 if (!NumVals)
1386 return Sched::RegPressure;
1387
1388 for (unsigned i = 0; i != NumVals; ++i) {
1389 EVT VT = N->getValueType(i);
1390 if (VT == MVT::Glue || VT == MVT::Other)
1391 continue;
1392 if (VT.isFloatingPoint() || VT.isVector())
1393 return Sched::ILP;
1394 }
1395
1396 if (!N->isMachineOpcode())
1397 return Sched::RegPressure;
1398
1399 // Loads are scheduled for latency even if the instruction itinerary
1400 // is not available.
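// For example (illustrative only): a load whose first result is not ready
// for more than two cycles, per the itinerary check below, is scheduled for
// ILP rather than for register pressure.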
1401 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1402 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1403 1404 if (MCID.getNumDefs() == 0) 1405 return Sched::RegPressure; 1406 if (!Itins->isEmpty() && 1407 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1408 return Sched::ILP; 1409 1410 return Sched::RegPressure; 1411 } 1412 1413 //===----------------------------------------------------------------------===// 1414 // Lowering Code 1415 //===----------------------------------------------------------------------===// 1416 1417 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1418 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1419 switch (CC) { 1420 default: llvm_unreachable("Unknown condition code!"); 1421 case ISD::SETNE: return ARMCC::NE; 1422 case ISD::SETEQ: return ARMCC::EQ; 1423 case ISD::SETGT: return ARMCC::GT; 1424 case ISD::SETGE: return ARMCC::GE; 1425 case ISD::SETLT: return ARMCC::LT; 1426 case ISD::SETLE: return ARMCC::LE; 1427 case ISD::SETUGT: return ARMCC::HI; 1428 case ISD::SETUGE: return ARMCC::HS; 1429 case ISD::SETULT: return ARMCC::LO; 1430 case ISD::SETULE: return ARMCC::LS; 1431 } 1432 } 1433 1434 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 1435 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1436 ARMCC::CondCodes &CondCode2) { 1437 CondCode2 = ARMCC::AL; 1438 switch (CC) { 1439 default: llvm_unreachable("Unknown FP condition!"); 1440 case ISD::SETEQ: 1441 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1442 case ISD::SETGT: 1443 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1444 case ISD::SETGE: 1445 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1446 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1447 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1448 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1449 case ISD::SETO: CondCode = ARMCC::VC; break; 1450 case ISD::SETUO: CondCode = ARMCC::VS; break; 1451 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1452 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1453 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1454 case ISD::SETLT: 1455 case ISD::SETULT: CondCode = ARMCC::LT; break; 1456 case ISD::SETLE: 1457 case ISD::SETULE: CondCode = ARMCC::LE; break; 1458 case ISD::SETNE: 1459 case ISD::SETUNE: CondCode = ARMCC::NE; break; 1460 } 1461 } 1462 1463 //===----------------------------------------------------------------------===// 1464 // Calling Convention Implementation 1465 //===----------------------------------------------------------------------===// 1466 1467 #include "ARMGenCallingConv.inc" 1468 1469 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1470 /// account presence of floating point hardware and calling convention 1471 /// limitations, such as support for variadic functions. 1472 CallingConv::ID 1473 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1474 bool isVarArg) const { 1475 switch (CC) { 1476 default: 1477 llvm_unreachable("Unsupported calling convention"); 1478 case CallingConv::ARM_AAPCS: 1479 case CallingConv::ARM_APCS: 1480 case CallingConv::GHC: 1481 return CC; 1482 case CallingConv::PreserveMost: 1483 return CallingConv::PreserveMost; 1484 case CallingConv::ARM_AAPCS_VFP: 1485 case CallingConv::Swift: 1486 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1487 case CallingConv::C: 1488 if (!Subtarget->isAAPCS_ABI()) 1489 return CallingConv::ARM_APCS; 1490 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1491 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1492 !isVarArg) 1493 return CallingConv::ARM_AAPCS_VFP; 1494 else 1495 return CallingConv::ARM_AAPCS; 1496 case CallingConv::Fast: 1497 case CallingConv::CXX_FAST_TLS: 1498 if (!Subtarget->isAAPCS_ABI()) { 1499 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1500 return CallingConv::Fast; 1501 return CallingConv::ARM_APCS; 1502 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1503 return CallingConv::ARM_AAPCS_VFP; 1504 else 1505 return CallingConv::ARM_AAPCS; 1506 } 1507 } 1508 1509 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1510 bool isVarArg) const { 1511 return CCAssignFnForNode(CC, false, isVarArg); 1512 } 1513 1514 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1515 bool isVarArg) const { 1516 return CCAssignFnForNode(CC, true, isVarArg); 1517 } 1518 1519 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1520 /// CallingConvention. 1521 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1522 bool Return, 1523 bool isVarArg) const { 1524 switch (getEffectiveCallingConv(CC, isVarArg)) { 1525 default: 1526 llvm_unreachable("Unsupported calling convention"); 1527 case CallingConv::ARM_APCS: 1528 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1529 case CallingConv::ARM_AAPCS: 1530 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1531 case CallingConv::ARM_AAPCS_VFP: 1532 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1533 case CallingConv::Fast: 1534 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1535 case CallingConv::GHC: 1536 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1537 case CallingConv::PreserveMost: 1538 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1539 } 1540 } 1541 1542 /// LowerCallResult - Lower the result values of a call into the 1543 /// appropriate copies out of appropriate physical registers. 1544 SDValue ARMTargetLowering::LowerCallResult( 1545 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1546 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1547 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1548 SDValue ThisVal) const { 1549 1550 // Assign locations to each value returned by this call. 1551 SmallVector<CCValAssign, 16> RVLocs; 1552 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1553 *DAG.getContext(), Call); 1554 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1555 1556 // Copy all of the result registers out of their specified physreg. 1557 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1558 CCValAssign VA = RVLocs[i]; 1559 1560 // Pass 'this' value directly from the argument to return value, to avoid 1561 // reg unit interference 1562 if (i == 0 && isThisReturn) { 1563 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1564 "unexpected return calling convention register assignment"); 1565 InVals.push_back(ThisVal); 1566 continue; 1567 } 1568 1569 SDValue Val; 1570 if (VA.needsCustom()) { 1571 // Handle f64 or half of a v2f64. 
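// Illustrative sketch (register names are the usual AAPCS core-register
// pair, not taken from any particular test): an f64 returned in core
// registers comes back roughly as
//   lo = CopyFromReg r0;  hi = CopyFromReg r1;  f64 = VMOVDRR lo, hi
// with the pair swapped on big-endian subtargets. That is what the block
// below builds, and it is repeated for each half of a v2f64.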
1572 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1573 InFlag); 1574 Chain = Lo.getValue(1); 1575 InFlag = Lo.getValue(2); 1576 VA = RVLocs[++i]; // skip ahead to next loc 1577 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1578 InFlag); 1579 Chain = Hi.getValue(1); 1580 InFlag = Hi.getValue(2); 1581 if (!Subtarget->isLittle()) 1582 std::swap (Lo, Hi); 1583 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1584 1585 if (VA.getLocVT() == MVT::v2f64) { 1586 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1587 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1588 DAG.getConstant(0, dl, MVT::i32)); 1589 1590 VA = RVLocs[++i]; // skip ahead to next loc 1591 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1592 Chain = Lo.getValue(1); 1593 InFlag = Lo.getValue(2); 1594 VA = RVLocs[++i]; // skip ahead to next loc 1595 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1596 Chain = Hi.getValue(1); 1597 InFlag = Hi.getValue(2); 1598 if (!Subtarget->isLittle()) 1599 std::swap (Lo, Hi); 1600 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1601 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1602 DAG.getConstant(1, dl, MVT::i32)); 1603 } 1604 } else { 1605 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1606 InFlag); 1607 Chain = Val.getValue(1); 1608 InFlag = Val.getValue(2); 1609 } 1610 1611 switch (VA.getLocInfo()) { 1612 default: llvm_unreachable("Unknown loc info!"); 1613 case CCValAssign::Full: break; 1614 case CCValAssign::BCvt: 1615 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1616 break; 1617 } 1618 1619 InVals.push_back(Val); 1620 } 1621 1622 return Chain; 1623 } 1624 1625 /// LowerMemOpCallTo - Store the argument to the stack. 1626 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1627 SDValue Arg, const SDLoc &dl, 1628 SelectionDAG &DAG, 1629 const CCValAssign &VA, 1630 ISD::ArgFlagsTy Flags) const { 1631 unsigned LocMemOffset = VA.getLocMemOffset(); 1632 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1633 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1634 StackPtr, PtrOff); 1635 return DAG.getStore( 1636 Chain, dl, Arg, PtrOff, 1637 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 1638 } 1639 1640 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 1641 SDValue Chain, SDValue &Arg, 1642 RegsToPassVector &RegsToPass, 1643 CCValAssign &VA, CCValAssign &NextVA, 1644 SDValue &StackPtr, 1645 SmallVectorImpl<SDValue> &MemOpChains, 1646 ISD::ArgFlagsTy Flags) const { 1647 1648 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1649 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1650 unsigned id = Subtarget->isLittle() ? 0 : 1; 1651 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 1652 1653 if (NextVA.isRegLoc()) 1654 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 1655 else { 1656 assert(NextVA.isMemLoc()); 1657 if (!StackPtr.getNode()) 1658 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 1659 getPointerTy(DAG.getDataLayout())); 1660 1661 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 1662 dl, DAG, NextVA, 1663 Flags)); 1664 } 1665 } 1666 1667 /// LowerCall - Lowering a call into a callseq_start <- 1668 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1669 /// nodes. 
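/// A lowered non-tail call ends up, roughly, as the node chain
///   callseq_start -> CopyToReg(arg regs)* -> ARMISD::CALL(callee, regmask)
///     -> callseq_end -> CopyFromReg(result regs)*
/// (an illustrative shape only; the exact operands are assembled below).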
1670 SDValue 1671 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1672 SmallVectorImpl<SDValue> &InVals) const { 1673 SelectionDAG &DAG = CLI.DAG; 1674 SDLoc &dl = CLI.DL; 1675 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1676 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1677 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1678 SDValue Chain = CLI.Chain; 1679 SDValue Callee = CLI.Callee; 1680 bool &isTailCall = CLI.IsTailCall; 1681 CallingConv::ID CallConv = CLI.CallConv; 1682 bool doesNotRet = CLI.DoesNotReturn; 1683 bool isVarArg = CLI.IsVarArg; 1684 1685 MachineFunction &MF = DAG.getMachineFunction(); 1686 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1687 bool isThisReturn = false; 1688 bool isSibCall = false; 1689 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); 1690 1691 // Disable tail calls if they're not supported. 1692 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 1693 isTailCall = false; 1694 1695 if (isTailCall) { 1696 // Check if it's really possible to do a tail call. 1697 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1698 isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), 1699 Outs, OutVals, Ins, DAG); 1700 if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) 1701 report_fatal_error("failed to perform tail call elimination on a call " 1702 "site marked musttail"); 1703 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1704 // detected sibcalls. 1705 if (isTailCall) { 1706 ++NumTailCalls; 1707 isSibCall = true; 1708 } 1709 } 1710 1711 // Analyze operands of the call, assigning locations to each operand. 1712 SmallVector<CCValAssign, 16> ArgLocs; 1713 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1714 *DAG.getContext(), Call); 1715 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 1716 1717 // Get a count of how many bytes are to be pushed on the stack. 1718 unsigned NumBytes = CCInfo.getNextStackOffset(); 1719 1720 // For tail calls, memory operands are available in our caller's stack. 1721 if (isSibCall) 1722 NumBytes = 0; 1723 1724 // Adjust the stack pointer for the new arguments... 1725 // These operations are automatically eliminated by the prolog/epilog pass 1726 if (!isSibCall) 1727 Chain = DAG.getCALLSEQ_START(Chain, 1728 DAG.getIntPtrConstant(NumBytes, dl, true), dl); 1729 1730 SDValue StackPtr = 1731 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 1732 1733 RegsToPassVector RegsToPass; 1734 SmallVector<SDValue, 8> MemOpChains; 1735 1736 // Walk the register/memloc assignments, inserting copies/loads. In the case 1737 // of tail call optimization, arguments are handled later. 1738 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1739 i != e; 1740 ++i, ++realArgIdx) { 1741 CCValAssign &VA = ArgLocs[i]; 1742 SDValue Arg = OutVals[realArgIdx]; 1743 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1744 bool isByVal = Flags.isByVal(); 1745 1746 // Promote the value if needed. 
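// e.g. (illustrative) an i8 argument assigned to a core register is first
// sign-, zero- or any-extended to the i32 LocVT by the switch below.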
1747 switch (VA.getLocInfo()) { 1748 default: llvm_unreachable("Unknown loc info!"); 1749 case CCValAssign::Full: break; 1750 case CCValAssign::SExt: 1751 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1752 break; 1753 case CCValAssign::ZExt: 1754 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1755 break; 1756 case CCValAssign::AExt: 1757 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1758 break; 1759 case CCValAssign::BCvt: 1760 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1761 break; 1762 } 1763 1764 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 1765 if (VA.needsCustom()) { 1766 if (VA.getLocVT() == MVT::v2f64) { 1767 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1768 DAG.getConstant(0, dl, MVT::i32)); 1769 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1770 DAG.getConstant(1, dl, MVT::i32)); 1771 1772 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 1773 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1774 1775 VA = ArgLocs[++i]; // skip ahead to next loc 1776 if (VA.isRegLoc()) { 1777 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 1778 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1779 } else { 1780 assert(VA.isMemLoc()); 1781 1782 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 1783 dl, DAG, VA, Flags)); 1784 } 1785 } else { 1786 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 1787 StackPtr, MemOpChains, Flags); 1788 } 1789 } else if (VA.isRegLoc()) { 1790 if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) { 1791 assert(VA.getLocVT() == MVT::i32 && 1792 "unexpected calling convention register assignment"); 1793 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 1794 "unexpected use of 'returned'"); 1795 isThisReturn = true; 1796 } 1797 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1798 } else if (isByVal) { 1799 assert(VA.isMemLoc()); 1800 unsigned offset = 0; 1801 1802 // True if this byval aggregate will be split between registers 1803 // and memory. 1804 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 1805 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 1806 1807 if (CurByValIdx < ByValArgsCount) { 1808 1809 unsigned RegBegin, RegEnd; 1810 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 1811 1812 EVT PtrVT = 1813 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 1814 unsigned int i, j; 1815 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 1816 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 1817 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 1818 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 1819 MachinePointerInfo(), 1820 DAG.InferPtrAlignment(AddArg)); 1821 MemOpChains.push_back(Load.getValue(1)); 1822 RegsToPass.push_back(std::make_pair(j, Load)); 1823 } 1824 1825 // If parameter size outsides register area, "offset" value 1826 // helps us to calculate stack slot for remained part properly. 
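// Worked example (hypothetical sizes): a 24-byte byval whose register range
// is [r2, r4) has 2 registers' worth (8 bytes) loaded into r2/r3 by the loop
// above, leaving offset = 2; the remaining 16 bytes are then copied to the
// stack by the COPY_STRUCT_BYVAL block below, since getByValSize() exceeds
// 4*offset.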
1827 offset = RegEnd - RegBegin; 1828 1829 CCInfo.nextInRegsParam(); 1830 } 1831 1832 if (Flags.getByValSize() > 4*offset) { 1833 auto PtrVT = getPointerTy(DAG.getDataLayout()); 1834 unsigned LocMemOffset = VA.getLocMemOffset(); 1835 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1836 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 1837 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 1838 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 1839 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 1840 MVT::i32); 1841 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 1842 MVT::i32); 1843 1844 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 1845 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 1846 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 1847 Ops)); 1848 } 1849 } else if (!isSibCall) { 1850 assert(VA.isMemLoc()); 1851 1852 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1853 dl, DAG, VA, Flags)); 1854 } 1855 } 1856 1857 if (!MemOpChains.empty()) 1858 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 1859 1860 // Build a sequence of copy-to-reg nodes chained together with token chain 1861 // and flag operands which copy the outgoing args into the appropriate regs. 1862 SDValue InFlag; 1863 // Tail call byval lowering might overwrite argument registers so in case of 1864 // tail call optimization the copies to registers are lowered later. 1865 if (!isTailCall) 1866 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1867 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1868 RegsToPass[i].second, InFlag); 1869 InFlag = Chain.getValue(1); 1870 } 1871 1872 // For tail calls lower the arguments to the 'real' stack slot. 1873 if (isTailCall) { 1874 // Force all the incoming stack arguments to be loaded from the stack 1875 // before any new outgoing arguments are stored to the stack, because the 1876 // outgoing stack slots may alias the incoming argument stack slots, and 1877 // the alias isn't otherwise explicit. This is slightly more conservative 1878 // than necessary, because it means that each store effectively depends 1879 // on every argument instead of just those arguments it would clobber. 1880 1881 // Do not flag preceding copytoreg stuff together with the following stuff. 1882 InFlag = SDValue(); 1883 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1884 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1885 RegsToPass[i].second, InFlag); 1886 InFlag = Chain.getValue(1); 1887 } 1888 InFlag = SDValue(); 1889 } 1890 1891 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 1892 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 1893 // node so that legalize doesn't hack it. 
1894 bool isDirect = false; 1895 1896 const TargetMachine &TM = getTargetMachine(); 1897 const Module *Mod = MF.getFunction()->getParent(); 1898 const GlobalValue *GV = nullptr; 1899 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1900 GV = G->getGlobal(); 1901 bool isStub = 1902 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 1903 1904 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 1905 bool isLocalARMFunc = false; 1906 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 1907 auto PtrVt = getPointerTy(DAG.getDataLayout()); 1908 1909 if (Subtarget->genLongCalls()) { 1910 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 1911 "long-calls codegen is not position independent!"); 1912 // Handle a global address or an external symbol. If it's not one of 1913 // those, the target's already in a register, so we don't need to do 1914 // anything extra. 1915 if (isa<GlobalAddressSDNode>(Callee)) { 1916 // Create a constant pool entry for the callee address 1917 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1918 ARMConstantPoolValue *CPV = 1919 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 1920 1921 // Get the address of the callee into a register 1922 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1923 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1924 Callee = DAG.getLoad( 1925 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1926 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 1927 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 1928 const char *Sym = S->getSymbol(); 1929 1930 // Create a constant pool entry for the callee address 1931 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1932 ARMConstantPoolValue *CPV = 1933 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1934 ARMPCLabelIndex, 0); 1935 // Get the address of the callee into a register 1936 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1937 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1938 Callee = DAG.getLoad( 1939 PtrVt, dl, DAG.getEntryNode(), CPAddr, 1940 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 1941 } 1942 } else if (isa<GlobalAddressSDNode>(Callee)) { 1943 // If we're optimizing for minimum size and the function is called three or 1944 // more times in this block, we can improve codesize by calling indirectly 1945 // as BLXr has a 16-bit encoding. 1946 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 1947 auto *BB = CLI.CS->getParent(); 1948 bool PreferIndirect = 1949 Subtarget->isThumb() && MF.getFunction()->optForMinSize() && 1950 count_if(GV->users(), [&BB](const User *U) { 1951 return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; 1952 }) > 2; 1953 1954 if (!PreferIndirect) { 1955 isDirect = true; 1956 bool isDef = GV->isStrongDefinitionForLinker(); 1957 1958 // ARM call to a local ARM function is predicable. 1959 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 1960 // tBX takes a register source operand. 
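// Illustrative sketch of the block below (Mach-O, Thumb1, pre-v5T,
// assumption only): the callee's address is loaded out of its non-lazy
// pointer slot, roughly
//   ldr rN, Lfoo$non_lazy_ptr
// and the call is then made through rN, since tBX needs a register operand.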
1961 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1962 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 1963 Callee = DAG.getNode( 1964 ARMISD::WrapperPIC, dl, PtrVt, 1965 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 1966 Callee = DAG.getLoad( 1967 PtrVt, dl, DAG.getEntryNode(), Callee, 1968 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 1969 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 1970 MachineMemOperand::MOInvariant); 1971 } else if (Subtarget->isTargetCOFF()) { 1972 assert(Subtarget->isTargetWindows() && 1973 "Windows is the only supported COFF target"); 1974 unsigned TargetFlags = GV->hasDLLImportStorageClass() 1975 ? ARMII::MO_DLLIMPORT 1976 : ARMII::MO_NO_FLAG; 1977 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, 1978 TargetFlags); 1979 if (GV->hasDLLImportStorageClass()) 1980 Callee = 1981 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 1982 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 1983 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 1984 } else { 1985 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 1986 } 1987 } 1988 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1989 isDirect = true; 1990 // tBX takes a register source operand. 1991 const char *Sym = S->getSymbol(); 1992 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1993 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1994 ARMConstantPoolValue *CPV = 1995 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1996 ARMPCLabelIndex, 4); 1997 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 1998 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1999 Callee = DAG.getLoad( 2000 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2001 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2002 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2003 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2004 } else { 2005 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2006 } 2007 } 2008 2009 // FIXME: handle tail calls differently. 2010 unsigned CallOpc; 2011 if (Subtarget->isThumb()) { 2012 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2013 CallOpc = ARMISD::CALL_NOLINK; 2014 else 2015 CallOpc = ARMISD::CALL; 2016 } else { 2017 if (!isDirect && !Subtarget->hasV5TOps()) 2018 CallOpc = ARMISD::CALL_NOLINK; 2019 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2020 // Emit regular call when code size is the priority 2021 !MF.getFunction()->optForMinSize()) 2022 // "mov lr, pc; b _foo" to avoid confusing the RSP 2023 CallOpc = ARMISD::CALL_NOLINK; 2024 else 2025 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2026 } 2027 2028 std::vector<SDValue> Ops; 2029 Ops.push_back(Chain); 2030 Ops.push_back(Callee); 2031 2032 // Add argument registers to the end of the list so that they are known live 2033 // into the call. 2034 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2035 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2036 RegsToPass[i].second.getValueType())); 2037 2038 // Add a register mask operand representing the call-preserved registers. 
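// (Roughly, for AAPCS this mask marks r4-r11, sp and d8-d15 as preserved;
// the 'this'-return variant tried below additionally keeps r0 live across
// the call.)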
2039 if (!isTailCall) {
2040 const uint32_t *Mask;
2041 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2042 if (isThisReturn) {
2043 // For 'this' returns, use the R0-preserving mask if applicable
2044 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2045 if (!Mask) {
2046 // Set isThisReturn to false if the calling convention is not one that
2047 // allows 'returned' to be modeled in this way, so LowerCallResult does
2048 // not try to pass 'this' straight through
2049 isThisReturn = false;
2050 Mask = ARI->getCallPreservedMask(MF, CallConv);
2051 }
2052 } else
2053 Mask = ARI->getCallPreservedMask(MF, CallConv);
2054
2055 assert(Mask && "Missing call preserved mask for calling convention");
2056 Ops.push_back(DAG.getRegisterMask(Mask));
2057 }
2058
2059 if (InFlag.getNode())
2060 Ops.push_back(InFlag);
2061
2062 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2063 if (isTailCall) {
2064 MF.getFrameInfo().setHasTailCall();
2065 return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2066 }
2067
2068 // Returns a chain and a flag for retval copy to use.
2069 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2070 InFlag = Chain.getValue(1);
2071
2072 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2073 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2074 if (!Ins.empty())
2075 InFlag = Chain.getValue(1);
2076
2077 // Handle result values, copying them out of physregs into vregs that we
2078 // return.
2079 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2080 InVals, isThisReturn,
2081 isThisReturn ? OutVals[0] : SDValue());
2082 }
2083
2084 /// HandleByVal - Every parameter *after* a byval parameter is passed
2085 /// on the stack. Remember the next parameter register to allocate,
2086 /// and then confiscate the rest of the parameter registers to ensure
2087 /// this.
2088 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2089 unsigned Align) const {
2090 assert((State->getCallOrPrologue() == Prologue ||
2091 State->getCallOrPrologue() == Call) &&
2092 "unhandled ParmContext");
2093
2094 // Byval (as with any stack) slots are always at least 4 byte aligned.
2095 Align = std::max(Align, 4U);
2096
2097 unsigned Reg = State->AllocateReg(GPRArgRegs);
2098 if (!Reg)
2099 return;
2100
2101 unsigned AlignInRegs = Align / 4;
2102 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2103 for (unsigned i = 0; i < Waste; ++i)
2104 Reg = State->AllocateReg(GPRArgRegs);
2105
2106 if (!Reg)
2107 return;
2108
2109 unsigned Excess = 4 * (ARM::R4 - Reg);
2110
2111 // Special case: when NSAA != SP and the parameter size is greater than the
2112 // size of all remaining GPR argument registers, we cannot split the
2113 // parameter and must send it entirely to the stack. We also set NCRN to R4
2114 // by wasting all remaining registers.
2115 const unsigned NSAAOffset = State->getNextStackOffset();
2116 if (NSAAOffset != 0 && Size > Excess) {
2117 while (State->AllocateReg(GPRArgRegs))
2118 ;
2119 return;
2120 }
2121
2122 // The first register for the byval parameter is the first register that
2123 // was not allocated before this method call, i.e. "reg".
2124 // If the parameter is small enough to fit in the range [reg, r4), then the
2125 // end (one past the last) register is reg + param-size-in-regs; otherwise
2126 // the parameter is split between registers and stack, and the end register
2127 // is r4.
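// Worked example (hypothetical): an 8-byte byval whose first free register
// is r2 gives ByValRegBegin = r2 and ByValRegEnd = min(r2 + 2, r4) = r4, so
// r2/r3 are claimed and the in-memory size computed below collapses to zero.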
2128 unsigned ByValRegBegin = Reg; 2129 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2130 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2131 // Note, first register is allocated in the beginning of function already, 2132 // allocate remained amount of registers we need. 2133 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2134 State->AllocateReg(GPRArgRegs); 2135 // A byval parameter that is split between registers and memory needs its 2136 // size truncated here. 2137 // In the case where the entire structure fits in registers, we set the 2138 // size in memory to zero. 2139 Size = std::max<int>(Size - Excess, 0); 2140 } 2141 2142 /// MatchingStackOffset - Return true if the given stack call argument is 2143 /// already available in the same position (relatively) of the caller's 2144 /// incoming argument stack. 2145 static 2146 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2147 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2148 const TargetInstrInfo *TII) { 2149 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2150 int FI = INT_MAX; 2151 if (Arg.getOpcode() == ISD::CopyFromReg) { 2152 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2153 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2154 return false; 2155 MachineInstr *Def = MRI->getVRegDef(VR); 2156 if (!Def) 2157 return false; 2158 if (!Flags.isByVal()) { 2159 if (!TII->isLoadFromStackSlot(*Def, FI)) 2160 return false; 2161 } else { 2162 return false; 2163 } 2164 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2165 if (Flags.isByVal()) 2166 // ByVal argument is passed in as a pointer but it's now being 2167 // dereferenced. e.g. 2168 // define @foo(%struct.X* %A) { 2169 // tail call @bar(%struct.X* byval %A) 2170 // } 2171 return false; 2172 SDValue Ptr = Ld->getBasePtr(); 2173 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2174 if (!FINode) 2175 return false; 2176 FI = FINode->getIndex(); 2177 } else 2178 return false; 2179 2180 assert(FI != INT_MAX); 2181 if (!MFI.isFixedObjectIndex(FI)) 2182 return false; 2183 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2184 } 2185 2186 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2187 /// for tail call optimization. Targets which want to do tail call 2188 /// optimization should implement this function. 2189 bool 2190 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2191 CallingConv::ID CalleeCC, 2192 bool isVarArg, 2193 bool isCalleeStructRet, 2194 bool isCallerStructRet, 2195 const SmallVectorImpl<ISD::OutputArg> &Outs, 2196 const SmallVectorImpl<SDValue> &OutVals, 2197 const SmallVectorImpl<ISD::InputArg> &Ins, 2198 SelectionDAG& DAG) const { 2199 MachineFunction &MF = DAG.getMachineFunction(); 2200 const Function *CallerF = MF.getFunction(); 2201 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2202 2203 assert(Subtarget->supportsTailCall()); 2204 2205 // Look for obvious safe cases to perform tail call optimization that do not 2206 // require ABI changes. This is what gcc calls sibcall. 2207 2208 // Exception-handling functions need a special set of instructions to indicate 2209 // a return to the hardware. Tail-calling another function would probably 2210 // break this. 2211 if (CallerF->hasFnAttribute("interrupt")) 2212 return false; 2213 2214 // Also avoid sibcall optimization if either caller or callee uses struct 2215 // return semantics. 
2216 if (isCalleeStructRet || isCallerStructRet) 2217 return false; 2218 2219 // Externally-defined functions with weak linkage should not be 2220 // tail-called on ARM when the OS does not support dynamic 2221 // pre-emption of symbols, as the AAELF spec requires normal calls 2222 // to undefined weak functions to be replaced with a NOP or jump to the 2223 // next instruction. The behaviour of branch instructions in this 2224 // situation (as used for tail calls) is implementation-defined, so we 2225 // cannot rely on the linker replacing the tail call with a return. 2226 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2227 const GlobalValue *GV = G->getGlobal(); 2228 const Triple &TT = getTargetMachine().getTargetTriple(); 2229 if (GV->hasExternalWeakLinkage() && 2230 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2231 return false; 2232 } 2233 2234 // Check that the call results are passed in the same way. 2235 LLVMContext &C = *DAG.getContext(); 2236 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2237 CCAssignFnForReturn(CalleeCC, isVarArg), 2238 CCAssignFnForReturn(CallerCC, isVarArg))) 2239 return false; 2240 // The callee has to preserve all registers the caller needs to preserve. 2241 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2242 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2243 if (CalleeCC != CallerCC) { 2244 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2245 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2246 return false; 2247 } 2248 2249 // If Caller's vararg or byval argument has been split between registers and 2250 // stack, do not perform tail call, since part of the argument is in caller's 2251 // local frame. 2252 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2253 if (AFI_Caller->getArgRegsSaveSize()) 2254 return false; 2255 2256 // If the callee takes no arguments then go on to check the results of the 2257 // call. 2258 if (!Outs.empty()) { 2259 // Check if stack adjustment is needed. For now, do not do this if any 2260 // argument is passed on the stack. 2261 SmallVector<CCValAssign, 16> ArgLocs; 2262 ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call); 2263 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2264 if (CCInfo.getNextStackOffset()) { 2265 // Check if the arguments are already laid out in the right way as 2266 // the caller's fixed stack objects. 2267 MachineFrameInfo &MFI = MF.getFrameInfo(); 2268 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2269 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2270 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2271 i != e; 2272 ++i, ++realArgIdx) { 2273 CCValAssign &VA = ArgLocs[i]; 2274 EVT RegVT = VA.getLocVT(); 2275 SDValue Arg = OutVals[realArgIdx]; 2276 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2277 if (VA.getLocInfo() == CCValAssign::Indirect) 2278 return false; 2279 if (VA.needsCustom()) { 2280 // f64 and vector types are split into multiple registers or 2281 // register/stack-slot combinations. The types will not match 2282 // the registers; give up on memory f64 refs until we figure 2283 // out what to do about this. 
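// e.g. (illustrative) an f64 argument whose second half lands on the stack
// fails the isRegLoc() checks below, and we simply refuse the tail call.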
2284 if (!VA.isRegLoc()) 2285 return false; 2286 if (!ArgLocs[++i].isRegLoc()) 2287 return false; 2288 if (RegVT == MVT::v2f64) { 2289 if (!ArgLocs[++i].isRegLoc()) 2290 return false; 2291 if (!ArgLocs[++i].isRegLoc()) 2292 return false; 2293 } 2294 } else if (!VA.isRegLoc()) { 2295 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2296 MFI, MRI, TII)) 2297 return false; 2298 } 2299 } 2300 } 2301 2302 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2303 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2304 return false; 2305 } 2306 2307 return true; 2308 } 2309 2310 bool 2311 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2312 MachineFunction &MF, bool isVarArg, 2313 const SmallVectorImpl<ISD::OutputArg> &Outs, 2314 LLVMContext &Context) const { 2315 SmallVector<CCValAssign, 16> RVLocs; 2316 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2317 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2318 } 2319 2320 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2321 const SDLoc &DL, SelectionDAG &DAG) { 2322 const MachineFunction &MF = DAG.getMachineFunction(); 2323 const Function *F = MF.getFunction(); 2324 2325 StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString(); 2326 2327 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2328 // version of the "preferred return address". These offsets affect the return 2329 // instruction if this is a return from PL1 without hypervisor extensions. 2330 // IRQ/FIQ: +4 "subs pc, lr, #4" 2331 // SWI: 0 "subs pc, lr, #0" 2332 // ABORT: +4 "subs pc, lr, #4" 2333 // UNDEF: +4/+2 "subs pc, lr, #0" 2334 // UNDEF varies depending on where the exception came from ARM or Thumb 2335 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2336 2337 int64_t LROffset; 2338 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2339 IntKind == "ABORT") 2340 LROffset = 4; 2341 else if (IntKind == "SWI" || IntKind == "UNDEF") 2342 LROffset = 0; 2343 else 2344 report_fatal_error("Unsupported interrupt attribute. If present, value " 2345 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2346 2347 RetOps.insert(RetOps.begin() + 1, 2348 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2349 2350 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2351 } 2352 2353 SDValue 2354 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2355 bool isVarArg, 2356 const SmallVectorImpl<ISD::OutputArg> &Outs, 2357 const SmallVectorImpl<SDValue> &OutVals, 2358 const SDLoc &dl, SelectionDAG &DAG) const { 2359 2360 // CCValAssign - represent the assignment of the return value to a location. 2361 SmallVector<CCValAssign, 16> RVLocs; 2362 2363 // CCState - Info about the registers and stack slots. 2364 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2365 *DAG.getContext(), Call); 2366 2367 // Analyze outgoing return values. 2368 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2369 2370 SDValue Flag; 2371 SmallVector<SDValue, 4> RetOps; 2372 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2373 bool isLittleEndian = Subtarget->isLittle(); 2374 2375 MachineFunction &MF = DAG.getMachineFunction(); 2376 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2377 AFI->setReturnRegsCount(RVLocs.size()); 2378 2379 // Copy the result values into the output registers. 
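// Illustrative: an f64 returned in core registers is split by VMOVRRD into
// two i32 copies (typically the r0/r1 pair, swapped on big-endian); a v2f64
// return performs that dance twice, as the needsCustom() block below shows.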
2380 for (unsigned i = 0, realRVLocIdx = 0; 2381 i != RVLocs.size(); 2382 ++i, ++realRVLocIdx) { 2383 CCValAssign &VA = RVLocs[i]; 2384 assert(VA.isRegLoc() && "Can only return in registers!"); 2385 2386 SDValue Arg = OutVals[realRVLocIdx]; 2387 2388 switch (VA.getLocInfo()) { 2389 default: llvm_unreachable("Unknown loc info!"); 2390 case CCValAssign::Full: break; 2391 case CCValAssign::BCvt: 2392 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2393 break; 2394 } 2395 2396 if (VA.needsCustom()) { 2397 if (VA.getLocVT() == MVT::v2f64) { 2398 // Extract the first half and return it in two registers. 2399 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2400 DAG.getConstant(0, dl, MVT::i32)); 2401 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2402 DAG.getVTList(MVT::i32, MVT::i32), Half); 2403 2404 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2405 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2406 Flag); 2407 Flag = Chain.getValue(1); 2408 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2409 VA = RVLocs[++i]; // skip ahead to next loc 2410 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2411 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2412 Flag); 2413 Flag = Chain.getValue(1); 2414 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2415 VA = RVLocs[++i]; // skip ahead to next loc 2416 2417 // Extract the 2nd half and fall through to handle it as an f64 value. 2418 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2419 DAG.getConstant(1, dl, MVT::i32)); 2420 } 2421 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2422 // available. 2423 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2424 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2425 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2426 fmrrd.getValue(isLittleEndian ? 0 : 1), 2427 Flag); 2428 Flag = Chain.getValue(1); 2429 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2430 VA = RVLocs[++i]; // skip ahead to next loc 2431 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2432 fmrrd.getValue(isLittleEndian ? 1 : 0), 2433 Flag); 2434 } else 2435 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2436 2437 // Guarantee that all emitted copies are 2438 // stuck together, avoiding something bad. 2439 Flag = Chain.getValue(1); 2440 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2441 } 2442 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2443 const MCPhysReg *I = 2444 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2445 if (I) { 2446 for (; *I; ++I) { 2447 if (ARM::GPRRegClass.contains(*I)) 2448 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2449 else if (ARM::DPRRegClass.contains(*I)) 2450 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2451 else 2452 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2453 } 2454 } 2455 2456 // Update chain and glue. 2457 RetOps[0] = Chain; 2458 if (Flag.getNode()) 2459 RetOps.push_back(Flag); 2460 2461 // CPUs which aren't M-class use a special sequence to return from 2462 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2463 // though we use "subs pc, lr, #N"). 2464 // 2465 // M-class CPUs actually use a normal return sequence with a special 2466 // (hardware-provided) value in LR, so the normal code path works. 
2467 if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") && 2468 !Subtarget->isMClass()) { 2469 if (Subtarget->isThumb1Only()) 2470 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2471 return LowerInterruptReturn(RetOps, dl, DAG); 2472 } 2473 2474 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2475 } 2476 2477 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2478 if (N->getNumValues() != 1) 2479 return false; 2480 if (!N->hasNUsesOfValue(1, 0)) 2481 return false; 2482 2483 SDValue TCChain = Chain; 2484 SDNode *Copy = *N->use_begin(); 2485 if (Copy->getOpcode() == ISD::CopyToReg) { 2486 // If the copy has a glue operand, we conservatively assume it isn't safe to 2487 // perform a tail call. 2488 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2489 return false; 2490 TCChain = Copy->getOperand(0); 2491 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2492 SDNode *VMov = Copy; 2493 // f64 returned in a pair of GPRs. 2494 SmallPtrSet<SDNode*, 2> Copies; 2495 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2496 UI != UE; ++UI) { 2497 if (UI->getOpcode() != ISD::CopyToReg) 2498 return false; 2499 Copies.insert(*UI); 2500 } 2501 if (Copies.size() > 2) 2502 return false; 2503 2504 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2505 UI != UE; ++UI) { 2506 SDValue UseChain = UI->getOperand(0); 2507 if (Copies.count(UseChain.getNode())) 2508 // Second CopyToReg 2509 Copy = *UI; 2510 else { 2511 // We are at the top of this chain. 2512 // If the copy has a glue operand, we conservatively assume it 2513 // isn't safe to perform a tail call. 2514 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2515 return false; 2516 // First CopyToReg 2517 TCChain = UseChain; 2518 } 2519 } 2520 } else if (Copy->getOpcode() == ISD::BITCAST) { 2521 // f32 returned in a single GPR. 2522 if (!Copy->hasOneUse()) 2523 return false; 2524 Copy = *Copy->use_begin(); 2525 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2526 return false; 2527 // If the copy has a glue operand, we conservatively assume it isn't safe to 2528 // perform a tail call. 2529 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2530 return false; 2531 TCChain = Copy->getOperand(0); 2532 } else { 2533 return false; 2534 } 2535 2536 bool HasRet = false; 2537 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2538 UI != UE; ++UI) { 2539 if (UI->getOpcode() != ARMISD::RET_FLAG && 2540 UI->getOpcode() != ARMISD::INTRET_FLAG) 2541 return false; 2542 HasRet = true; 2543 } 2544 2545 if (!HasRet) 2546 return false; 2547 2548 Chain = TCChain; 2549 return true; 2550 } 2551 2552 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2553 if (!Subtarget->supportsTailCall()) 2554 return false; 2555 2556 auto Attr = 2557 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2558 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2559 return false; 2560 2561 return true; 2562 } 2563 2564 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2565 // and pass the lower and high parts through. 2566 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2567 SDLoc DL(Op); 2568 SDValue WriteValue = Op->getOperand(2); 2569 2570 // This function is only supposed to be called for i64 type argument. 
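// Sketch (assuming an i64 write, as asserted below): the value is split with
// two EXTRACT_ELEMENT nodes and re-emitted as a single WRITE_REGISTER whose
// operands carry the low and high i32 halves.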
2571 assert(WriteValue.getValueType() == MVT::i64 2572 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2573 2574 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2575 DAG.getConstant(0, DL, MVT::i32)); 2576 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2577 DAG.getConstant(1, DL, MVT::i32)); 2578 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2579 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 2580 } 2581 2582 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2583 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2584 // one of the above mentioned nodes. It has to be wrapped because otherwise 2585 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2586 // be used to form addressing mode. These wrapped nodes will be selected 2587 // into MOVi. 2588 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 2589 EVT PtrVT = Op.getValueType(); 2590 // FIXME there is no actual debug info here 2591 SDLoc dl(Op); 2592 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2593 SDValue Res; 2594 if (CP->isMachineConstantPoolEntry()) 2595 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 2596 CP->getAlignment()); 2597 else 2598 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 2599 CP->getAlignment()); 2600 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 2601 } 2602 2603 unsigned ARMTargetLowering::getJumpTableEncoding() const { 2604 return MachineJumpTableInfo::EK_Inline; 2605 } 2606 2607 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 2608 SelectionDAG &DAG) const { 2609 MachineFunction &MF = DAG.getMachineFunction(); 2610 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2611 unsigned ARMPCLabelIndex = 0; 2612 SDLoc DL(Op); 2613 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2614 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 2615 SDValue CPAddr; 2616 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 2617 if (!IsPositionIndependent) { 2618 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 2619 } else { 2620 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 2621 ARMPCLabelIndex = AFI->createPICLabelUId(); 2622 ARMConstantPoolValue *CPV = 2623 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 2624 ARMCP::CPBlockAddress, PCAdj); 2625 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2626 } 2627 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 2628 SDValue Result = DAG.getLoad( 2629 PtrVT, DL, DAG.getEntryNode(), CPAddr, 2630 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2631 if (!IsPositionIndependent) 2632 return Result; 2633 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 2634 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 2635 } 2636 2637 /// \brief Convert a TLS address reference into the correct sequence of loads 2638 /// and calls to compute the variable's address for Darwin, and return an 2639 /// SDValue containing the final node. 2640 2641 /// Darwin only has one TLS scheme which must be capable of dealing with the 2642 /// fully general situation, in the worst case. This means: 2643 /// + "extern __thread" declaration. 2644 /// + Defined in a possibly unknown dynamic library. 
2645 ///
2646 /// The general system is that each __thread variable has a [3 x i32] descriptor
2647 /// which contains information used by the runtime to calculate the address. The
2648 /// only part of this the compiler needs to know about is the first word, which
2649 /// contains a function pointer that must be called with the address of the
2650 /// entire descriptor in "r0".
2651 ///
2652 /// Since this descriptor may be in a different unit, in general access must
2653 /// proceed along the usual ARM rules. A common sequence to produce is:
2654 ///
2655 /// movw rT1, :lower16:_var$non_lazy_ptr
2656 /// movt rT1, :upper16:_var$non_lazy_ptr
2657 /// ldr r0, [rT1]
2658 /// ldr rT2, [r0]
2659 /// blx rT2
2660 /// [...address now in r0...]
2661 SDValue
2662 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
2663 SelectionDAG &DAG) const {
2664 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
2665 SDLoc DL(Op);
2666
2667 // The first step is to get the address of the actual global symbol. This is
2668 // where the TLS descriptor lives.
2669 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
2670
2671 // The first entry in the descriptor is a function pointer that we must call
2672 // to obtain the address of the variable.
2673 SDValue Chain = DAG.getEntryNode();
2674 SDValue FuncTLVGet = DAG.getLoad(
2675 MVT::i32, DL, Chain, DescAddr,
2676 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2677 /* Alignment = */ 4,
2678 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
2679 MachineMemOperand::MOInvariant);
2680 Chain = FuncTLVGet.getValue(1);
2681
2682 MachineFunction &F = DAG.getMachineFunction();
2683 MachineFrameInfo &MFI = F.getFrameInfo();
2684 MFI.setAdjustsStack(true);
2685
2686 // TLS calls preserve all registers except those that absolutely must be
2687 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
2688 // silly).
2689 auto TRI =
2690 getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
2691 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
2692 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
2693
2694 // Finally, we can make the call. This is just a degenerate version of a
2695 // normal ARM call node: r0 takes the address of the descriptor, and
2696 // returns the address of the variable in this thread.
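// The node built below is roughly
//   ARMISD::CALL(chain, FuncTLVGet, R0 = DescAddr, regmask, glue)
// and the variable's address is read back out of r0 afterwards.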
2697 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 2698 Chain = 2699 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 2700 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 2701 DAG.getRegisterMask(Mask), Chain.getValue(1)); 2702 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 2703 } 2704 2705 SDValue 2706 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 2707 SelectionDAG &DAG) const { 2708 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 2709 2710 SDValue Chain = DAG.getEntryNode(); 2711 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2712 SDLoc DL(Op); 2713 2714 // Load the current TEB (thread environment block) 2715 SDValue Ops[] = {Chain, 2716 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 2717 DAG.getConstant(15, DL, MVT::i32), 2718 DAG.getConstant(0, DL, MVT::i32), 2719 DAG.getConstant(13, DL, MVT::i32), 2720 DAG.getConstant(0, DL, MVT::i32), 2721 DAG.getConstant(2, DL, MVT::i32)}; 2722 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 2723 DAG.getVTList(MVT::i32, MVT::Other), Ops); 2724 2725 SDValue TEB = CurrentTEB.getValue(0); 2726 Chain = CurrentTEB.getValue(1); 2727 2728 // Load the ThreadLocalStoragePointer from the TEB 2729 // A pointer to the TLS array is located at offset 0x2c from the TEB. 2730 SDValue TLSArray = 2731 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 2732 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 2733 2734 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 2735 // offset into the TLSArray. 2736 2737 // Load the TLS index from the C runtime 2738 SDValue TLSIndex = 2739 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 2740 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 2741 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 2742 2743 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 2744 DAG.getConstant(2, DL, MVT::i32)); 2745 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 2746 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 2747 MachinePointerInfo()); 2748 2749 // Get the offset of the start of the .tls section (section base) 2750 const auto *GA = cast<GlobalAddressSDNode>(Op); 2751 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 2752 SDValue Offset = DAG.getLoad( 2753 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 2754 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 2755 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2756 2757 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 2758 } 2759 2760 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 2761 SDValue 2762 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 2763 SelectionDAG &DAG) const { 2764 SDLoc dl(GA); 2765 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2766 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 2767 MachineFunction &MF = DAG.getMachineFunction(); 2768 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2769 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2770 ARMConstantPoolValue *CPV = 2771 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2772 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 2773 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2774 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 2775 Argument = DAG.getLoad( 2776 PtrVT, dl, DAG.getEntryNode(), Argument, 2777 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2778 SDValue Chain = Argument.getValue(1); 2779 2780 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2781 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 2782 2783 // call __tls_get_addr. 2784 ArgListTy Args; 2785 ArgListEntry Entry; 2786 Entry.Node = Argument; 2787 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 2788 Args.push_back(Entry); 2789 2790 // FIXME: is there useful debug info available here? 2791 TargetLowering::CallLoweringInfo CLI(DAG); 2792 CLI.setDebugLoc(dl).setChain(Chain) 2793 .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 2794 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 2795 2796 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2797 return CallResult.first; 2798 } 2799 2800 // Lower ISD::GlobalTLSAddress using the "initial exec" or 2801 // "local exec" model. 2802 SDValue 2803 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 2804 SelectionDAG &DAG, 2805 TLSModel::Model model) const { 2806 const GlobalValue *GV = GA->getGlobal(); 2807 SDLoc dl(GA); 2808 SDValue Offset; 2809 SDValue Chain = DAG.getEntryNode(); 2810 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2811 // Get the Thread Pointer 2812 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2813 2814 if (model == TLSModel::InitialExec) { 2815 MachineFunction &MF = DAG.getMachineFunction(); 2816 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2817 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2818 // Initial exec model. 2819 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 2820 ARMConstantPoolValue *CPV = 2821 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2822 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 2823 true); 2824 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2825 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2826 Offset = DAG.getLoad( 2827 PtrVT, dl, Chain, Offset, 2828 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2829 Chain = Offset.getValue(1); 2830 2831 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2832 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 2833 2834 Offset = DAG.getLoad( 2835 PtrVT, dl, Chain, Offset, 2836 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2837 } else { 2838 // local exec model 2839 assert(model == TLSModel::LocalExec); 2840 ARMConstantPoolValue *CPV = 2841 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 2842 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2843 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2844 Offset = DAG.getLoad( 2845 PtrVT, dl, Chain, Offset, 2846 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2847 } 2848 2849 // The address of the thread local variable is the add of the thread 2850 // pointer with the offset of the variable. 
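  // For the local-exec model the offset is a link-time constant (TPOFF) that
  // was loaded from the constant pool; for initial-exec it is fetched via a
  // GOT entry (GOTTPOFF) filled in by the dynamic linker, hence the extra
  // load in that path above.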
2851 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 2852 } 2853 2854 SDValue 2855 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 2856 if (Subtarget->isTargetDarwin()) 2857 return LowerGlobalTLSAddressDarwin(Op, DAG); 2858 2859 if (Subtarget->isTargetWindows()) 2860 return LowerGlobalTLSAddressWindows(Op, DAG); 2861 2862 // TODO: implement the "local dynamic" model 2863 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 2864 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2865 if (DAG.getTarget().Options.EmulatedTLS) 2866 return LowerToTLSEmulatedModel(GA, DAG); 2867 2868 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 2869 2870 switch (model) { 2871 case TLSModel::GeneralDynamic: 2872 case TLSModel::LocalDynamic: 2873 return LowerToTLSGeneralDynamicModel(GA, DAG); 2874 case TLSModel::InitialExec: 2875 case TLSModel::LocalExec: 2876 return LowerToTLSExecModels(GA, DAG, model); 2877 } 2878 llvm_unreachable("bogus TLS model"); 2879 } 2880 2881 /// Return true if all users of V are within function F, looking through 2882 /// ConstantExprs. 2883 static bool allUsersAreInFunction(const Value *V, const Function *F) { 2884 SmallVector<const User*,4> Worklist; 2885 for (auto *U : V->users()) 2886 Worklist.push_back(U); 2887 while (!Worklist.empty()) { 2888 auto *U = Worklist.pop_back_val(); 2889 if (isa<ConstantExpr>(U)) { 2890 for (auto *UU : U->users()) 2891 Worklist.push_back(UU); 2892 continue; 2893 } 2894 2895 auto *I = dyn_cast<Instruction>(U); 2896 if (!I || I->getParent()->getParent() != F) 2897 return false; 2898 } 2899 return true; 2900 } 2901 2902 /// Return true if all users of V are within some (any) function, looking through 2903 /// ConstantExprs. In other words, are there any global constant users? 2904 static bool allUsersAreInFunctions(const Value *V) { 2905 SmallVector<const User*,4> Worklist; 2906 for (auto *U : V->users()) 2907 Worklist.push_back(U); 2908 while (!Worklist.empty()) { 2909 auto *U = Worklist.pop_back_val(); 2910 if (isa<ConstantExpr>(U)) { 2911 for (auto *UU : U->users()) 2912 Worklist.push_back(UU); 2913 continue; 2914 } 2915 2916 if (!isa<Instruction>(U)) 2917 return false; 2918 } 2919 return true; 2920 } 2921 2922 // Return true if T is an integer, float or an array/vector of either. 2923 static bool isSimpleType(Type *T) { 2924 if (T->isIntegerTy() || T->isFloatingPointTy()) 2925 return true; 2926 Type *SubT = nullptr; 2927 if (T->isArrayTy()) 2928 SubT = T->getArrayElementType(); 2929 else if (T->isVectorTy()) 2930 SubT = T->getVectorElementType(); 2931 else 2932 return false; 2933 return SubT->isIntegerTy() || SubT->isFloatingPointTy(); 2934 } 2935 2936 static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, 2937 EVT PtrVT, const SDLoc &dl) { 2938 // If we're creating a pool entry for a constant global with unnamed address, 2939 // and the global is small enough, we can emit it inline into the constant pool 2940 // to save ourselves an indirection. 2941 // 2942 // This is a win if the constant is only used in one function (so it doesn't 2943 // need to be duplicated) or duplicating the constant wouldn't increase code 2944 // size (implying the constant is no larger than 4 bytes). 2945 const Function *F = DAG.getMachineFunction().getFunction(); 2946 2947 // We rely on this decision to inline being idemopotent and unrelated to the 2948 // use-site. 
We know that if we inline a variable at one use site, we'll 2949 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 2950 // doesn't know about this optimization, so bail out if it's enabled else 2951 // we could decide to inline here (and thus never emit the GV) but require 2952 // the GV from fast-isel generated code. 2953 if (!EnableConstpoolPromotion || 2954 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 2955 return SDValue(); 2956 2957 auto *GVar = dyn_cast<GlobalVariable>(GV); 2958 if (!GVar || !GVar->hasInitializer() || 2959 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 2960 !GVar->hasLocalLinkage()) 2961 return SDValue(); 2962 2963 // Ensure that we don't try and inline any type that contains pointers. If 2964 // we inline a value that contains relocations, we move the relocations from 2965 // .data to .text which is not ideal. 2966 auto *Init = GVar->getInitializer(); 2967 if (!isSimpleType(Init->getType())) 2968 return SDValue(); 2969 2970 // The constant islands pass can only really deal with alignment requests 2971 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 2972 // any type wanting greater alignment requirements than 4 bytes. We also 2973 // can only promote constants that are multiples of 4 bytes in size or 2974 // are paddable to a multiple of 4. Currently we only try and pad constants 2975 // that are strings for simplicity. 2976 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 2977 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 2978 unsigned Align = GVar->getAlignment(); 2979 unsigned RequiredPadding = 4 - (Size % 4); 2980 bool PaddingPossible = 2981 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 2982 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize) 2983 return SDValue(); 2984 2985 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 2986 MachineFunction &MF = DAG.getMachineFunction(); 2987 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2988 2989 // We can't bloat the constant pool too much, else the ConstantIslands pass 2990 // may fail to converge. If we haven't promoted this global yet (it may have 2991 // multiple uses), and promoting it would increase the constant pool size (Sz 2992 // > 4), ensure we have space to do so up to MaxTotal. 2993 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 2994 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 2995 ConstpoolPromotionMaxTotal) 2996 return SDValue(); 2997 2998 // This is only valid if all users are in a single function OR it has users 2999 // in multiple functions but it no larger than a pointer. We also check if 3000 // GVar has constant (non-ConstantExpr) users. If so, it essentially has its 3001 // address taken. 3002 if (!allUsersAreInFunction(GVar, F) && 3003 !(Size <= 4 && allUsersAreInFunctions(GVar))) 3004 return SDValue(); 3005 3006 // We're going to inline this global. Pad it out if needed. 
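  // For example, a 6-byte string has Size % 4 == 2, so RequiredPadding == 2
  // and two zero bytes are appended below, giving an 8-byte padded entry.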
3007 if (RequiredPadding != 4) { 3008 StringRef S = CDAInit->getAsString(); 3009 3010 SmallVector<uint8_t,16> V(S.size()); 3011 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3012 while (RequiredPadding--) 3013 V.push_back(0); 3014 Init = ConstantDataArray::get(*DAG.getContext(), V); 3015 } 3016 3017 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3018 SDValue CPAddr = 3019 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3020 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3021 AFI->markGlobalAsPromotedToConstantPool(GVar); 3022 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3023 PaddedSize - 4); 3024 } 3025 ++NumConstpoolPromoted; 3026 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3027 } 3028 3029 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3030 SelectionDAG &DAG) const { 3031 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3032 SDLoc dl(Op); 3033 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3034 const TargetMachine &TM = getTargetMachine(); 3035 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3036 GV = GA->getBaseObject(); 3037 bool IsRO = 3038 (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) || 3039 isa<Function>(GV); 3040 3041 // promoteToConstantPool only if not generating XO text section 3042 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3043 if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl)) 3044 return V; 3045 3046 if (isPositionIndependent()) { 3047 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3048 3049 MachineFunction &MF = DAG.getMachineFunction(); 3050 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3051 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3052 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3053 SDLoc dl(Op); 3054 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 3055 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( 3056 GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, 3057 UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, 3058 /*AddCurrentAddress=*/UseGOT_PREL); 3059 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3060 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3061 SDValue Result = DAG.getLoad( 3062 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3063 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3064 SDValue Chain = Result.getValue(1); 3065 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3066 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3067 if (UseGOT_PREL) 3068 Result = 3069 DAG.getLoad(PtrVT, dl, Chain, Result, 3070 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3071 return Result; 3072 } else if (Subtarget->isROPI() && IsRO) { 3073 // PC-relative. 3074 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3075 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3076 return Result; 3077 } else if (Subtarget->isRWPI() && !IsRO) { 3078 // SB-relative. 
3079 ARMConstantPoolValue *CPV = 3080 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3081 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3082 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3083 SDValue G = DAG.getLoad( 3084 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3085 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3086 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3087 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G); 3088 return Result; 3089 } 3090 3091 // If we have T2 ops, we can materialize the address directly via movt/movw 3092 // pair. This is always cheaper. 3093 if (Subtarget->useMovt(DAG.getMachineFunction())) { 3094 ++NumMovwMovt; 3095 // FIXME: Once remat is capable of dealing with instructions with register 3096 // operands, expand this into two nodes. 3097 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3098 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3099 } else { 3100 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3101 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3102 return DAG.getLoad( 3103 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3104 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3105 } 3106 } 3107 3108 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3109 SelectionDAG &DAG) const { 3110 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3111 "ROPI/RWPI not currently supported for Darwin"); 3112 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3113 SDLoc dl(Op); 3114 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3115 3116 if (Subtarget->useMovt(DAG.getMachineFunction())) 3117 ++NumMovwMovt; 3118 3119 // FIXME: Once remat is capable of dealing with instructions with register 3120 // operands, expand this into multiple nodes 3121 unsigned Wrapper = 3122 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3123 3124 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3125 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3126 3127 if (Subtarget->isGVIndirectSymbol(GV)) 3128 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3129 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3130 return Result; 3131 } 3132 3133 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3134 SelectionDAG &DAG) const { 3135 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3136 assert(Subtarget->useMovt(DAG.getMachineFunction()) && 3137 "Windows on ARM expects to use movw/movt"); 3138 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3139 "ROPI/RWPI not currently supported for Windows"); 3140 3141 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3142 const ARMII::TOF TargetFlags = 3143 (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); 3144 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3145 SDValue Result; 3146 SDLoc DL(Op); 3147 3148 ++NumMovwMovt; 3149 3150 // FIXME: Once remat is capable of dealing with instructions with register 3151 // operands, expand this into two nodes. 
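  // Roughly, the wrapper below is selected to a movw/movt pair materializing
  // the symbol address; for a dllimport'ed global that symbol is the __imp_
  // thunk, and the extra load fetches the real address from the import
  // address table.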
3152 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3153 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 3154 TargetFlags)); 3155 if (GV->hasDLLImportStorageClass()) 3156 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3157 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3158 return Result; 3159 } 3160 3161 SDValue 3162 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3163 SDLoc dl(Op); 3164 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3165 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3166 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3167 Op.getOperand(1), Val); 3168 } 3169 3170 SDValue 3171 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3172 SDLoc dl(Op); 3173 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3174 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3175 } 3176 3177 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3178 SelectionDAG &DAG) const { 3179 SDLoc dl(Op); 3180 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3181 Op.getOperand(0)); 3182 } 3183 3184 SDValue 3185 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3186 const ARMSubtarget *Subtarget) const { 3187 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3188 SDLoc dl(Op); 3189 switch (IntNo) { 3190 default: return SDValue(); // Don't custom lower most intrinsics. 3191 case Intrinsic::thread_pointer: { 3192 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3193 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3194 } 3195 case Intrinsic::eh_sjlj_lsda: { 3196 MachineFunction &MF = DAG.getMachineFunction(); 3197 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3198 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3199 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3200 SDValue CPAddr; 3201 bool IsPositionIndependent = isPositionIndependent(); 3202 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3203 ARMConstantPoolValue *CPV = 3204 ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex, 3205 ARMCP::CPLSDA, PCAdj); 3206 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3207 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3208 SDValue Result = DAG.getLoad( 3209 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3210 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3211 3212 if (IsPositionIndependent) { 3213 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3214 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3215 } 3216 return Result; 3217 } 3218 case Intrinsic::arm_neon_vmulls: 3219 case Intrinsic::arm_neon_vmullu: { 3220 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3221 ? ARMISD::VMULLs : ARMISD::VMULLu; 3222 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3223 Op.getOperand(1), Op.getOperand(2)); 3224 } 3225 case Intrinsic::arm_neon_vminnm: 3226 case Intrinsic::arm_neon_vmaxnm: { 3227 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3228 ? ISD::FMINNUM : ISD::FMAXNUM; 3229 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3230 Op.getOperand(1), Op.getOperand(2)); 3231 } 3232 case Intrinsic::arm_neon_vminu: 3233 case Intrinsic::arm_neon_vmaxu: { 3234 if (Op.getValueType().isFloatingPoint()) 3235 return SDValue(); 3236 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3237 ? 
ISD::UMIN : ISD::UMAX; 3238 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3239 Op.getOperand(1), Op.getOperand(2)); 3240 } 3241 case Intrinsic::arm_neon_vmins: 3242 case Intrinsic::arm_neon_vmaxs: { 3243 // v{min,max}s is overloaded between signed integers and floats. 3244 if (!Op.getValueType().isFloatingPoint()) { 3245 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3246 ? ISD::SMIN : ISD::SMAX; 3247 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3248 Op.getOperand(1), Op.getOperand(2)); 3249 } 3250 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3251 ? ISD::FMINNAN : ISD::FMAXNAN; 3252 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3253 Op.getOperand(1), Op.getOperand(2)); 3254 } 3255 } 3256 } 3257 3258 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3259 const ARMSubtarget *Subtarget) { 3260 // FIXME: handle "fence singlethread" more efficiently. 3261 SDLoc dl(Op); 3262 if (!Subtarget->hasDataBarrier()) { 3263 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3264 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3265 // here. 3266 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3267 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3268 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3269 DAG.getConstant(0, dl, MVT::i32)); 3270 } 3271 3272 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3273 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3274 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3275 if (Subtarget->isMClass()) { 3276 // Only a full system barrier exists in the M-class architectures. 3277 Domain = ARM_MB::SY; 3278 } else if (Subtarget->preferISHSTBarriers() && 3279 Ord == AtomicOrdering::Release) { 3280 // Swift happens to implement ISHST barriers in a way that's compatible with 3281 // Release semantics but weaker than ISH so we'd be fools not to use 3282 // it. Beware: other processors probably don't! 3283 Domain = ARM_MB::ISHST; 3284 } 3285 3286 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3287 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3288 DAG.getConstant(Domain, dl, MVT::i32)); 3289 } 3290 3291 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3292 const ARMSubtarget *Subtarget) { 3293 // ARM pre v5TE and Thumb1 does not have preload instructions. 3294 if (!(Subtarget->isThumb2() || 3295 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3296 // Just preserve the chain. 3297 return Op.getOperand(0); 3298 3299 SDLoc dl(Op); 3300 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3301 if (!isRead && 3302 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3303 // ARMv7 with MP extension has PLDW. 3304 return Op.getOperand(0); 3305 3306 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3307 if (Subtarget->isThumb()) { 3308 // Invert the bits. 
3309 isRead = ~isRead & 1; 3310 isData = ~isData & 1; 3311 } 3312 3313 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3314 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3315 DAG.getConstant(isData, dl, MVT::i32)); 3316 } 3317 3318 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3319 MachineFunction &MF = DAG.getMachineFunction(); 3320 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3321 3322 // vastart just stores the address of the VarArgsFrameIndex slot into the 3323 // memory location argument. 3324 SDLoc dl(Op); 3325 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3326 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3327 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3328 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3329 MachinePointerInfo(SV)); 3330 } 3331 3332 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3333 CCValAssign &NextVA, 3334 SDValue &Root, 3335 SelectionDAG &DAG, 3336 const SDLoc &dl) const { 3337 MachineFunction &MF = DAG.getMachineFunction(); 3338 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3339 3340 const TargetRegisterClass *RC; 3341 if (AFI->isThumb1OnlyFunction()) 3342 RC = &ARM::tGPRRegClass; 3343 else 3344 RC = &ARM::GPRRegClass; 3345 3346 // Transform the arguments stored in physical registers into virtual ones. 3347 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3348 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3349 3350 SDValue ArgValue2; 3351 if (NextVA.isMemLoc()) { 3352 MachineFrameInfo &MFI = MF.getFrameInfo(); 3353 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3354 3355 // Create load node to retrieve arguments from the stack. 3356 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3357 ArgValue2 = DAG.getLoad( 3358 MVT::i32, dl, Root, FIN, 3359 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3360 } else { 3361 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3362 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3363 } 3364 if (!Subtarget->isLittle()) 3365 std::swap (ArgValue, ArgValue2); 3366 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3367 } 3368 3369 // The remaining GPRs hold either the beginning of variable-argument 3370 // data, or the beginning of an aggregate passed by value (usually 3371 // byval). Either way, we allocate stack slots adjacent to the data 3372 // provided by our caller, and store the unallocated registers there. 3373 // If this is a variadic function, the va_list pointer will begin with 3374 // these values; otherwise, this reassembles a (byval) structure that 3375 // was split between registers and memory. 3376 // Return: The frame index registers were stored into. 3377 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3378 const SDLoc &dl, SDValue &Chain, 3379 const Value *OrigArg, 3380 unsigned InRegsParamRecordIdx, 3381 int ArgOffset, unsigned ArgSize) const { 3382 // Currently, two use-cases possible: 3383 // Case #1. Non-var-args function, and we meet first byval parameter. 3384 // Setup first unallocated register as first byval register; 3385 // eat all remained registers 3386 // (these two actions are performed by HandleByVal method). 3387 // Then, here, we initialize stack frame with 3388 // "store-reg" instructions. 3389 // Case #2. Var-args function, that doesn't contain byval parameters. 
3390 // The same: eat all remained unallocated registers, 3391 // initialize stack frame. 3392 3393 MachineFunction &MF = DAG.getMachineFunction(); 3394 MachineFrameInfo &MFI = MF.getFrameInfo(); 3395 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3396 unsigned RBegin, REnd; 3397 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3398 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3399 } else { 3400 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3401 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3402 REnd = ARM::R4; 3403 } 3404 3405 if (REnd != RBegin) 3406 ArgOffset = -4 * (ARM::R4 - RBegin); 3407 3408 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3409 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3410 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3411 3412 SmallVector<SDValue, 4> MemOps; 3413 const TargetRegisterClass *RC = 3414 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3415 3416 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3417 unsigned VReg = MF.addLiveIn(Reg, RC); 3418 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3419 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3420 MachinePointerInfo(OrigArg, 4 * i)); 3421 MemOps.push_back(Store); 3422 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3423 } 3424 3425 if (!MemOps.empty()) 3426 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3427 return FrameIndex; 3428 } 3429 3430 // Setup stack frame, the va_list pointer will start from. 3431 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3432 const SDLoc &dl, SDValue &Chain, 3433 unsigned ArgOffset, 3434 unsigned TotalArgRegsSaveSize, 3435 bool ForceMutable) const { 3436 MachineFunction &MF = DAG.getMachineFunction(); 3437 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3438 3439 // Try to store any remaining integer argument regs 3440 // to their spots on the stack so that they may be loaded by dereferencing 3441 // the result of va_next. 3442 // If there is no regs to be stored, just point address after last 3443 // argument passed via stack. 3444 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3445 CCInfo.getInRegsParamsCount(), 3446 CCInfo.getNextStackOffset(), 4); 3447 AFI->setVarArgsFrameIndex(FrameIndex); 3448 } 3449 3450 SDValue ARMTargetLowering::LowerFormalArguments( 3451 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3452 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3453 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3454 MachineFunction &MF = DAG.getMachineFunction(); 3455 MachineFrameInfo &MFI = MF.getFrameInfo(); 3456 3457 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3458 3459 // Assign locations to all of the incoming arguments. 3460 SmallVector<CCValAssign, 16> ArgLocs; 3461 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3462 *DAG.getContext(), Prologue); 3463 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 3464 3465 SmallVector<SDValue, 16> ArgValues; 3466 SDValue ArgValue; 3467 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 3468 unsigned CurArgIdx = 0; 3469 3470 // Initially ArgRegsSaveSize is zero. 3471 // Then we increase this value each time we meet byval parameter. 3472 // We also increase this value in case of varargs function. 
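  // For example, in a variadic function whose fixed arguments occupy only r0,
  // r1-r3 are spilled to the save area and ArgRegsSaveSize ends up as 12.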
3473 AFI->setArgRegsSaveSize(0); 3474 3475 // Calculate the amount of stack space that we need to allocate to store 3476 // byval and variadic arguments that are passed in registers. 3477 // We need to know this before we allocate the first byval or variadic 3478 // argument, as they will be allocated a stack slot below the CFA (Canonical 3479 // Frame Address, the stack pointer at entry to the function). 3480 unsigned ArgRegBegin = ARM::R4; 3481 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3482 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3483 break; 3484 3485 CCValAssign &VA = ArgLocs[i]; 3486 unsigned Index = VA.getValNo(); 3487 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3488 if (!Flags.isByVal()) 3489 continue; 3490 3491 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3492 unsigned RBegin, REnd; 3493 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3494 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3495 3496 CCInfo.nextInRegsParam(); 3497 } 3498 CCInfo.rewindByValRegsInfo(); 3499 3500 int lastInsIndex = -1; 3501 if (isVarArg && MFI.hasVAStart()) { 3502 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3503 if (RegIdx != array_lengthof(GPRArgRegs)) 3504 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3505 } 3506 3507 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3508 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3509 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3510 3511 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3512 CCValAssign &VA = ArgLocs[i]; 3513 if (Ins[VA.getValNo()].isOrigArg()) { 3514 std::advance(CurOrigArg, 3515 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3516 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3517 } 3518 // Arguments stored in registers. 3519 if (VA.isRegLoc()) { 3520 EVT RegVT = VA.getLocVT(); 3521 3522 if (VA.needsCustom()) { 3523 // f64 and vector types are split up into multiple registers or 3524 // combinations of registers and stack slots. 3525 if (VA.getLocVT() == MVT::v2f64) { 3526 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3527 Chain, DAG, dl); 3528 VA = ArgLocs[++i]; // skip ahead to next loc 3529 SDValue ArgValue2; 3530 if (VA.isMemLoc()) { 3531 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 3532 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3533 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 3534 MachinePointerInfo::getFixedStack( 3535 DAG.getMachineFunction(), FI)); 3536 } else { 3537 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3538 Chain, DAG, dl); 3539 } 3540 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3541 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3542 ArgValue, ArgValue1, 3543 DAG.getIntPtrConstant(0, dl)); 3544 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3545 ArgValue, ArgValue2, 3546 DAG.getIntPtrConstant(1, dl)); 3547 } else 3548 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3549 3550 } else { 3551 const TargetRegisterClass *RC; 3552 3553 if (RegVT == MVT::f32) 3554 RC = &ARM::SPRRegClass; 3555 else if (RegVT == MVT::f64) 3556 RC = &ARM::DPRRegClass; 3557 else if (RegVT == MVT::v2f64) 3558 RC = &ARM::QPRRegClass; 3559 else if (RegVT == MVT::i32) 3560 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 3561 : &ARM::GPRRegClass; 3562 else 3563 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3564 3565 // Transform the arguments in physical registers into virtual ones. 
3566 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3567 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3568 } 3569 3570 // If this is an 8 or 16-bit value, it is really passed promoted 3571 // to 32 bits. Insert an assert[sz]ext to capture this, then 3572 // truncate to the right size. 3573 switch (VA.getLocInfo()) { 3574 default: llvm_unreachable("Unknown loc info!"); 3575 case CCValAssign::Full: break; 3576 case CCValAssign::BCvt: 3577 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3578 break; 3579 case CCValAssign::SExt: 3580 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3581 DAG.getValueType(VA.getValVT())); 3582 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3583 break; 3584 case CCValAssign::ZExt: 3585 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3586 DAG.getValueType(VA.getValVT())); 3587 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3588 break; 3589 } 3590 3591 InVals.push_back(ArgValue); 3592 3593 } else { // VA.isRegLoc() 3594 3595 // sanity check 3596 assert(VA.isMemLoc()); 3597 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3598 3599 int index = VA.getValNo(); 3600 3601 // Some Ins[] entries become multiple ArgLoc[] entries. 3602 // Process them only once. 3603 if (index != lastInsIndex) 3604 { 3605 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3606 // FIXME: For now, all byval parameter objects are marked mutable. 3607 // This can be changed with more analysis. 3608 // In case of tail call optimization mark all arguments mutable. 3609 // Since they could be overwritten by lowering of arguments in case of 3610 // a tail call. 3611 if (Flags.isByVal()) { 3612 assert(Ins[index].isOrigArg() && 3613 "Byval arguments cannot be implicit"); 3614 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 3615 3616 int FrameIndex = StoreByValRegs( 3617 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 3618 VA.getLocMemOffset(), Flags.getByValSize()); 3619 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 3620 CCInfo.nextInRegsParam(); 3621 } else { 3622 unsigned FIOffset = VA.getLocMemOffset(); 3623 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 3624 FIOffset, true); 3625 3626 // Create load nodes to retrieve arguments from the stack. 3627 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3628 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 3629 MachinePointerInfo::getFixedStack( 3630 DAG.getMachineFunction(), FI))); 3631 } 3632 lastInsIndex = index; 3633 } 3634 } 3635 } 3636 3637 // varargs 3638 if (isVarArg && MFI.hasVAStart()) 3639 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 3640 CCInfo.getNextStackOffset(), 3641 TotalArgRegsSaveSize); 3642 3643 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 3644 3645 return Chain; 3646 } 3647 3648 /// isFloatingPointZero - Return true if this is +0.0. 3649 static bool isFloatingPointZero(SDValue Op) { 3650 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 3651 return CFP->getValueAPF().isPosZero(); 3652 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 3653 // Maybe this has already been legalized into the constant pool? 
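    // i.e. check whether the load's address is (ARMISD::Wrapper (constant
    // pool entry)) and the pooled constant is +0.0.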
3654 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 3655 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 3656 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 3657 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 3658 return CFP->getValueAPF().isPosZero(); 3659 } 3660 } else if (Op->getOpcode() == ISD::BITCAST && 3661 Op->getValueType(0) == MVT::f64) { 3662 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 3663 // created by LowerConstantFP(). 3664 SDValue BitcastOp = Op->getOperand(0); 3665 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 3666 isNullConstant(BitcastOp->getOperand(0))) 3667 return true; 3668 } 3669 return false; 3670 } 3671 3672 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 3673 /// the given operands. 3674 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3675 SDValue &ARMcc, SelectionDAG &DAG, 3676 const SDLoc &dl) const { 3677 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3678 unsigned C = RHSC->getZExtValue(); 3679 if (!isLegalICmpImmediate(C)) { 3680 // Constant does not fit, try adjusting it by one? 3681 switch (CC) { 3682 default: break; 3683 case ISD::SETLT: 3684 case ISD::SETGE: 3685 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 3686 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3687 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3688 } 3689 break; 3690 case ISD::SETULT: 3691 case ISD::SETUGE: 3692 if (C != 0 && isLegalICmpImmediate(C-1)) { 3693 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3694 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3695 } 3696 break; 3697 case ISD::SETLE: 3698 case ISD::SETGT: 3699 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 3700 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 3701 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3702 } 3703 break; 3704 case ISD::SETULE: 3705 case ISD::SETUGT: 3706 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 3707 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3708 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3709 } 3710 break; 3711 } 3712 } 3713 } 3714 3715 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3716 ARMISD::NodeType CompareType; 3717 switch (CondCode) { 3718 default: 3719 CompareType = ARMISD::CMP; 3720 break; 3721 case ARMCC::EQ: 3722 case ARMCC::NE: 3723 // Uses only Z Flag 3724 CompareType = ARMISD::CMPZ; 3725 break; 3726 } 3727 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3728 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 3729 } 3730 3731 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 3732 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 3733 SelectionDAG &DAG, const SDLoc &dl) const { 3734 assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); 3735 SDValue Cmp; 3736 if (!isFloatingPointZero(RHS)) 3737 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS); 3738 else 3739 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS); 3740 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 3741 } 3742 3743 /// duplicateCmp - Glue values can have only one use, so this function 3744 /// duplicates a comparison node. 
3745 SDValue 3746 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 3747 unsigned Opc = Cmp.getOpcode(); 3748 SDLoc DL(Cmp); 3749 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 3750 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3751 3752 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 3753 Cmp = Cmp.getOperand(0); 3754 Opc = Cmp.getOpcode(); 3755 if (Opc == ARMISD::CMPFP) 3756 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3757 else { 3758 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 3759 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 3760 } 3761 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 3762 } 3763 3764 std::pair<SDValue, SDValue> 3765 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 3766 SDValue &ARMcc) const { 3767 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 3768 3769 SDValue Value, OverflowCmp; 3770 SDValue LHS = Op.getOperand(0); 3771 SDValue RHS = Op.getOperand(1); 3772 SDLoc dl(Op); 3773 3774 // FIXME: We are currently always generating CMPs because we don't support 3775 // generating CMN through the backend. This is not as good as the natural 3776 // CMP case because it causes a register dependency and cannot be folded 3777 // later. 3778 3779 switch (Op.getOpcode()) { 3780 default: 3781 llvm_unreachable("Unknown overflow instruction!"); 3782 case ISD::SADDO: 3783 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3784 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3785 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3786 break; 3787 case ISD::UADDO: 3788 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3789 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3790 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3791 break; 3792 case ISD::SSUBO: 3793 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3794 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3795 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3796 break; 3797 case ISD::USUBO: 3798 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3799 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3800 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3801 break; 3802 } // switch (...) 3803 3804 return std::make_pair(Value, OverflowCmp); 3805 } 3806 3807 3808 SDValue 3809 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 3810 // Let legalize expand this if it isn't a legal type yet. 3811 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3812 return SDValue(); 3813 3814 SDValue Value, OverflowCmp; 3815 SDValue ARMcc; 3816 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 3817 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3818 SDLoc dl(Op); 3819 // We use 0 and 1 as false and true values. 
3820 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3821 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3822 EVT VT = Op.getValueType(); 3823 3824 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 3825 ARMcc, CCR, OverflowCmp); 3826 3827 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3828 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3829 } 3830 3831 3832 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 3833 SDValue Cond = Op.getOperand(0); 3834 SDValue SelectTrue = Op.getOperand(1); 3835 SDValue SelectFalse = Op.getOperand(2); 3836 SDLoc dl(Op); 3837 unsigned Opc = Cond.getOpcode(); 3838 3839 if (Cond.getResNo() == 1 && 3840 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 3841 Opc == ISD::USUBO)) { 3842 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 3843 return SDValue(); 3844 3845 SDValue Value, OverflowCmp; 3846 SDValue ARMcc; 3847 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 3848 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 3849 EVT VT = Op.getValueType(); 3850 3851 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 3852 OverflowCmp, DAG); 3853 } 3854 3855 // Convert: 3856 // 3857 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 3858 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 3859 // 3860 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 3861 const ConstantSDNode *CMOVTrue = 3862 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 3863 const ConstantSDNode *CMOVFalse = 3864 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 3865 3866 if (CMOVTrue && CMOVFalse) { 3867 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 3868 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 3869 3870 SDValue True; 3871 SDValue False; 3872 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 3873 True = SelectTrue; 3874 False = SelectFalse; 3875 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 3876 True = SelectFalse; 3877 False = SelectTrue; 3878 } 3879 3880 if (True.getNode() && False.getNode()) { 3881 EVT VT = Op.getValueType(); 3882 SDValue ARMcc = Cond.getOperand(2); 3883 SDValue CCR = Cond.getOperand(3); 3884 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 3885 assert(True.getValueType() == VT); 3886 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 3887 } 3888 } 3889 } 3890 3891 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 3892 // undefined bits before doing a full-word comparison with zero. 3893 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 3894 DAG.getConstant(1, dl, Cond.getValueType())); 3895 3896 return DAG.getSelectCC(dl, Cond, 3897 DAG.getConstant(0, dl, Cond.getValueType()), 3898 SelectTrue, SelectFalse, ISD::SETNE); 3899 } 3900 3901 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 3902 bool &swpCmpOps, bool &swpVselOps) { 3903 // Start by selecting the GE condition code for opcodes that return true for 3904 // 'equality' 3905 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 3906 CC == ISD::SETULE) 3907 CondCode = ARMCC::GE; 3908 3909 // and GT for opcodes that return false for 'equality'. 3910 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 3911 CC == ISD::SETULT) 3912 CondCode = ARMCC::GT; 3913 3914 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 3915 // to swap the compare operands. 
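  // e.g. SETOLT uses ARMCC::GT with the compare operands swapped, so "a < b"
  // is evaluated as "b > a".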
3916 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 3917 CC == ISD::SETULT) 3918 swpCmpOps = true; 3919 3920 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 3921 // If we have an unordered opcode, we need to swap the operands to the VSEL 3922 // instruction (effectively negating the condition). 3923 // 3924 // This also has the effect of swapping which one of 'less' or 'greater' 3925 // returns true, so we also swap the compare operands. It also switches 3926 // whether we return true for 'equality', so we compensate by picking the 3927 // opposite condition code to our original choice. 3928 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 3929 CC == ISD::SETUGT) { 3930 swpCmpOps = !swpCmpOps; 3931 swpVselOps = !swpVselOps; 3932 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 3933 } 3934 3935 // 'ordered' is 'anything but unordered', so use the VS condition code and 3936 // swap the VSEL operands. 3937 if (CC == ISD::SETO) { 3938 CondCode = ARMCC::VS; 3939 swpVselOps = true; 3940 } 3941 3942 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 3943 // code and swap the VSEL operands. 3944 if (CC == ISD::SETUNE) { 3945 CondCode = ARMCC::EQ; 3946 swpVselOps = true; 3947 } 3948 } 3949 3950 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 3951 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 3952 SDValue Cmp, SelectionDAG &DAG) const { 3953 if (Subtarget->isFPOnlySP() && VT == MVT::f64) { 3954 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3955 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 3956 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 3957 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 3958 3959 SDValue TrueLow = TrueVal.getValue(0); 3960 SDValue TrueHigh = TrueVal.getValue(1); 3961 SDValue FalseLow = FalseVal.getValue(0); 3962 SDValue FalseHigh = FalseVal.getValue(1); 3963 3964 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 3965 ARMcc, CCR, Cmp); 3966 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 3967 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 3968 3969 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 3970 } else { 3971 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 3972 Cmp); 3973 } 3974 } 3975 3976 static bool isGTorGE(ISD::CondCode CC) { 3977 return CC == ISD::SETGT || CC == ISD::SETGE; 3978 } 3979 3980 static bool isLTorLE(ISD::CondCode CC) { 3981 return CC == ISD::SETLT || CC == ISD::SETLE; 3982 } 3983 3984 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 3985 // All of these conditions (and their <= and >= counterparts) will do: 3986 // x < k ? k : x 3987 // x > k ? x : k 3988 // k < x ? x : k 3989 // k > x ? k : x 3990 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 3991 const SDValue TrueVal, const SDValue FalseVal, 3992 const ISD::CondCode CC, const SDValue K) { 3993 return (isGTorGE(CC) && 3994 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 3995 (isLTorLE(CC) && 3996 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 3997 } 3998 3999 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
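// e.g. "x > k ? k : x" and "x < k ? x : k" both clamp x from above at k.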
4000 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4001 const SDValue TrueVal, const SDValue FalseVal, 4002 const ISD::CondCode CC, const SDValue K) { 4003 return (isGTorGE(CC) && 4004 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4005 (isLTorLE(CC) && 4006 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4007 } 4008 4009 // Check if two chained conditionals could be converted into SSAT. 4010 // 4011 // SSAT can replace a set of two conditional selectors that bound a number to an 4012 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4013 // 4014 // x < -k ? -k : (x > k ? k : x) 4015 // x < -k ? -k : (x < k ? x : k) 4016 // x > -k ? (x > k ? k : x) : -k 4017 // x < k ? (x < -k ? -k : x) : k 4018 // etc. 4019 // 4020 // It returns true if the conversion can be done, false otherwise. 4021 // Additionally, the variable is returned in parameter V and the constant in K. 4022 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4023 uint64_t &K) { 4024 4025 SDValue LHS1 = Op.getOperand(0); 4026 SDValue RHS1 = Op.getOperand(1); 4027 SDValue TrueVal1 = Op.getOperand(2); 4028 SDValue FalseVal1 = Op.getOperand(3); 4029 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4030 4031 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4032 if (Op2.getOpcode() != ISD::SELECT_CC) 4033 return false; 4034 4035 SDValue LHS2 = Op2.getOperand(0); 4036 SDValue RHS2 = Op2.getOperand(1); 4037 SDValue TrueVal2 = Op2.getOperand(2); 4038 SDValue FalseVal2 = Op2.getOperand(3); 4039 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4040 4041 // Find out which are the constants and which are the variables 4042 // in each conditional 4043 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4044 ? &RHS1 4045 : NULL; 4046 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4047 ? &RHS2 4048 : NULL; 4049 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4050 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4051 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4052 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4053 4054 // We must detect cases where the original operations worked with 16- or 4055 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 4056 // must work with sign-extended values but the select operations return 4057 // the original non-extended value. 4058 SDValue V2TmpReg = V2Tmp; 4059 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4060 V2TmpReg = V2Tmp->getOperand(0); 4061 4062 // Check that the registers and the constants have the correct values 4063 // in both conditionals 4064 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4065 V2TmpReg != V2) 4066 return false; 4067 4068 // Figure out which conditional is saturating the lower/upper bound. 4069 const SDValue *LowerCheckOp = 4070 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4071 ? &Op 4072 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 4073 : NULL; 4074 const SDValue *UpperCheckOp = 4075 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4076 ? &Op 4077 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? 
&Op2 4078 : NULL; 4079 4080 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4081 return false; 4082 4083 // Check that the constant in the lower-bound check is 4084 // the opposite of the constant in the upper-bound check 4085 // in 1's complement. 4086 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4087 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4088 int64_t PosVal = std::max(Val1, Val2); 4089 4090 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4091 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4092 Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) { 4093 4094 V = V2; 4095 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4096 return true; 4097 } 4098 4099 return false; 4100 } 4101 4102 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4103 4104 EVT VT = Op.getValueType(); 4105 SDLoc dl(Op); 4106 4107 // Try to convert two saturating conditional selects into a single SSAT 4108 SDValue SatValue; 4109 uint64_t SatConstant; 4110 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4111 isSaturatingConditional(Op, SatValue, SatConstant)) 4112 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4113 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4114 4115 SDValue LHS = Op.getOperand(0); 4116 SDValue RHS = Op.getOperand(1); 4117 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4118 SDValue TrueVal = Op.getOperand(2); 4119 SDValue FalseVal = Op.getOperand(3); 4120 4121 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4122 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4123 dl); 4124 4125 // If softenSetCCOperands only returned one value, we should compare it to 4126 // zero. 4127 if (!RHS.getNode()) { 4128 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4129 CC = ISD::SETNE; 4130 } 4131 } 4132 4133 if (LHS.getValueType() == MVT::i32) { 4134 // Try to generate VSEL on ARMv8. 4135 // The VSEL instruction can't use all the usual ARM condition 4136 // codes: it only has two bits to select the condition code, so it's 4137 // constrained to use only GE, GT, VS and EQ. 4138 // 4139 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 4140 // swap the operands of the previous compare instruction (effectively 4141 // inverting the compare condition, swapping 'less' and 'greater') and 4142 // sometimes need to swap the operands to the VSEL (which inverts the 4143 // condition in the sense of firing whenever the previous condition didn't) 4144 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 4145 TrueVal.getValueType() == MVT::f64)) { 4146 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4147 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 4148 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 4149 CC = ISD::getSetCCInverse(CC, true); 4150 std::swap(TrueVal, FalseVal); 4151 } 4152 } 4153 4154 SDValue ARMcc; 4155 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4156 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4157 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4158 } 4159 4160 ARMCC::CondCodes CondCode, CondCode2; 4161 FPCCToARMCC(CC, CondCode, CondCode2); 4162 4163 // Try to generate VMAXNM/VMINNM on ARMv8. 
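  // Unlike VMAX/VMIN, the VMAXNM/VMINNM instructions implement IEEE-754
  // maxNum/minNum semantics: if exactly one operand is a quiet NaN, the
  // numeric operand is returned.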
4164 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 4165 TrueVal.getValueType() == MVT::f64)) { 4166 bool swpCmpOps = false; 4167 bool swpVselOps = false; 4168 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 4169 4170 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 4171 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 4172 if (swpCmpOps) 4173 std::swap(LHS, RHS); 4174 if (swpVselOps) 4175 std::swap(TrueVal, FalseVal); 4176 } 4177 } 4178 4179 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4180 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 4181 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4182 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4183 if (CondCode2 != ARMCC::AL) { 4184 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 4185 // FIXME: Needs another CMP because flag can have but one use. 4186 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); 4187 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 4188 } 4189 return Result; 4190 } 4191 4192 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 4193 /// to morph to an integer compare sequence. 4194 static bool canChangeToInt(SDValue Op, bool &SeenZero, 4195 const ARMSubtarget *Subtarget) { 4196 SDNode *N = Op.getNode(); 4197 if (!N->hasOneUse()) 4198 // Otherwise it requires moving the value from fp to integer registers. 4199 return false; 4200 if (!N->getNumValues()) 4201 return false; 4202 EVT VT = Op.getValueType(); 4203 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 4204 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 4205 // vmrs are very slow, e.g. cortex-a8. 4206 return false; 4207 4208 if (isFloatingPointZero(Op)) { 4209 SeenZero = true; 4210 return true; 4211 } 4212 return ISD::isNormalLoad(N); 4213 } 4214 4215 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 4216 if (isFloatingPointZero(Op)) 4217 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 4218 4219 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 4220 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 4221 Ld->getPointerInfo(), Ld->getAlignment(), 4222 Ld->getMemOperand()->getFlags()); 4223 4224 llvm_unreachable("Unknown VFP cmp argument!"); 4225 } 4226 4227 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 4228 SDValue &RetVal1, SDValue &RetVal2) { 4229 SDLoc dl(Op); 4230 4231 if (isFloatingPointZero(Op)) { 4232 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 4233 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 4234 return; 4235 } 4236 4237 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 4238 SDValue Ptr = Ld->getBasePtr(); 4239 RetVal1 = 4240 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 4241 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 4242 4243 EVT PtrType = Ptr.getValueType(); 4244 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 4245 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 4246 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 4247 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 4248 Ld->getPointerInfo().getWithOffset(4), NewAlign, 4249 Ld->getMemOperand()->getFlags()); 4250 return; 4251 } 4252 4253 llvm_unreachable("Unknown VFP cmp argument!"); 4254 } 4255 4256 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 4257 /// f32 and even f64 comparisons to integer ones. 
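/// For example, an f32 equality test against +0.0 can mask off the sign bit of
/// the value's integer bit pattern and compare the result against zero,
/// avoiding the VFP compare and the FMSTAT flag transfer.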
4258 SDValue 4259 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 4260 SDValue Chain = Op.getOperand(0); 4261 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4262 SDValue LHS = Op.getOperand(2); 4263 SDValue RHS = Op.getOperand(3); 4264 SDValue Dest = Op.getOperand(4); 4265 SDLoc dl(Op); 4266 4267 bool LHSSeenZero = false; 4268 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 4269 bool RHSSeenZero = false; 4270 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 4271 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 4272 // If unsafe fp math optimization is enabled and there are no other uses of 4273 // the CMP operands, and the condition code is EQ or NE, we can optimize it 4274 // to an integer comparison. 4275 if (CC == ISD::SETOEQ) 4276 CC = ISD::SETEQ; 4277 else if (CC == ISD::SETUNE) 4278 CC = ISD::SETNE; 4279 4280 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4281 SDValue ARMcc; 4282 if (LHS.getValueType() == MVT::f32) { 4283 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4284 bitcastf32Toi32(LHS, DAG), Mask); 4285 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4286 bitcastf32Toi32(RHS, DAG), Mask); 4287 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4288 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4289 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4290 Chain, Dest, ARMcc, CCR, Cmp); 4291 } 4292 4293 SDValue LHS1, LHS2; 4294 SDValue RHS1, RHS2; 4295 expandf64Toi32(LHS, DAG, LHS1, LHS2); 4296 expandf64Toi32(RHS, DAG, RHS1, RHS2); 4297 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 4298 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 4299 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4300 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4301 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4302 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 4303 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 4304 } 4305 4306 return SDValue(); 4307 } 4308 4309 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 4310 SDValue Chain = Op.getOperand(0); 4311 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4312 SDValue LHS = Op.getOperand(2); 4313 SDValue RHS = Op.getOperand(3); 4314 SDValue Dest = Op.getOperand(4); 4315 SDLoc dl(Op); 4316 4317 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4318 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4319 dl); 4320 4321 // If softenSetCCOperands only returned one value, we should compare it to 4322 // zero. 
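// (On a single-precision-only FPU the f64 compare has been turned into a
// libcall, e.g. __aeabi_dcmplt for SETOLT on AEABI targets, so LHS now
// holds the call's integer result and RHS may be empty.)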
4323 if (!RHS.getNode()) { 4324 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4325 CC = ISD::SETNE; 4326 } 4327 } 4328 4329 if (LHS.getValueType() == MVT::i32) { 4330 SDValue ARMcc; 4331 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4332 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4333 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4334 Chain, Dest, ARMcc, CCR, Cmp); 4335 } 4336 4337 assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); 4338 4339 if (getTargetMachine().Options.UnsafeFPMath && 4340 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 4341 CC == ISD::SETNE || CC == ISD::SETUNE)) { 4342 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 4343 return Result; 4344 } 4345 4346 ARMCC::CondCodes CondCode, CondCode2; 4347 FPCCToARMCC(CC, CondCode, CondCode2); 4348 4349 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4350 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); 4351 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4352 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4353 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 4354 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4355 if (CondCode2 != ARMCC::AL) { 4356 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 4357 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 4358 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4359 } 4360 return Res; 4361 } 4362 4363 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 4364 SDValue Chain = Op.getOperand(0); 4365 SDValue Table = Op.getOperand(1); 4366 SDValue Index = Op.getOperand(2); 4367 SDLoc dl(Op); 4368 4369 EVT PTy = getPointerTy(DAG.getDataLayout()); 4370 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 4371 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 4372 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 4373 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 4374 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); 4375 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 4376 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 4377 // which does another jump to the destination. This also makes it easier 4378 // to translate it to TBB / TBH later (Thumb2 only). 4379 // FIXME: This might not work if the function is extremely large. 
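// Schematically, on Thumb2 this can later become
//   tbb [pc, r_index]   ; followed by a table of byte-sized entries, each
//                       ; holding half the forward offset to its destination
// (or tbh with halfword entries) once a later pass has proved the offsets fit.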
4380 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 4381 Addr, Op.getOperand(2), JTI); 4382 } 4383 if (isPositionIndependent() || Subtarget->isROPI()) { 4384 Addr = 4385 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 4386 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4387 Chain = Addr.getValue(1); 4388 Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); 4389 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4390 } else { 4391 Addr = 4392 DAG.getLoad(PTy, dl, Chain, Addr, 4393 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4394 Chain = Addr.getValue(1); 4395 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4396 } 4397 } 4398 4399 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 4400 EVT VT = Op.getValueType(); 4401 SDLoc dl(Op); 4402 4403 if (Op.getValueType().getVectorElementType() == MVT::i32) { 4404 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 4405 return Op; 4406 return DAG.UnrollVectorOp(Op.getNode()); 4407 } 4408 4409 assert(Op.getOperand(0).getValueType() == MVT::v4f32 && 4410 "Invalid type for custom lowering!"); 4411 if (VT != MVT::v4i16) 4412 return DAG.UnrollVectorOp(Op.getNode()); 4413 4414 Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); 4415 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 4416 } 4417 4418 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 4419 EVT VT = Op.getValueType(); 4420 if (VT.isVector()) 4421 return LowerVectorFP_TO_INT(Op, DAG); 4422 if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { 4423 RTLIB::Libcall LC; 4424 if (Op.getOpcode() == ISD::FP_TO_SINT) 4425 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 4426 Op.getValueType()); 4427 else 4428 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 4429 Op.getValueType()); 4430 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4431 /*isSigned*/ false, SDLoc(Op)).first; 4432 } 4433 4434 return Op; 4435 } 4436 4437 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4438 EVT VT = Op.getValueType(); 4439 SDLoc dl(Op); 4440 4441 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 4442 if (VT.getVectorElementType() == MVT::f32) 4443 return Op; 4444 return DAG.UnrollVectorOp(Op.getNode()); 4445 } 4446 4447 assert(Op.getOperand(0).getValueType() == MVT::v4i16 && 4448 "Invalid type for custom lowering!"); 4449 if (VT != MVT::v4f32) 4450 return DAG.UnrollVectorOp(Op.getNode()); 4451 4452 unsigned CastOpc; 4453 unsigned Opc; 4454 switch (Op.getOpcode()) { 4455 default: llvm_unreachable("Invalid opcode!"); 4456 case ISD::SINT_TO_FP: 4457 CastOpc = ISD::SIGN_EXTEND; 4458 Opc = ISD::SINT_TO_FP; 4459 break; 4460 case ISD::UINT_TO_FP: 4461 CastOpc = ISD::ZERO_EXTEND; 4462 Opc = ISD::UINT_TO_FP; 4463 break; 4464 } 4465 4466 Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); 4467 return DAG.getNode(Opc, dl, VT, Op); 4468 } 4469 4470 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 4471 EVT VT = Op.getValueType(); 4472 if (VT.isVector()) 4473 return LowerVectorINT_TO_FP(Op, DAG); 4474 if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { 4475 RTLIB::Libcall LC; 4476 if (Op.getOpcode() == ISD::SINT_TO_FP) 4477 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 4478 Op.getValueType()); 4479 else 4480 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 4481 Op.getValueType()); 4482 return makeLibCall(DAG, 
LC, Op.getValueType(), Op.getOperand(0), 4483 /*isSigned*/ false, SDLoc(Op)).first; 4484 } 4485 4486 return Op; 4487 } 4488 4489 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 4490 // Implement fcopysign with a fabs and a conditional fneg. 4491 SDValue Tmp0 = Op.getOperand(0); 4492 SDValue Tmp1 = Op.getOperand(1); 4493 SDLoc dl(Op); 4494 EVT VT = Op.getValueType(); 4495 EVT SrcVT = Tmp1.getValueType(); 4496 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 4497 Tmp0.getOpcode() == ARMISD::VMOVDRR; 4498 bool UseNEON = !InGPR && Subtarget->hasNEON(); 4499 4500 if (UseNEON) { 4501 // Use VBSL to copy the sign bit. 4502 unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); 4503 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 4504 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 4505 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 4506 if (VT == MVT::f64) 4507 Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4508 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 4509 DAG.getConstant(32, dl, MVT::i32)); 4510 else /*if (VT == MVT::f32)*/ 4511 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 4512 if (SrcVT == MVT::f32) { 4513 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 4514 if (VT == MVT::f64) 4515 Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4516 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 4517 DAG.getConstant(32, dl, MVT::i32)); 4518 } else if (VT == MVT::f32) 4519 Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, 4520 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 4521 DAG.getConstant(32, dl, MVT::i32)); 4522 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 4523 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 4524 4525 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), 4526 dl, MVT::i32); 4527 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 4528 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 4529 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 4530 4531 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 4532 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 4533 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 4534 if (VT == MVT::f32) { 4535 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 4536 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 4537 DAG.getConstant(0, dl, MVT::i32)); 4538 } else { 4539 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 4540 } 4541 4542 return Res; 4543 } 4544 4545 // Bitcast operand 1 to i32. 4546 if (SrcVT == MVT::f64) 4547 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4548 Tmp1).getValue(1); 4549 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 4550 4551 // Or in the signbit with integer operations. 4552 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 4553 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4554 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 4555 if (VT == MVT::f32) { 4556 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 4557 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 4558 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 4559 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 4560 } 4561 4562 // f64: Or the high part with signbit and then combine two parts. 
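// i.e. result = VMOVDRR(lo32(Tmp0), (hi32(Tmp0) & 0x7fffffff) | signbit),
// where the sign bit of Tmp1 was already isolated into bit 31 above.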
4563 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4564 Tmp0); 4565 SDValue Lo = Tmp0.getValue(0); 4566 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 4567 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 4568 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 4569 } 4570 4571 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 4572 MachineFunction &MF = DAG.getMachineFunction(); 4573 MachineFrameInfo &MFI = MF.getFrameInfo(); 4574 MFI.setReturnAddressIsTaken(true); 4575 4576 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 4577 return SDValue(); 4578 4579 EVT VT = Op.getValueType(); 4580 SDLoc dl(Op); 4581 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4582 if (Depth) { 4583 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4584 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 4585 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 4586 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 4587 MachinePointerInfo()); 4588 } 4589 4590 // Return LR, which contains the return address. Mark it an implicit live-in. 4591 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 4592 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 4593 } 4594 4595 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 4596 const ARMBaseRegisterInfo &ARI = 4597 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 4598 MachineFunction &MF = DAG.getMachineFunction(); 4599 MachineFrameInfo &MFI = MF.getFrameInfo(); 4600 MFI.setFrameAddressIsTaken(true); 4601 4602 EVT VT = Op.getValueType(); 4603 SDLoc dl(Op); // FIXME probably not meaningful 4604 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4605 unsigned FrameReg = ARI.getFrameRegister(MF); 4606 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 4607 while (Depth--) 4608 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 4609 MachinePointerInfo()); 4610 return FrameAddr; 4611 } 4612 4613 // FIXME? Maybe this could be a TableGen attribute on some registers and 4614 // this table could be generated automatically from RegInfo. 4615 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 4616 SelectionDAG &DAG) const { 4617 unsigned Reg = StringSwitch<unsigned>(RegName) 4618 .Case("sp", ARM::SP) 4619 .Default(0); 4620 if (Reg) 4621 return Reg; 4622 report_fatal_error(Twine("Invalid register name \"" 4623 + StringRef(RegName) + "\".")); 4624 } 4625 4626 // Result is 64 bit value so split into two 32 bit values and return as a 4627 // pair of values. 4628 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 4629 SelectionDAG &DAG) { 4630 SDLoc DL(N); 4631 4632 // This function is only supposed to be called for i64 type destination. 4633 assert(N->getValueType(0) == MVT::i64 4634 && "ExpandREAD_REGISTER called for non-i64 type result."); 4635 4636 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 4637 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 4638 N->getOperand(0), 4639 N->getOperand(1)); 4640 4641 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 4642 Read.getValue(1))); 4643 Results.push_back(Read.getOperand(0)); 4644 } 4645 4646 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
/// When \p DstVT, the destination type of \p BC, is on the vector 4648 /// register bank and the source of the bitcast, \p Op, operates on the same bank, 4649 /// it might be possible to combine them, such that everything stays on the 4650 /// vector register bank. 4651 /// \return The node that would replace \p BC, if the combine 4652 /// is possible. 4653 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 4654 SelectionDAG &DAG) { 4655 SDValue Op = BC->getOperand(0); 4656 EVT DstVT = BC->getValueType(0); 4657 4658 // The only vector instruction that can produce a scalar (remember, 4659 // since the bitcast was about to be turned into VMOVDRR, the source 4660 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 4661 // Moreover, we can do this combine only if there is one use. 4662 // Finally, if the destination type is not a vector, there is not 4663 // much point in forcing everything on the vector bank. 4664 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 4665 !Op.hasOneUse()) 4666 return SDValue(); 4667 4668 // If the index is not constant, we will introduce an additional 4669 // multiply that will stick. 4670 // Give up in that case. 4671 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 4672 if (!Index) 4673 return SDValue(); 4674 unsigned DstNumElt = DstVT.getVectorNumElements(); 4675 4676 // Compute the new index. 4677 const APInt &APIntIndex = Index->getAPIntValue(); 4678 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 4679 NewIndex *= APIntIndex; 4680 // Check if the new constant index fits into i32. 4681 if (NewIndex.getBitWidth() > 32) 4682 return SDValue(); 4683 4684 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 4685 // vMTy extractsubvector(vNxMTy (bitcast vNi64 src), i32 index*M) 4686 SDLoc dl(Op); 4687 SDValue ExtractSrc = Op.getOperand(0); 4688 EVT VecVT = EVT::getVectorVT( 4689 *DAG.getContext(), DstVT.getScalarType(), 4690 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 4691 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 4692 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 4693 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 4694 } 4695 4696 /// ExpandBITCAST - If the target supports VFP, this function is called to 4697 /// expand a bit convert where either the source or destination type is i64 to 4698 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 4699 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 4700 /// vectors), since the legalizer won't know what to do with that. 4701 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { 4702 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4703 SDLoc dl(N); 4704 SDValue Op = N->getOperand(0); 4705 4706 // This function is only supposed to be called for i64 types, either as the 4707 // source or destination of the bit convert. 4708 EVT SrcVT = Op.getValueType(); 4709 EVT DstVT = N->getValueType(0); 4710 assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && 4711 "ExpandBITCAST called for non-i64 type"); 4712 4713 // Turn i64->f64 into VMOVDRR. 4714 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 4715 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 4716 // if we can combine the bitcast with its source.
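// e.g. (v2f32 bitcast (i64 extractelt v2i64 %v, 1)) becomes
//      (v2f32 extract_subvector (v4f32 bitcast %v), 2),
// so %v never has to be moved into a GPR pair.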
4717 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 4718 return Val; 4719 4720 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4721 DAG.getConstant(0, dl, MVT::i32)); 4722 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 4723 DAG.getConstant(1, dl, MVT::i32)); 4724 return DAG.getNode(ISD::BITCAST, dl, DstVT, 4725 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 4726 } 4727 4728 // Turn f64->i64 into VMOVRRD. 4729 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 4730 SDValue Cvt; 4731 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 4732 SrcVT.getVectorNumElements() > 1) 4733 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4734 DAG.getVTList(MVT::i32, MVT::i32), 4735 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 4736 else 4737 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 4738 DAG.getVTList(MVT::i32, MVT::i32), Op); 4739 // Merge the pieces into a single i64 value. 4740 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 4741 } 4742 4743 return SDValue(); 4744 } 4745 4746 /// getZeroVector - Returns a vector of specified type with all zero elements. 4747 /// Zero vectors are used to represent vector negation and in those cases 4748 /// will be implemented with the NEON VNEG instruction. However, VNEG does 4749 /// not support i64 elements, so sometimes the zero vectors will need to be 4750 /// explicitly constructed. Regardless, use a canonical VMOV to create the 4751 /// zero vector. 4752 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 4753 assert(VT.isVector() && "Expected a vector type"); 4754 // The canonical modified immediate encoding of a zero vector is....0! 4755 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 4756 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 4757 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 4758 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 4759 } 4760 4761 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4762 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 4763 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 4764 SelectionDAG &DAG) const { 4765 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4766 EVT VT = Op.getValueType(); 4767 unsigned VTBits = VT.getSizeInBits(); 4768 SDLoc dl(Op); 4769 SDValue ShOpLo = Op.getOperand(0); 4770 SDValue ShOpHi = Op.getOperand(1); 4771 SDValue ShAmt = Op.getOperand(2); 4772 SDValue ARMcc; 4773 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4774 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 4775 4776 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4777 4778 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4779 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4780 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4781 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4782 DAG.getConstant(VTBits, dl, MVT::i32)); 4783 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4784 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4785 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4786 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4787 ISD::SETGE, ARMcc, DAG, dl); 4788 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 4789 ARMcc, CCR, CmpLo); 4790 4791 4792 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4793 SDValue HiBigShift = Opc == ISD::SRA 4794 ? DAG.getNode(Opc, dl, VT, ShOpHi, 4795 DAG.getConstant(VTBits - 1, dl, VT)) 4796 : DAG.getConstant(0, dl, VT); 4797 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4798 ISD::SETGE, ARMcc, DAG, dl); 4799 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 4800 ARMcc, CCR, CmpHi); 4801 4802 SDValue Ops[2] = { Lo, Hi }; 4803 return DAG.getMergeValues(Ops, dl); 4804 } 4805 4806 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4807 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 4808 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 4809 SelectionDAG &DAG) const { 4810 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4811 EVT VT = Op.getValueType(); 4812 unsigned VTBits = VT.getSizeInBits(); 4813 SDLoc dl(Op); 4814 SDValue ShOpLo = Op.getOperand(0); 4815 SDValue ShOpHi = Op.getOperand(1); 4816 SDValue ShAmt = Op.getOperand(2); 4817 SDValue ARMcc; 4818 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4819 4820 assert(Op.getOpcode() == ISD::SHL_PARTS); 4821 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 4822 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 4823 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4824 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4825 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4826 4827 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 4828 DAG.getConstant(VTBits, dl, MVT::i32)); 4829 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4830 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4831 ISD::SETGE, ARMcc, DAG, dl); 4832 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 4833 ARMcc, CCR, CmpHi); 4834 4835 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 4836 ISD::SETGE, ARMcc, DAG, dl); 4837 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4838 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 4839 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 4840 4841 SDValue Ops[2] = { Lo, Hi }; 4842 return DAG.getMergeValues(Ops, dl); 4843 } 4844 4845 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 4846 SelectionDAG &DAG) const { 4847 // The rounding mode is in bits 23:22 of the FPSCR. 4848 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 4849 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 4850 // so that the shift + and get folded into a bitfield extract. 
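// Worked example: if FPSCR.RMode is 0b10 (round towards minus infinity),
// bits 23:22 of (FPSCR + (1 << 22)) are 0b11, so the result is 3, the
// FLT_ROUNDS encoding of "towards negative infinity".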
4851 SDLoc dl(Op); 4852 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, 4853 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, 4854 MVT::i32)); 4855 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 4856 DAG.getConstant(1U << 22, dl, MVT::i32)); 4857 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 4858 DAG.getConstant(22, dl, MVT::i32)); 4859 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 4860 DAG.getConstant(3, dl, MVT::i32)); 4861 } 4862 4863 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 4864 const ARMSubtarget *ST) { 4865 SDLoc dl(N); 4866 EVT VT = N->getValueType(0); 4867 if (VT.isVector()) { 4868 assert(ST->hasNEON()); 4869 4870 // Compute the least significant set bit: LSB = X & -X 4871 SDValue X = N->getOperand(0); 4872 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 4873 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 4874 4875 EVT ElemTy = VT.getVectorElementType(); 4876 4877 if (ElemTy == MVT::i8) { 4878 // Compute with: cttz(x) = ctpop(lsb - 1) 4879 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4880 DAG.getTargetConstant(1, dl, ElemTy)); 4881 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4882 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 4883 } 4884 4885 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 4886 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 4887 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 4888 unsigned NumBits = ElemTy.getSizeInBits(); 4889 SDValue WidthMinus1 = 4890 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4891 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 4892 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 4893 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 4894 } 4895 4896 // Compute with: cttz(x) = ctpop(lsb - 1) 4897 4898 // Since we can only compute the number of bits in a byte with vcnt.8, we 4899 // have to gather the result with pairwise addition (vpaddl) for i16, i32, 4900 // and i64. 4901 4902 // Compute LSB - 1. 4903 SDValue Bits; 4904 if (ElemTy == MVT::i64) { 4905 // Load constant 0xffff'ffff'ffff'ffff to register. 4906 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4907 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 4908 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 4909 } else { 4910 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 4911 DAG.getTargetConstant(1, dl, ElemTy)); 4912 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 4913 } 4914 4915 // Count #bits with vcnt.8. 4916 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4917 SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); 4918 SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); 4919 4920 // Gather the #bits with vpaddl (pairwise add.) 4921 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 4922 SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, 4923 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4924 Cnt8); 4925 if (ElemTy == MVT::i16) 4926 return Cnt16; 4927 4928 EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; 4929 SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, 4930 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4931 Cnt16); 4932 if (ElemTy == MVT::i32) 4933 return Cnt32; 4934 4935 assert(ElemTy == MVT::i64); 4936 SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 4937 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 4938 Cnt32); 4939 return Cnt64; 4940 } 4941 4942 if (!ST->hasV6T2Ops()) 4943 return SDValue(); 4944 4945 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 4946 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 4947 } 4948 4949 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count 4950 /// for each 16-bit element from operand, repeated. The basic idea is to 4951 /// leverage vcnt to get the 8-bit counts, gather and add the results. 4952 /// 4953 /// Trace for v4i16: 4954 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4955 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) 4956 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) 4957 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 4958 /// [b0 b1 b2 b3 b4 b5 b6 b7] 4959 /// +[b1 b0 b3 b2 b5 b4 b7 b6] 4960 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, 4961 /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) 4962 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { 4963 EVT VT = N->getValueType(0); 4964 SDLoc DL(N); 4965 4966 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4967 SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); 4968 SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); 4969 SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); 4970 SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); 4971 return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); 4972 } 4973 4974 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the 4975 /// bit-count for each 16-bit element from the operand. We need slightly 4976 /// different sequencing for v4i16 and v8i16 to stay within NEON's available 4977 /// 64/128-bit registers. 4978 /// 4979 /// Trace for v4i16: 4980 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 4981 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) 4982 /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] 4983 /// v4i16:Extracted = [k0 k1 k2 k3 ] 4984 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { 4985 EVT VT = N->getValueType(0); 4986 SDLoc DL(N); 4987 4988 SDValue BitCounts = getCTPOP16BitCounts(N, DAG); 4989 if (VT.is64BitVector()) { 4990 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); 4991 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, 4992 DAG.getIntPtrConstant(0, DL)); 4993 } else { 4994 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, 4995 BitCounts, DAG.getIntPtrConstant(0, DL)); 4996 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); 4997 } 4998 } 4999 5000 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the 5001 /// bit-count for each 32-bit element from the operand. The idea here is 5002 /// to split the vector into 16-bit elements, leverage the 16-bit count 5003 /// routine, and then combine the results. 
5004 /// 5005 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): 5006 /// input = [v0 v1 ] (vi: 32-bit elements) 5007 /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) 5008 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) 5009 /// vrev: N0 = [k1 k0 k3 k2 ] 5010 /// [k0 k1 k2 k3 ] 5011 /// N1 =+[k1 k0 k3 k2 ] 5012 /// [k0 k2 k1 k3 ] 5013 /// N2 =+[k1 k3 k0 k2 ] 5014 /// [k0 k2 k1 k3 ] 5015 /// Extended =+[k1 k3 k0 k2 ] 5016 /// [k0 k2 ] 5017 /// Extracted=+[k1 k3 ] 5018 /// 5019 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { 5020 EVT VT = N->getValueType(0); 5021 SDLoc DL(N); 5022 5023 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 5024 5025 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); 5026 SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); 5027 SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); 5028 SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); 5029 SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); 5030 5031 if (VT.is64BitVector()) { 5032 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); 5033 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, 5034 DAG.getIntPtrConstant(0, DL)); 5035 } else { 5036 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, 5037 DAG.getIntPtrConstant(0, DL)); 5038 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); 5039 } 5040 } 5041 5042 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 5043 const ARMSubtarget *ST) { 5044 EVT VT = N->getValueType(0); 5045 5046 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 5047 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || 5048 VT == MVT::v4i16 || VT == MVT::v8i16) && 5049 "Unexpected type for custom ctpop lowering"); 5050 5051 if (VT.getVectorElementType() == MVT::i32) 5052 return lowerCTPOP32BitElements(N, DAG); 5053 else 5054 return lowerCTPOP16BitElements(N, DAG); 5055 } 5056 5057 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 5058 const ARMSubtarget *ST) { 5059 EVT VT = N->getValueType(0); 5060 SDLoc dl(N); 5061 5062 if (!VT.isVector()) 5063 return SDValue(); 5064 5065 // Lower vector shifts on NEON to use VSHL. 5066 assert(ST->hasNEON() && "unexpected vector shift"); 5067 5068 // Left shifts translate directly to the vshiftu intrinsic. 5069 if (N->getOpcode() == ISD::SHL) 5070 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5071 DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, 5072 MVT::i32), 5073 N->getOperand(0), N->getOperand(1)); 5074 5075 assert((N->getOpcode() == ISD::SRA || 5076 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); 5077 5078 // NEON uses the same intrinsics for both left and right shifts. For 5079 // right shifts, the shift amounts are negative, so negate the vector of 5080 // shift amounts. 5081 EVT ShiftVT = N->getOperand(1).getValueType(); 5082 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, 5083 getZeroVector(ShiftVT, DAG, dl), 5084 N->getOperand(1)); 5085 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? 
5086 Intrinsic::arm_neon_vshifts : 5087 Intrinsic::arm_neon_vshiftu); 5088 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5089 DAG.getConstant(vshiftInt, dl, MVT::i32), 5090 N->getOperand(0), NegatedCount); 5091 } 5092 5093 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 5094 const ARMSubtarget *ST) { 5095 EVT VT = N->getValueType(0); 5096 SDLoc dl(N); 5097 5098 // We can get here for a node like i32 = ISD::SHL i32, i64 5099 if (VT != MVT::i64) 5100 return SDValue(); 5101 5102 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 5103 "Unknown shift to lower!"); 5104 5105 // We only lower SRA, SRL of 1 here, all others use generic lowering. 5106 if (!isOneConstant(N->getOperand(1))) 5107 return SDValue(); 5108 5109 // If we are in thumb mode, we don't have RRX. 5110 if (ST->isThumb1Only()) return SDValue(); 5111 5112 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 5113 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5114 DAG.getConstant(0, dl, MVT::i32)); 5115 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5116 DAG.getConstant(1, dl, MVT::i32)); 5117 5118 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 5119 // captures the result into a carry flag. 5120 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 5121 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 5122 5123 // The low part is an ARMISD::RRX operand, which shifts the carry in. 5124 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 5125 5126 // Merge the pieces into a single i64 value. 5127 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5128 } 5129 5130 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5131 SDValue TmpOp0, TmpOp1; 5132 bool Invert = false; 5133 bool Swap = false; 5134 unsigned Opc = 0; 5135 5136 SDValue Op0 = Op.getOperand(0); 5137 SDValue Op1 = Op.getOperand(1); 5138 SDValue CC = Op.getOperand(2); 5139 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 5140 EVT VT = Op.getValueType(); 5141 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5142 SDLoc dl(Op); 5143 5144 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 5145 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 5146 // Special-case integer 64-bit equality comparisons. They aren't legal, 5147 // but they can be lowered with a few vector instructions. 5148 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 5149 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 5150 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 5151 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 5152 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 5153 DAG.getCondCode(ISD::SETEQ)); 5154 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 5155 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 5156 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 5157 if (SetCCOpcode == ISD::SETNE) 5158 Merged = DAG.getNOT(dl, Merged, CmpVT); 5159 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 5160 return Merged; 5161 } 5162 5163 if (CmpVT.getVectorElementType() == MVT::i64) 5164 // 64-bit comparisons are not legal in general. 
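// (Anything other than the SETEQ/SETNE case handled above is left for the
// generic legalizer to expand.)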
5165 return SDValue(); 5166 5167 if (Op1.getValueType().isFloatingPoint()) { 5168 switch (SetCCOpcode) { 5169 default: llvm_unreachable("Illegal FP comparison"); 5170 case ISD::SETUNE: 5171 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5172 case ISD::SETOEQ: 5173 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5174 case ISD::SETOLT: 5175 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5176 case ISD::SETOGT: 5177 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5178 case ISD::SETOLE: 5179 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5180 case ISD::SETOGE: 5181 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5182 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 5183 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 5184 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 5185 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 5186 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 5187 case ISD::SETONE: 5188 // Expand this to (OLT | OGT). 5189 TmpOp0 = Op0; 5190 TmpOp1 = Op1; 5191 Opc = ISD::OR; 5192 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5193 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 5194 break; 5195 case ISD::SETUO: 5196 Invert = true; 5197 LLVM_FALLTHROUGH; 5198 case ISD::SETO: 5199 // Expand this to (OLT | OGE). 5200 TmpOp0 = Op0; 5201 TmpOp1 = Op1; 5202 Opc = ISD::OR; 5203 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5204 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 5205 break; 5206 } 5207 } else { 5208 // Integer comparisons. 5209 switch (SetCCOpcode) { 5210 default: llvm_unreachable("Illegal integer comparison"); 5211 case ISD::SETNE: Invert = true; 5212 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5213 case ISD::SETLT: Swap = true; 5214 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5215 case ISD::SETLE: Swap = true; 5216 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5217 case ISD::SETULT: Swap = true; 5218 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 5219 case ISD::SETULE: Swap = true; 5220 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 5221 } 5222 5223 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 5224 if (Opc == ARMISD::VCEQ) { 5225 5226 SDValue AndOp; 5227 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5228 AndOp = Op0; 5229 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 5230 AndOp = Op1; 5231 5232 // Ignore bitconvert. 5233 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 5234 AndOp = AndOp.getOperand(0); 5235 5236 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 5237 Opc = ARMISD::VTST; 5238 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 5239 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 5240 Invert = !Invert; 5241 } 5242 } 5243 } 5244 5245 if (Swap) 5246 std::swap(Op0, Op1); 5247 5248 // If one of the operands is a constant vector zero, attempt to fold the 5249 // comparison to a specialized compare-against-zero form. 
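// e.g. (setcc v4i32 %a, zeroinitializer, setge) can use VCGE.S32 Qd, Qm, #0
// rather than materializing a zero vector; with the zero on the left-hand
// side the comparison is flipped onto VCLE/VCLT against zero instead.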
5250 SDValue SingleOp; 5251 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5252 SingleOp = Op0; 5253 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 5254 if (Opc == ARMISD::VCGE) 5255 Opc = ARMISD::VCLEZ; 5256 else if (Opc == ARMISD::VCGT) 5257 Opc = ARMISD::VCLTZ; 5258 SingleOp = Op1; 5259 } 5260 5261 SDValue Result; 5262 if (SingleOp.getNode()) { 5263 switch (Opc) { 5264 case ARMISD::VCEQ: 5265 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 5266 case ARMISD::VCGE: 5267 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 5268 case ARMISD::VCLEZ: 5269 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 5270 case ARMISD::VCGT: 5271 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 5272 case ARMISD::VCLTZ: 5273 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 5274 default: 5275 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5276 } 5277 } else { 5278 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5279 } 5280 5281 Result = DAG.getSExtOrTrunc(Result, dl, VT); 5282 5283 if (Invert) 5284 Result = DAG.getNOT(dl, Result, VT); 5285 5286 return Result; 5287 } 5288 5289 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) { 5290 SDValue LHS = Op.getOperand(0); 5291 SDValue RHS = Op.getOperand(1); 5292 SDValue Carry = Op.getOperand(2); 5293 SDValue Cond = Op.getOperand(3); 5294 SDLoc DL(Op); 5295 5296 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); 5297 5298 assert(Carry.getOpcode() != ISD::CARRY_FALSE); 5299 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 5300 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 5301 5302 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 5303 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 5304 SDValue ARMcc = DAG.getConstant( 5305 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 5306 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5307 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 5308 Cmp.getValue(1), SDValue()); 5309 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 5310 CCR, Chain.getValue(1)); 5311 } 5312 5313 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 5314 /// valid vector constant for a NEON instruction with a "modified immediate" 5315 /// operand (e.g., VMOV). If so, return the encoded value. 5316 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 5317 unsigned SplatBitSize, SelectionDAG &DAG, 5318 const SDLoc &dl, EVT &VT, bool is128Bits, 5319 NEONModImmType type) { 5320 unsigned OpCmode, Imm; 5321 5322 // SplatBitSize is set to the smallest size that splats the vector, so a 5323 // zero vector will always have SplatBitSize == 8. However, NEON modified 5324 // immediate instructions others than VMOV do not support the 8-bit encoding 5325 // of a zero vector, and the default encoding of zero is supposed to be the 5326 // 32-bit version. 5327 if (SplatBits == 0) 5328 SplatBitSize = 32; 5329 5330 switch (SplatBitSize) { 5331 case 8: 5332 if (type != VMOVModImm) 5333 return SDValue(); 5334 // Any 1-byte value is OK. Op=0, Cmode=1110. 5335 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 5336 OpCmode = 0xe; 5337 Imm = SplatBits; 5338 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 5339 break; 5340 5341 case 16: 5342 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 5343 VT = is128Bits ? 
MVT::v8i16 : MVT::v4i16; 5344 if ((SplatBits & ~0xff) == 0) { 5345 // Value = 0x00nn: Op=x, Cmode=100x. 5346 OpCmode = 0x8; 5347 Imm = SplatBits; 5348 break; 5349 } 5350 if ((SplatBits & ~0xff00) == 0) { 5351 // Value = 0xnn00: Op=x, Cmode=101x. 5352 OpCmode = 0xa; 5353 Imm = SplatBits >> 8; 5354 break; 5355 } 5356 return SDValue(); 5357 5358 case 32: 5359 // NEON's 32-bit VMOV supports splat values where: 5360 // * only one byte is nonzero, or 5361 // * the least significant byte is 0xff and the second byte is nonzero, or 5362 // * the least significant 2 bytes are 0xff and the third is nonzero. 5363 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 5364 if ((SplatBits & ~0xff) == 0) { 5365 // Value = 0x000000nn: Op=x, Cmode=000x. 5366 OpCmode = 0; 5367 Imm = SplatBits; 5368 break; 5369 } 5370 if ((SplatBits & ~0xff00) == 0) { 5371 // Value = 0x0000nn00: Op=x, Cmode=001x. 5372 OpCmode = 0x2; 5373 Imm = SplatBits >> 8; 5374 break; 5375 } 5376 if ((SplatBits & ~0xff0000) == 0) { 5377 // Value = 0x00nn0000: Op=x, Cmode=010x. 5378 OpCmode = 0x4; 5379 Imm = SplatBits >> 16; 5380 break; 5381 } 5382 if ((SplatBits & ~0xff000000) == 0) { 5383 // Value = 0xnn000000: Op=x, Cmode=011x. 5384 OpCmode = 0x6; 5385 Imm = SplatBits >> 24; 5386 break; 5387 } 5388 5389 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 5390 if (type == OtherModImm) return SDValue(); 5391 5392 if ((SplatBits & ~0xffff) == 0 && 5393 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 5394 // Value = 0x0000nnff: Op=x, Cmode=1100. 5395 OpCmode = 0xc; 5396 Imm = SplatBits >> 8; 5397 break; 5398 } 5399 5400 if ((SplatBits & ~0xffffff) == 0 && 5401 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 5402 // Value = 0x00nnffff: Op=x, Cmode=1101. 5403 OpCmode = 0xd; 5404 Imm = SplatBits >> 16; 5405 break; 5406 } 5407 5408 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 5409 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 5410 // VMOV.I32. A (very) minor optimization would be to replicate the value 5411 // and fall through here to test for a valid 64-bit splat. But, then the 5412 // caller would also need to check and handle the change in size. 5413 return SDValue(); 5414 5415 case 64: { 5416 if (type != VMOVModImm) 5417 return SDValue(); 5418 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 5419 uint64_t BitMask = 0xff; 5420 uint64_t Val = 0; 5421 unsigned ImmMask = 1; 5422 Imm = 0; 5423 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 5424 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 5425 Val |= BitMask; 5426 Imm |= ImmMask; 5427 } else if ((SplatBits & BitMask) != 0) { 5428 return SDValue(); 5429 } 5430 BitMask <<= 8; 5431 ImmMask <<= 1; 5432 } 5433 5434 if (DAG.getDataLayout().isBigEndian()) 5435 // swap higher and lower 32 bit word 5436 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 5437 5438 // Op=1, Cmode=1110. 5439 OpCmode = 0x1e; 5440 VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; 5441 break; 5442 } 5443 5444 default: 5445 llvm_unreachable("unexpected size for isNEONModifiedImm"); 5446 } 5447 5448 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 5449 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 5450 } 5451 5452 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 5453 const ARMSubtarget *ST) const { 5454 bool IsDouble = Op.getValueType() == MVT::f64; 5455 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 5456 const APFloat &FPVal = CFP->getValueAPF(); 5457 5458 // Prevent floating-point constants from using literal loads 5459 // when execute-only is enabled. 5460 if (ST->genExecuteOnly()) { 5461 APInt INTVal = FPVal.bitcastToAPInt(); 5462 SDLoc DL(CFP); 5463 if (IsDouble) { 5464 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 5465 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 5466 if (!ST->isLittle()) 5467 std::swap(Lo, Hi); 5468 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 5469 } else { 5470 return DAG.getConstant(INTVal, DL, MVT::i32); 5471 } 5472 } 5473 5474 if (!ST->hasVFP3()) 5475 return SDValue(); 5476 5477 // Use the default (constant pool) lowering for double constants when we have 5478 // an SP-only FPU 5479 if (IsDouble && Subtarget->isFPOnlySP()) 5480 return SDValue(); 5481 5482 // Try splatting with a VMOV.f32... 5483 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 5484 5485 if (ImmVal != -1) { 5486 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 5487 // We have code in place to select a valid ConstantFP already, no need to 5488 // do any mangling. 5489 return Op; 5490 } 5491 5492 // It's a float and we are trying to use NEON operations where 5493 // possible. Lower it to a splat followed by an extract. 5494 SDLoc DL(Op); 5495 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 5496 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 5497 NewVal); 5498 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 5499 DAG.getConstant(0, DL, MVT::i32)); 5500 } 5501 5502 // The rest of our options are NEON only, make sure that's allowed before 5503 // proceeding.. 5504 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 5505 return SDValue(); 5506 5507 EVT VMovVT; 5508 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 5509 5510 // It wouldn't really be worth bothering for doubles except for one very 5511 // important value, which does happen to match: 0.0. So make sure we don't do 5512 // anything stupid. 5513 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 5514 return SDValue(); 5515 5516 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 5517 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 5518 VMovVT, false, VMOVModImm); 5519 if (NewVal != SDValue()) { 5520 SDLoc DL(Op); 5521 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 5522 NewVal); 5523 if (IsDouble) 5524 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5525 5526 // It's a float: cast and extract a vector element. 
5527 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5528 VecConstant); 5529 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5530 DAG.getConstant(0, DL, MVT::i32)); 5531 } 5532 5533 // Finally, try a VMVN.i32 5534 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 5535 false, VMVNModImm); 5536 if (NewVal != SDValue()) { 5537 SDLoc DL(Op); 5538 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 5539 5540 if (IsDouble) 5541 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5542 5543 // It's a float: cast and extract a vector element. 5544 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5545 VecConstant); 5546 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5547 DAG.getConstant(0, DL, MVT::i32)); 5548 } 5549 5550 return SDValue(); 5551 } 5552 5553 // check if an VEXT instruction can handle the shuffle mask when the 5554 // vector sources of the shuffle are the same. 5555 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 5556 unsigned NumElts = VT.getVectorNumElements(); 5557 5558 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5559 if (M[0] < 0) 5560 return false; 5561 5562 Imm = M[0]; 5563 5564 // If this is a VEXT shuffle, the immediate value is the index of the first 5565 // element. The other shuffle indices must be the successive elements after 5566 // the first one. 5567 unsigned ExpectedElt = Imm; 5568 for (unsigned i = 1; i < NumElts; ++i) { 5569 // Increment the expected index. If it wraps around, just follow it 5570 // back to index zero and keep going. 5571 ++ExpectedElt; 5572 if (ExpectedElt == NumElts) 5573 ExpectedElt = 0; 5574 5575 if (M[i] < 0) continue; // ignore UNDEF indices 5576 if (ExpectedElt != static_cast<unsigned>(M[i])) 5577 return false; 5578 } 5579 5580 return true; 5581 } 5582 5583 5584 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 5585 bool &ReverseVEXT, unsigned &Imm) { 5586 unsigned NumElts = VT.getVectorNumElements(); 5587 ReverseVEXT = false; 5588 5589 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5590 if (M[0] < 0) 5591 return false; 5592 5593 Imm = M[0]; 5594 5595 // If this is a VEXT shuffle, the immediate value is the index of the first 5596 // element. The other shuffle indices must be the successive elements after 5597 // the first one. 5598 unsigned ExpectedElt = Imm; 5599 for (unsigned i = 1; i < NumElts; ++i) { 5600 // Increment the expected index. If it wraps around, it may still be 5601 // a VEXT but the source vectors must be swapped. 5602 ExpectedElt += 1; 5603 if (ExpectedElt == NumElts * 2) { 5604 ExpectedElt = 0; 5605 ReverseVEXT = true; 5606 } 5607 5608 if (M[i] < 0) continue; // ignore UNDEF indices 5609 if (ExpectedElt != static_cast<unsigned>(M[i])) 5610 return false; 5611 } 5612 5613 // Adjust the index value if the source operands will be swapped. 5614 if (ReverseVEXT) 5615 Imm -= NumElts; 5616 5617 return true; 5618 } 5619 5620 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 5621 /// instruction with the specified blocksize. (The order of the elements 5622 /// within each block of the vector is reversed.) 
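/// For example, on a v4i32 the mask <1, 0, 3, 2> corresponds to VREV64.32
/// (the two 32-bit elements inside each 64-bit block are swapped), and on a
/// v8i16 the mask <3, 2, 1, 0, 7, 6, 5, 4> corresponds to VREV64.16.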
5623 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 5624 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 5625 "Only possible block sizes for VREV are: 16, 32, 64"); 5626 5627 unsigned EltSz = VT.getScalarSizeInBits(); 5628 if (EltSz == 64) 5629 return false; 5630 5631 unsigned NumElts = VT.getVectorNumElements(); 5632 unsigned BlockElts = M[0] + 1; 5633 // If the first shuffle index is UNDEF, be optimistic. 5634 if (M[0] < 0) 5635 BlockElts = BlockSize / EltSz; 5636 5637 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 5638 return false; 5639 5640 for (unsigned i = 0; i < NumElts; ++i) { 5641 if (M[i] < 0) continue; // ignore UNDEF indices 5642 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 5643 return false; 5644 } 5645 5646 return true; 5647 } 5648 5649 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 5650 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 5651 // range, then 0 is placed into the resulting vector. So pretty much any mask 5652 // of 8 elements can work here. 5653 return VT == MVT::v8i8 && M.size() == 8; 5654 } 5655 5656 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 5657 // checking that pairs of elements in the shuffle mask represent the same index 5658 // in each vector, incrementing the expected index by 2 at each step. 5659 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 5660 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 5661 // v2={e,f,g,h} 5662 // WhichResult gives the offset for each element in the mask based on which 5663 // of the two results it belongs to. 5664 // 5665 // The transpose can be represented either as: 5666 // result1 = shufflevector v1, v2, result1_shuffle_mask 5667 // result2 = shufflevector v1, v2, result2_shuffle_mask 5668 // where v1/v2 and the shuffle masks have the same number of elements 5669 // (here WhichResult (see below) indicates which result is being checked) 5670 // 5671 // or as: 5672 // results = shufflevector v1, v2, shuffle_mask 5673 // where both results are returned in one vector and the shuffle mask has twice 5674 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 5675 // want to check the low half and high half of the shuffle mask as if it were 5676 // the other case 5677 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5678 unsigned EltSz = VT.getScalarSizeInBits(); 5679 if (EltSz == 64) 5680 return false; 5681 5682 unsigned NumElts = VT.getVectorNumElements(); 5683 if (M.size() != NumElts && M.size() != NumElts*2) 5684 return false; 5685 5686 // If the mask is twice as long as the input vector then we need to check the 5687 // upper and lower parts of the mask with a matching value for WhichResult 5688 // FIXME: A mask with only even values will be rejected in case the first 5689 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only 5690 // M[0] is used to determine WhichResult 5691 for (unsigned i = 0; i < M.size(); i += NumElts) { 5692 if (M.size() == NumElts * 2) 5693 WhichResult = i / NumElts; 5694 else 5695 WhichResult = M[i] == 0 ? 
0 : 1; 5696 for (unsigned j = 0; j < NumElts; j += 2) { 5697 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5698 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 5699 return false; 5700 } 5701 } 5702 5703 if (M.size() == NumElts*2) 5704 WhichResult = 0; 5705 5706 return true; 5707 } 5708 5709 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 5710 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5711 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 5712 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5713 unsigned EltSz = VT.getScalarSizeInBits(); 5714 if (EltSz == 64) 5715 return false; 5716 5717 unsigned NumElts = VT.getVectorNumElements(); 5718 if (M.size() != NumElts && M.size() != NumElts*2) 5719 return false; 5720 5721 for (unsigned i = 0; i < M.size(); i += NumElts) { 5722 if (M.size() == NumElts * 2) 5723 WhichResult = i / NumElts; 5724 else 5725 WhichResult = M[i] == 0 ? 0 : 1; 5726 for (unsigned j = 0; j < NumElts; j += 2) { 5727 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 5728 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 5729 return false; 5730 } 5731 } 5732 5733 if (M.size() == NumElts*2) 5734 WhichResult = 0; 5735 5736 return true; 5737 } 5738 5739 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 5740 // that the mask elements are either all even and in steps of size 2 or all odd 5741 // and in steps of size 2. 5742 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 5743 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 5744 // v2={e,f,g,h} 5745 // Requires similar checks to that of isVTRNMask with 5746 // respect the how results are returned. 5747 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5748 unsigned EltSz = VT.getScalarSizeInBits(); 5749 if (EltSz == 64) 5750 return false; 5751 5752 unsigned NumElts = VT.getVectorNumElements(); 5753 if (M.size() != NumElts && M.size() != NumElts*2) 5754 return false; 5755 5756 for (unsigned i = 0; i < M.size(); i += NumElts) { 5757 WhichResult = M[i] == 0 ? 0 : 1; 5758 for (unsigned j = 0; j < NumElts; ++j) { 5759 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 5760 return false; 5761 } 5762 } 5763 5764 if (M.size() == NumElts*2) 5765 WhichResult = 0; 5766 5767 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5768 if (VT.is64BitVector() && EltSz == 32) 5769 return false; 5770 5771 return true; 5772 } 5773 5774 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 5775 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5776 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 5777 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5778 unsigned EltSz = VT.getScalarSizeInBits(); 5779 if (EltSz == 64) 5780 return false; 5781 5782 unsigned NumElts = VT.getVectorNumElements(); 5783 if (M.size() != NumElts && M.size() != NumElts*2) 5784 return false; 5785 5786 unsigned Half = NumElts / 2; 5787 for (unsigned i = 0; i < M.size(); i += NumElts) { 5788 WhichResult = M[i] == 0 ? 
0 : 1; 5789 for (unsigned j = 0; j < NumElts; j += Half) { 5790 unsigned Idx = WhichResult; 5791 for (unsigned k = 0; k < Half; ++k) { 5792 int MIdx = M[i + j + k]; 5793 if (MIdx >= 0 && (unsigned) MIdx != Idx) 5794 return false; 5795 Idx += 2; 5796 } 5797 } 5798 } 5799 5800 if (M.size() == NumElts*2) 5801 WhichResult = 0; 5802 5803 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5804 if (VT.is64BitVector() && EltSz == 32) 5805 return false; 5806 5807 return true; 5808 } 5809 5810 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 5811 // that pairs of elements of the shufflemask represent the same index in each 5812 // vector incrementing sequentially through the vectors. 5813 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 5814 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 5815 // v2={e,f,g,h} 5816 // Requires similar checks to that of isVTRNMask with respect the how results 5817 // are returned. 5818 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 5819 unsigned EltSz = VT.getScalarSizeInBits(); 5820 if (EltSz == 64) 5821 return false; 5822 5823 unsigned NumElts = VT.getVectorNumElements(); 5824 if (M.size() != NumElts && M.size() != NumElts*2) 5825 return false; 5826 5827 for (unsigned i = 0; i < M.size(); i += NumElts) { 5828 WhichResult = M[i] == 0 ? 0 : 1; 5829 unsigned Idx = WhichResult * NumElts / 2; 5830 for (unsigned j = 0; j < NumElts; j += 2) { 5831 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5832 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 5833 return false; 5834 Idx += 1; 5835 } 5836 } 5837 5838 if (M.size() == NumElts*2) 5839 WhichResult = 0; 5840 5841 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5842 if (VT.is64BitVector() && EltSz == 32) 5843 return false; 5844 5845 return true; 5846 } 5847 5848 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 5849 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 5850 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 5851 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 5852 unsigned EltSz = VT.getScalarSizeInBits(); 5853 if (EltSz == 64) 5854 return false; 5855 5856 unsigned NumElts = VT.getVectorNumElements(); 5857 if (M.size() != NumElts && M.size() != NumElts*2) 5858 return false; 5859 5860 for (unsigned i = 0; i < M.size(); i += NumElts) { 5861 WhichResult = M[i] == 0 ? 0 : 1; 5862 unsigned Idx = WhichResult * NumElts / 2; 5863 for (unsigned j = 0; j < NumElts; j += 2) { 5864 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 5865 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 5866 return false; 5867 Idx += 1; 5868 } 5869 } 5870 5871 if (M.size() == NumElts*2) 5872 WhichResult = 0; 5873 5874 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 5875 if (VT.is64BitVector() && EltSz == 32) 5876 return false; 5877 5878 return true; 5879 } 5880 5881 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 5882 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 
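/// For v4i32 operands v1={a,b,c,d}, v2={e,f,g,h} the recognised masks are,
/// illustratively:
///   VTRN: [0,4,2,6] -> {a,e,c,g}   and   [1,5,3,7] -> {b,f,d,h}
///   VUZP: [0,2,4,6] -> {a,c,e,g}   and   [1,3,5,7] -> {b,d,f,h}
///   VZIP: [0,4,1,5] -> {a,e,b,f}   and   [2,6,3,7] -> {c,g,d,h}
/// The *_v_undef forms accept the single-operand variants of the same masks,
/// e.g. <0,0,2,2> for VTRN.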
5883 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 5884 unsigned &WhichResult, 5885 bool &isV_UNDEF) { 5886 isV_UNDEF = false; 5887 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 5888 return ARMISD::VTRN; 5889 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 5890 return ARMISD::VUZP; 5891 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 5892 return ARMISD::VZIP; 5893 5894 isV_UNDEF = true; 5895 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5896 return ARMISD::VTRN; 5897 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5898 return ARMISD::VUZP; 5899 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 5900 return ARMISD::VZIP; 5901 5902 return 0; 5903 } 5904 5905 /// \return true if this is a reverse operation on an vector. 5906 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 5907 unsigned NumElts = VT.getVectorNumElements(); 5908 // Make sure the mask has the right size. 5909 if (NumElts != M.size()) 5910 return false; 5911 5912 // Look for <15, ..., 3, -1, 1, 0>. 5913 for (unsigned i = 0; i != NumElts; ++i) 5914 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 5915 return false; 5916 5917 return true; 5918 } 5919 5920 // If N is an integer constant that can be moved into a register in one 5921 // instruction, return an SDValue of such a constant (will become a MOV 5922 // instruction). Otherwise return null. 5923 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 5924 const ARMSubtarget *ST, const SDLoc &dl) { 5925 uint64_t Val; 5926 if (!isa<ConstantSDNode>(N)) 5927 return SDValue(); 5928 Val = cast<ConstantSDNode>(N)->getZExtValue(); 5929 5930 if (ST->isThumb1Only()) { 5931 if (Val <= 255 || ~Val <= 255) 5932 return DAG.getConstant(Val, dl, MVT::i32); 5933 } else { 5934 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 5935 return DAG.getConstant(Val, dl, MVT::i32); 5936 } 5937 return SDValue(); 5938 } 5939 5940 // If this is a case we can't handle, return null and let the default 5941 // expansion code take care of it. 5942 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 5943 const ARMSubtarget *ST) const { 5944 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 5945 SDLoc dl(Op); 5946 EVT VT = Op.getValueType(); 5947 5948 APInt SplatBits, SplatUndef; 5949 unsigned SplatBitSize; 5950 bool HasAnyUndefs; 5951 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5952 if (SplatUndef.isAllOnesValue()) 5953 return DAG.getUNDEF(VT); 5954 5955 if (SplatBitSize <= 64) { 5956 // Check if an immediate VMOV works. 5957 EVT VmovVT; 5958 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 5959 SplatUndef.getZExtValue(), SplatBitSize, 5960 DAG, dl, VmovVT, VT.is128BitVector(), 5961 VMOVModImm); 5962 if (Val.getNode()) { 5963 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 5964 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5965 } 5966 5967 // Try an immediate VMVN. 5968 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 5969 Val = isNEONModifiedImm(NegatedImm, 5970 SplatUndef.getZExtValue(), SplatBitSize, 5971 DAG, dl, VmovVT, VT.is128BitVector(), 5972 VMVNModImm); 5973 if (Val.getNode()) { 5974 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 5975 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5976 } 5977 5978 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 
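      // getFP32Imm only succeeds for the small set of values encodable as an
      // 8-bit VFP immediate (roughly +/-(n/16) * 2^e with n in [16,31] and
      // e in [-3,4], e.g. 1.0f, 0.5f, -2.0f); any other f32 splat falls
      // through and eventually hits the default constant-pool expansion.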
5979 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 5980 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 5981 if (ImmVal != -1) { 5982 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 5983 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 5984 } 5985 } 5986 } 5987 } 5988 5989 // Scan through the operands to see if only one value is used. 5990 // 5991 // As an optimisation, even if more than one value is used it may be more 5992 // profitable to splat with one value then change some lanes. 5993 // 5994 // Heuristically we decide to do this if the vector has a "dominant" value, 5995 // defined as splatted to more than half of the lanes. 5996 unsigned NumElts = VT.getVectorNumElements(); 5997 bool isOnlyLowElement = true; 5998 bool usesOnlyOneValue = true; 5999 bool hasDominantValue = false; 6000 bool isConstant = true; 6001 6002 // Map of the number of times a particular SDValue appears in the 6003 // element list. 6004 DenseMap<SDValue, unsigned> ValueCounts; 6005 SDValue Value; 6006 for (unsigned i = 0; i < NumElts; ++i) { 6007 SDValue V = Op.getOperand(i); 6008 if (V.isUndef()) 6009 continue; 6010 if (i > 0) 6011 isOnlyLowElement = false; 6012 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6013 isConstant = false; 6014 6015 ValueCounts.insert(std::make_pair(V, 0)); 6016 unsigned &Count = ValueCounts[V]; 6017 6018 // Is this value dominant? (takes up more than half of the lanes) 6019 if (++Count > (NumElts / 2)) { 6020 hasDominantValue = true; 6021 Value = V; 6022 } 6023 } 6024 if (ValueCounts.size() != 1) 6025 usesOnlyOneValue = false; 6026 if (!Value.getNode() && ValueCounts.size() > 0) 6027 Value = ValueCounts.begin()->first; 6028 6029 if (ValueCounts.size() == 0) 6030 return DAG.getUNDEF(VT); 6031 6032 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 6033 // Keep going if we are hitting this case. 6034 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 6035 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6036 6037 unsigned EltSize = VT.getScalarSizeInBits(); 6038 6039 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 6040 // i32 and try again. 6041 if (hasDominantValue && EltSize <= 32) { 6042 if (!isConstant) { 6043 SDValue N; 6044 6045 // If we are VDUPing a value that comes directly from a vector, that will 6046 // cause an unnecessary move to and from a GPR, where instead we could 6047 // just use VDUPLANE. We can only do this if the lane being extracted 6048 // is at a constant index, as the VDUP from lane instructions only have 6049 // constant-index forms. 6050 ConstantSDNode *constIndex; 6051 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6052 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 6053 // We need to create a new undef vector to use for the VDUPLANE if the 6054 // size of the vector from which we get the value is different than the 6055 // size of the vector that we need to create. We will insert the element 6056 // such that the register coalescer will remove unnecessary copies. 
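        // For example, when Value is "extractelement <2 x i32> %v, 1" and we
        // are building a v4i32 splat, the scalar is re-inserted into an undef
        // v4i32 at lane 1 and VDUPLANE'd from that lane; after coalescing this
        // is a single "vdup.32 q0, dN[1]" rather than a round trip through a
        // core register ("vmov.32 r0, dN[1]; vdup.32 q0, r0"). (Register
        // names here are illustrative only.)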
6057 if (VT != Value->getOperand(0).getValueType()) { 6058 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 6059 VT.getVectorNumElements(); 6060 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6061 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 6062 Value, DAG.getConstant(index, dl, MVT::i32)), 6063 DAG.getConstant(index, dl, MVT::i32)); 6064 } else 6065 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6066 Value->getOperand(0), Value->getOperand(1)); 6067 } else 6068 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 6069 6070 if (!usesOnlyOneValue) { 6071 // The dominant value was splatted as 'N', but we now have to insert 6072 // all differing elements. 6073 for (unsigned I = 0; I < NumElts; ++I) { 6074 if (Op.getOperand(I) == Value) 6075 continue; 6076 SmallVector<SDValue, 3> Ops; 6077 Ops.push_back(N); 6078 Ops.push_back(Op.getOperand(I)); 6079 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 6080 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 6081 } 6082 } 6083 return N; 6084 } 6085 if (VT.getVectorElementType().isFloatingPoint()) { 6086 SmallVector<SDValue, 8> Ops; 6087 for (unsigned i = 0; i < NumElts; ++i) 6088 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 6089 Op.getOperand(i))); 6090 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 6091 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6092 Val = LowerBUILD_VECTOR(Val, DAG, ST); 6093 if (Val.getNode()) 6094 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6095 } 6096 if (usesOnlyOneValue) { 6097 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 6098 if (isConstant && Val.getNode()) 6099 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 6100 } 6101 } 6102 6103 // If all elements are constants and the case above didn't get hit, fall back 6104 // to the default expansion, which will generate a load from the constant 6105 // pool. 6106 if (isConstant) 6107 return SDValue(); 6108 6109 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6110 if (NumElts >= 4) { 6111 SDValue shuffle = ReconstructShuffle(Op, DAG); 6112 if (shuffle != SDValue()) 6113 return shuffle; 6114 } 6115 6116 if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 6117 // If we haven't found an efficient lowering, try splitting a 128-bit vector 6118 // into two 64-bit vectors; we might discover a better way to lower it. 6119 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 6120 EVT ExtVT = VT.getVectorElementType(); 6121 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 6122 SDValue Lower = 6123 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 6124 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 6125 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 6126 SDValue Upper = DAG.getBuildVector( 6127 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 6128 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 6129 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 6130 if (Lower && Upper) 6131 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 6132 } 6133 6134 // Vectors with 32- or 64-bit elements can be built by directly assigning 6135 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 6136 // will be legalized. 6137 if (EltSize >= 32) { 6138 // Do the expansion with floating-point types, since that is what the VFP 6139 // registers are defined to use, and since i64 is not legal. 
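    // For example, a v2i64 BUILD_VECTOR is rebuilt as an ARMISD::BUILD_VECTOR
    // of two f64 bitcasts, which instruction selection can place directly into
    // the D subregisters of a Q register without ever creating an i64 value.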
6140 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6141 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6142 SmallVector<SDValue, 8> Ops; 6143 for (unsigned i = 0; i < NumElts; ++i) 6144 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 6145 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6146 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6147 } 6148 6149 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6150 // know the default expansion would otherwise fall back on something even 6151 // worse. For a vector with one or two non-undef values, that's 6152 // scalar_to_vector for the elements followed by a shuffle (provided the 6153 // shuffle is valid for the target) and materialization element by element 6154 // on the stack followed by a load for everything else. 6155 if (!isConstant && !usesOnlyOneValue) { 6156 SDValue Vec = DAG.getUNDEF(VT); 6157 for (unsigned i = 0 ; i < NumElts; ++i) { 6158 SDValue V = Op.getOperand(i); 6159 if (V.isUndef()) 6160 continue; 6161 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 6162 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6163 } 6164 return Vec; 6165 } 6166 6167 return SDValue(); 6168 } 6169 6170 // Gather data to see if the operation can be modelled as a 6171 // shuffle in combination with VEXTs. 6172 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 6173 SelectionDAG &DAG) const { 6174 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6175 SDLoc dl(Op); 6176 EVT VT = Op.getValueType(); 6177 unsigned NumElts = VT.getVectorNumElements(); 6178 6179 struct ShuffleSourceInfo { 6180 SDValue Vec; 6181 unsigned MinElt; 6182 unsigned MaxElt; 6183 6184 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 6185 // be compatible with the shuffle we intend to construct. As a result 6186 // ShuffleVec will be some sliding window into the original Vec. 6187 SDValue ShuffleVec; 6188 6189 // Code should guarantee that element i in Vec starts at element "WindowBase 6190 // + i * WindowScale in ShuffleVec". 6191 int WindowBase; 6192 int WindowScale; 6193 6194 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6195 ShuffleSourceInfo(SDValue Vec) 6196 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 6197 WindowScale(1) {} 6198 }; 6199 6200 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6201 // node. 6202 SmallVector<ShuffleSourceInfo, 2> Sources; 6203 for (unsigned i = 0; i < NumElts; ++i) { 6204 SDValue V = Op.getOperand(i); 6205 if (V.isUndef()) 6206 continue; 6207 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 6208 // A shuffle can only come from building a vector from various 6209 // elements of other vectors. 6210 return SDValue(); 6211 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 6212 // Furthermore, shuffles require a constant mask, whereas extractelts 6213 // accept variable indices. 6214 return SDValue(); 6215 } 6216 6217 // Add this element source to the list if it's not already there. 6218 SDValue SourceVec = V.getOperand(0); 6219 auto Source = find(Sources, SourceVec); 6220 if (Source == Sources.end()) 6221 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 6222 6223 // Update the minimum and maximum lane number seen. 
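    // MinElt/MaxElt record the window of source lanes this BUILD_VECTOR
    // actually reads; below they decide whether the source can be narrowed
    // with EXTRACT_SUBVECTOR or needs a VEXT, and whether the span is small
    // enough for a single VEXT window at all.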
6224 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 6225 Source->MinElt = std::min(Source->MinElt, EltNo); 6226 Source->MaxElt = std::max(Source->MaxElt, EltNo); 6227 } 6228 6229 // Currently only do something sane when at most two source vectors 6230 // are involved. 6231 if (Sources.size() > 2) 6232 return SDValue(); 6233 6234 // Find out the smallest element size among result and two sources, and use 6235 // it as element size to build the shuffle_vector. 6236 EVT SmallestEltTy = VT.getVectorElementType(); 6237 for (auto &Source : Sources) { 6238 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 6239 if (SrcEltTy.bitsLT(SmallestEltTy)) 6240 SmallestEltTy = SrcEltTy; 6241 } 6242 unsigned ResMultiplier = 6243 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 6244 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6245 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 6246 6247 // If the source vector is too wide or too narrow, we may nevertheless be able 6248 // to construct a compatible shuffle either by concatenating it with UNDEF or 6249 // extracting a suitable range of elements. 6250 for (auto &Src : Sources) { 6251 EVT SrcVT = Src.ShuffleVec.getValueType(); 6252 6253 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 6254 continue; 6255 6256 // This stage of the search produces a source with the same element type as 6257 // the original, but with a total width matching the BUILD_VECTOR output. 6258 EVT EltVT = SrcVT.getVectorElementType(); 6259 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 6260 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 6261 6262 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 6263 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 6264 return SDValue(); 6265 // We can pad out the smaller vector for free, so if it's part of a 6266 // shuffle... 6267 Src.ShuffleVec = 6268 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 6269 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 6270 continue; 6271 } 6272 6273 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 6274 return SDValue(); 6275 6276 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 6277 // Span too large for a VEXT to cope 6278 return SDValue(); 6279 } 6280 6281 if (Src.MinElt >= NumSrcElts) { 6282 // The extraction can just take the second half 6283 Src.ShuffleVec = 6284 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6285 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6286 Src.WindowBase = -NumSrcElts; 6287 } else if (Src.MaxElt < NumSrcElts) { 6288 // The extraction can just take the first half 6289 Src.ShuffleVec = 6290 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6291 DAG.getConstant(0, dl, MVT::i32)); 6292 } else { 6293 // An actual VEXT is needed 6294 SDValue VEXTSrc1 = 6295 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6296 DAG.getConstant(0, dl, MVT::i32)); 6297 SDValue VEXTSrc2 = 6298 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6299 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6300 6301 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 6302 VEXTSrc2, 6303 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 6304 Src.WindowBase = -Src.MinElt; 6305 } 6306 } 6307 6308 // Another possible incompatibility occurs from the vector element types. We 6309 // can fix this by bitcasting the source vectors to the same type we intend 6310 // for the shuffle. 
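  // For example, if the result element type is i8 but a source is v4i16, the
  // source is bitcast to v8i8 and gets WindowScale == 2: lane j of the
  // original source now occupies shuffle lanes 2*j and 2*j+1, and WindowBase
  // is rescaled to match.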
6311 for (auto &Src : Sources) { 6312 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 6313 if (SrcEltTy == SmallestEltTy) 6314 continue; 6315 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6316 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6317 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6318 Src.WindowBase *= Src.WindowScale; 6319 } 6320 6321 // Final sanity check before we try to actually produce a shuffle. 6322 DEBUG( 6323 for (auto Src : Sources) 6324 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 6325 ); 6326 6327 // The stars all align, our next step is to produce the mask for the shuffle. 6328 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6329 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 6330 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6331 SDValue Entry = Op.getOperand(i); 6332 if (Entry.isUndef()) 6333 continue; 6334 6335 auto Src = find(Sources, Entry.getOperand(0)); 6336 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6337 6338 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6339 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6340 // segment. 6341 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6342 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 6343 VT.getScalarSizeInBits()); 6344 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6345 6346 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6347 // starting at the appropriate offset. 6348 int *LaneMask = &Mask[i * ResMultiplier]; 6349 6350 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6351 ExtractBase += NumElts * (Src - Sources.begin()); 6352 for (int j = 0; j < LanesDefined; ++j) 6353 LaneMask[j] = ExtractBase + j; 6354 } 6355 6356 // Final check before we try to produce nonsense... 6357 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 6358 return SDValue(); 6359 6360 // We can't handle more than two sources. This should have already 6361 // been checked before this point. 6362 assert(Sources.size() <= 2 && "Too many sources!"); 6363 6364 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6365 for (unsigned i = 0; i < Sources.size(); ++i) 6366 ShuffleOps[i] = Sources[i].ShuffleVec; 6367 6368 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6369 ShuffleOps[1], Mask); 6370 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6371 } 6372 6373 /// isShuffleMaskLegal - Targets can use this to indicate that they only 6374 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6375 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6376 /// are assumed to be legal. 6377 bool 6378 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6379 EVT VT) const { 6380 if (VT.getVectorNumElements() == 4 && 6381 (VT.is128BitVector() || VT.is64BitVector())) { 6382 unsigned PFIndexes[4]; 6383 for (unsigned i = 0; i != 4; ++i) { 6384 if (M[i] < 0) 6385 PFIndexes[i] = 8; 6386 else 6387 PFIndexes[i] = M[i]; 6388 } 6389 6390 // Compute the index in the perfect shuffle table. 
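    // Each of the four mask entries is one base-9 digit (0-7 select a lane,
    // 8 means undef), so e.g. the VZIP-style mask <0,4,1,5> maps to entry
    // 0*729 + 4*81 + 1*9 + 5 == 338; the top two bits of each table entry
    // hold the cost in instructions.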
6391 unsigned PFTableIndex = 6392 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6393 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6394 unsigned Cost = (PFEntry >> 30); 6395 6396 if (Cost <= 4) 6397 return true; 6398 } 6399 6400 bool ReverseVEXT, isV_UNDEF; 6401 unsigned Imm, WhichResult; 6402 6403 unsigned EltSize = VT.getScalarSizeInBits(); 6404 return (EltSize >= 32 || 6405 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6406 isVREVMask(M, VT, 64) || 6407 isVREVMask(M, VT, 32) || 6408 isVREVMask(M, VT, 16) || 6409 isVEXTMask(M, VT, ReverseVEXT, Imm) || 6410 isVTBLMask(M, VT) || 6411 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 6412 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 6413 } 6414 6415 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6416 /// the specified operations to build the shuffle. 6417 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6418 SDValue RHS, SelectionDAG &DAG, 6419 const SDLoc &dl) { 6420 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6421 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 6422 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 6423 6424 enum { 6425 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6426 OP_VREV, 6427 OP_VDUP0, 6428 OP_VDUP1, 6429 OP_VDUP2, 6430 OP_VDUP3, 6431 OP_VEXT1, 6432 OP_VEXT2, 6433 OP_VEXT3, 6434 OP_VUZPL, // VUZP, left result 6435 OP_VUZPR, // VUZP, right result 6436 OP_VZIPL, // VZIP, left result 6437 OP_VZIPR, // VZIP, right result 6438 OP_VTRNL, // VTRN, left result 6439 OP_VTRNR // VTRN, right result 6440 }; 6441 6442 if (OpNum == OP_COPY) { 6443 if (LHSID == (1*9+2)*9+3) return LHS; 6444 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 6445 return RHS; 6446 } 6447 6448 SDValue OpLHS, OpRHS; 6449 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6450 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 6451 EVT VT = OpLHS.getValueType(); 6452 6453 switch (OpNum) { 6454 default: llvm_unreachable("Unknown shuffle opcode!"); 6455 case OP_VREV: 6456 // VREV divides the vector in half and swaps within the half. 
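    // In this 4-element setting every flavour implements the pair-swap mask
    // <1,0,3,2> ({a,b,c,d} -> {b,a,d,c}); the element width below only picks
    // whether that is a VREV64.32, VREV32.16 or VREV16.8.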
6457 if (VT.getVectorElementType() == MVT::i32 || 6458 VT.getVectorElementType() == MVT::f32) 6459 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 6460 // vrev <4 x i16> -> VREV32 6461 if (VT.getVectorElementType() == MVT::i16) 6462 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 6463 // vrev <4 x i8> -> VREV16 6464 assert(VT.getVectorElementType() == MVT::i8); 6465 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 6466 case OP_VDUP0: 6467 case OP_VDUP1: 6468 case OP_VDUP2: 6469 case OP_VDUP3: 6470 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6471 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 6472 case OP_VEXT1: 6473 case OP_VEXT2: 6474 case OP_VEXT3: 6475 return DAG.getNode(ARMISD::VEXT, dl, VT, 6476 OpLHS, OpRHS, 6477 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 6478 case OP_VUZPL: 6479 case OP_VUZPR: 6480 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 6481 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 6482 case OP_VZIPL: 6483 case OP_VZIPR: 6484 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 6485 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 6486 case OP_VTRNL: 6487 case OP_VTRNR: 6488 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 6489 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 6490 } 6491 } 6492 6493 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 6494 ArrayRef<int> ShuffleMask, 6495 SelectionDAG &DAG) { 6496 // Check to see if we can use the VTBL instruction. 6497 SDValue V1 = Op.getOperand(0); 6498 SDValue V2 = Op.getOperand(1); 6499 SDLoc DL(Op); 6500 6501 SmallVector<SDValue, 8> VTBLMask; 6502 for (ArrayRef<int>::iterator 6503 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 6504 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 6505 6506 if (V2.getNode()->isUndef()) 6507 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 6508 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6509 6510 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 6511 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6512 } 6513 6514 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 6515 SelectionDAG &DAG) { 6516 SDLoc DL(Op); 6517 SDValue OpLHS = Op.getOperand(0); 6518 EVT VT = OpLHS.getValueType(); 6519 6520 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 6521 "Expect an v8i16/v16i8 type"); 6522 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 6523 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 6524 // extract the first 8 bytes into the top double word and the last 8 bytes 6525 // into the bottom double word. The v8i16 case is similar. 6526 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 6527 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 6528 DAG.getConstant(ExtractNum, DL, MVT::i32)); 6529 } 6530 6531 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 6532 SDValue V1 = Op.getOperand(0); 6533 SDValue V2 = Op.getOperand(1); 6534 SDLoc dl(Op); 6535 EVT VT = Op.getValueType(); 6536 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 6537 6538 // Convert shuffles that are directly supported on NEON to target-specific 6539 // DAG nodes, instead of keeping them as shuffles and matching them again 6540 // during code selection. This is more efficient and avoids the possibility 6541 // of inconsistencies between legalization and selection. 6542 // FIXME: floating-point vectors should be canonicalized to integer vectors 6543 // of the same time so that they get CSEd properly. 
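  // Roughly, the checks below try, in order: splats (VDUP/VDUPLANE), VEXT,
  // VREV64/32/16, the two-result VTRN/VUZP/VZIP forms (also through a
  // CONCAT_VECTORS of the inputs), the 4-element perfect-shuffle table, an
  // element-wise ARMISD::BUILD_VECTOR for 32/64-bit elements, a VREV64+VEXT
  // sequence for whole-vector reverses of v8i16/v16i8, and finally VTBL for
  // v8i8.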
6544 ArrayRef<int> ShuffleMask = SVN->getMask(); 6545 6546 unsigned EltSize = VT.getScalarSizeInBits(); 6547 if (EltSize <= 32) { 6548 if (SVN->isSplat()) { 6549 int Lane = SVN->getSplatIndex(); 6550 // If this is undef splat, generate it via "just" vdup, if possible. 6551 if (Lane == -1) Lane = 0; 6552 6553 // Test if V1 is a SCALAR_TO_VECTOR. 6554 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 6555 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6556 } 6557 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 6558 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 6559 // reaches it). 6560 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 6561 !isa<ConstantSDNode>(V1.getOperand(0))) { 6562 bool IsScalarToVector = true; 6563 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 6564 if (!V1.getOperand(i).isUndef()) { 6565 IsScalarToVector = false; 6566 break; 6567 } 6568 if (IsScalarToVector) 6569 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6570 } 6571 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 6572 DAG.getConstant(Lane, dl, MVT::i32)); 6573 } 6574 6575 bool ReverseVEXT; 6576 unsigned Imm; 6577 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 6578 if (ReverseVEXT) 6579 std::swap(V1, V2); 6580 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 6581 DAG.getConstant(Imm, dl, MVT::i32)); 6582 } 6583 6584 if (isVREVMask(ShuffleMask, VT, 64)) 6585 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 6586 if (isVREVMask(ShuffleMask, VT, 32)) 6587 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 6588 if (isVREVMask(ShuffleMask, VT, 16)) 6589 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 6590 6591 if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 6592 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 6593 DAG.getConstant(Imm, dl, MVT::i32)); 6594 } 6595 6596 // Check for Neon shuffles that modify both input vectors in place. 6597 // If both results are used, i.e., if there are two shuffles with the same 6598 // source operands and with masks corresponding to both results of one of 6599 // these operations, DAG memoization will ensure that a single node is 6600 // used for both shuffles. 6601 unsigned WhichResult; 6602 bool isV_UNDEF; 6603 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 6604 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 6605 if (isV_UNDEF) 6606 V2 = V1; 6607 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 6608 .getValue(WhichResult); 6609 } 6610 6611 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 6612 // shuffles that produce a result larger than their operands with: 6613 // shuffle(concat(v1, undef), concat(v2, undef)) 6614 // -> 6615 // shuffle(concat(v1, v2), undef) 6616 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 6617 // 6618 // This is useful in the general case, but there are special cases where 6619 // native shuffles produce larger results: the two-result ops. 6620 // 6621 // Look through the concat when lowering them: 6622 // shuffle(concat(v1, v2), undef) 6623 // -> 6624 // concat(VZIP(v1, v2):0, :1) 6625 // 6626 if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 6627 SDValue SubV1 = V1->getOperand(0); 6628 SDValue SubV2 = V1->getOperand(1); 6629 EVT SubVT = SubV1.getValueType(); 6630 6631 // We expect these to have been canonicalized to -1. 
6632 assert(all_of(ShuffleMask, [&](int i) { 6633 return i < (int)VT.getVectorNumElements(); 6634 }) && "Unexpected shuffle index into UNDEF operand!"); 6635 6636 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 6637 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 6638 if (isV_UNDEF) 6639 SubV2 = SubV1; 6640 assert((WhichResult == 0) && 6641 "In-place shuffle of concat can only have one result!"); 6642 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 6643 SubV1, SubV2); 6644 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 6645 Res.getValue(1)); 6646 } 6647 } 6648 } 6649 6650 // If the shuffle is not directly supported and it has 4 elements, use 6651 // the PerfectShuffle-generated table to synthesize it from other shuffles. 6652 unsigned NumElts = VT.getVectorNumElements(); 6653 if (NumElts == 4) { 6654 unsigned PFIndexes[4]; 6655 for (unsigned i = 0; i != 4; ++i) { 6656 if (ShuffleMask[i] < 0) 6657 PFIndexes[i] = 8; 6658 else 6659 PFIndexes[i] = ShuffleMask[i]; 6660 } 6661 6662 // Compute the index in the perfect shuffle table. 6663 unsigned PFTableIndex = 6664 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6665 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6666 unsigned Cost = (PFEntry >> 30); 6667 6668 if (Cost <= 4) 6669 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 6670 } 6671 6672 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 6673 if (EltSize >= 32) { 6674 // Do the expansion with floating-point types, since that is what the VFP 6675 // registers are defined to use, and since i64 is not legal. 6676 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6677 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6678 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 6679 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 6680 SmallVector<SDValue, 8> Ops; 6681 for (unsigned i = 0; i < NumElts; ++i) { 6682 if (ShuffleMask[i] < 0) 6683 Ops.push_back(DAG.getUNDEF(EltVT)); 6684 else 6685 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6686 ShuffleMask[i] < (int)NumElts ? V1 : V2, 6687 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 6688 dl, MVT::i32))); 6689 } 6690 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6691 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6692 } 6693 6694 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 6695 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 6696 6697 if (VT == MVT::v8i8) 6698 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 6699 return NewOp; 6700 6701 return SDValue(); 6702 } 6703 6704 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6705 // INSERT_VECTOR_ELT is legal only for immediate indexes. 6706 SDValue Lane = Op.getOperand(2); 6707 if (!isa<ConstantSDNode>(Lane)) 6708 return SDValue(); 6709 6710 return Op; 6711 } 6712 6713 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 6714 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
6715 SDValue Lane = Op.getOperand(1); 6716 if (!isa<ConstantSDNode>(Lane)) 6717 return SDValue(); 6718 6719 SDValue Vec = Op.getOperand(0); 6720 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 6721 SDLoc dl(Op); 6722 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 6723 } 6724 6725 return Op; 6726 } 6727 6728 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6729 // The only time a CONCAT_VECTORS operation can have legal types is when 6730 // two 64-bit vectors are concatenated to a 128-bit vector. 6731 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 6732 "unexpected CONCAT_VECTORS"); 6733 SDLoc dl(Op); 6734 SDValue Val = DAG.getUNDEF(MVT::v2f64); 6735 SDValue Op0 = Op.getOperand(0); 6736 SDValue Op1 = Op.getOperand(1); 6737 if (!Op0.isUndef()) 6738 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6739 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 6740 DAG.getIntPtrConstant(0, dl)); 6741 if (!Op1.isUndef()) 6742 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 6743 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 6744 DAG.getIntPtrConstant(1, dl)); 6745 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 6746 } 6747 6748 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 6749 /// element has been zero/sign-extended, depending on the isSigned parameter, 6750 /// from an integer type half its size. 6751 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 6752 bool isSigned) { 6753 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 6754 EVT VT = N->getValueType(0); 6755 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 6756 SDNode *BVN = N->getOperand(0).getNode(); 6757 if (BVN->getValueType(0) != MVT::v4i32 || 6758 BVN->getOpcode() != ISD::BUILD_VECTOR) 6759 return false; 6760 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6761 unsigned HiElt = 1 - LoElt; 6762 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 6763 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 6764 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 6765 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 6766 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 6767 return false; 6768 if (isSigned) { 6769 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 6770 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 6771 return true; 6772 } else { 6773 if (Hi0->isNullValue() && Hi1->isNullValue()) 6774 return true; 6775 } 6776 return false; 6777 } 6778 6779 if (N->getOpcode() != ISD::BUILD_VECTOR) 6780 return false; 6781 6782 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 6783 SDNode *Elt = N->getOperand(i).getNode(); 6784 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 6785 unsigned EltSize = VT.getScalarSizeInBits(); 6786 unsigned HalfSize = EltSize / 2; 6787 if (isSigned) { 6788 if (!isIntN(HalfSize, C->getSExtValue())) 6789 return false; 6790 } else { 6791 if (!isUIntN(HalfSize, C->getZExtValue())) 6792 return false; 6793 } 6794 continue; 6795 } 6796 return false; 6797 } 6798 6799 return true; 6800 } 6801 6802 /// isSignExtended - Check if a node is a vector value that is sign-extended 6803 /// or a constant BUILD_VECTOR with sign-extended elements. 
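/// For example a (sign_extend v2i32 to v2i64) node, a SEXTLOAD of v4i16, or a
/// constant v4i32 BUILD_VECTOR whose elements all fit in i16 each count as
/// sign-extended here, which is what lets LowerMUL form a VMULL.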
6804 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 6805 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 6806 return true; 6807 if (isExtendedBUILD_VECTOR(N, DAG, true)) 6808 return true; 6809 return false; 6810 } 6811 6812 /// isZeroExtended - Check if a node is a vector value that is zero-extended 6813 /// or a constant BUILD_VECTOR with zero-extended elements. 6814 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 6815 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 6816 return true; 6817 if (isExtendedBUILD_VECTOR(N, DAG, false)) 6818 return true; 6819 return false; 6820 } 6821 6822 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 6823 if (OrigVT.getSizeInBits() >= 64) 6824 return OrigVT; 6825 6826 assert(OrigVT.isSimple() && "Expecting a simple value type"); 6827 6828 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 6829 switch (OrigSimpleTy) { 6830 default: llvm_unreachable("Unexpected Vector Type"); 6831 case MVT::v2i8: 6832 case MVT::v2i16: 6833 return MVT::v2i32; 6834 case MVT::v4i8: 6835 return MVT::v4i16; 6836 } 6837 } 6838 6839 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 6840 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 6841 /// We insert the required extension here to get the vector to fill a D register. 6842 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 6843 const EVT &OrigTy, 6844 const EVT &ExtTy, 6845 unsigned ExtOpcode) { 6846 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 6847 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 6848 // 64-bits we need to insert a new extension so that it will be 64-bits. 6849 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 6850 if (OrigTy.getSizeInBits() >= 64) 6851 return N; 6852 6853 // Must extend size to at least 64 bits to be used as an operand for VMULL. 6854 EVT NewVT = getExtensionTo64Bits(OrigTy); 6855 6856 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 6857 } 6858 6859 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 6860 /// does not do any sign/zero extension. If the original vector is less 6861 /// than 64 bits, an appropriate extension will be added after the load to 6862 /// reach a total size of 64 bits. We have to add the extension separately 6863 /// because ARM does not have a sign/zero extending load for vectors. 6864 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 6865 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 6866 6867 // The load already has the right type. 6868 if (ExtendedTy == LD->getMemoryVT()) 6869 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 6870 LD->getBasePtr(), LD->getPointerInfo(), 6871 LD->getAlignment(), LD->getMemOperand()->getFlags()); 6872 6873 // We need to create a zextload/sextload. We cannot just create a load 6874 // followed by a zext/zext node because LowerMUL is also run during normal 6875 // operation legalization where we can't create illegal types. 6876 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 6877 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 6878 LD->getMemoryVT(), LD->getAlignment(), 6879 LD->getMemOperand()->getFlags()); 6880 } 6881 6882 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 6883 /// extending load, or BUILD_VECTOR with extended elements, return the 6884 /// unextended value. 
The unextended vector should be 64 bits so that it can 6885 /// be used as an operand to a VMULL instruction. If the original vector size 6886 /// before extension is less than 64 bits we add a an extension to resize 6887 /// the vector to 64 bits. 6888 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 6889 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 6890 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 6891 N->getOperand(0)->getValueType(0), 6892 N->getValueType(0), 6893 N->getOpcode()); 6894 6895 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) 6896 return SkipLoadExtensionForVMULL(LD, DAG); 6897 6898 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 6899 // have been legalized as a BITCAST from v4i32. 6900 if (N->getOpcode() == ISD::BITCAST) { 6901 SDNode *BVN = N->getOperand(0).getNode(); 6902 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 6903 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 6904 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 6905 return DAG.getBuildVector( 6906 MVT::v2i32, SDLoc(N), 6907 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 6908 } 6909 // Construct a new BUILD_VECTOR with elements truncated to half the size. 6910 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 6911 EVT VT = N->getValueType(0); 6912 unsigned EltSize = VT.getScalarSizeInBits() / 2; 6913 unsigned NumElts = VT.getVectorNumElements(); 6914 MVT TruncVT = MVT::getIntegerVT(EltSize); 6915 SmallVector<SDValue, 8> Ops; 6916 SDLoc dl(N); 6917 for (unsigned i = 0; i != NumElts; ++i) { 6918 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 6919 const APInt &CInt = C->getAPIntValue(); 6920 // Element types smaller than 32 bits are not legal, so use i32 elements. 6921 // The values are implicitly truncated so sext vs. zext doesn't matter. 6922 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 6923 } 6924 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 6925 } 6926 6927 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 6928 unsigned Opcode = N->getOpcode(); 6929 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6930 SDNode *N0 = N->getOperand(0).getNode(); 6931 SDNode *N1 = N->getOperand(1).getNode(); 6932 return N0->hasOneUse() && N1->hasOneUse() && 6933 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 6934 } 6935 return false; 6936 } 6937 6938 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 6939 unsigned Opcode = N->getOpcode(); 6940 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 6941 SDNode *N0 = N->getOperand(0).getNode(); 6942 SDNode *N1 = N->getOperand(1).getNode(); 6943 return N0->hasOneUse() && N1->hasOneUse() && 6944 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 6945 } 6946 return false; 6947 } 6948 6949 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 6950 // Multiplications are only custom-lowered for 128-bit vectors so that 6951 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
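  // For example, (mul (sext v2i32 a), (sext v2i32 b)) : v2i64 becomes a single
  // VMULL.S32 of the two unextended d registers, and a zero-extended v4i16
  // pair becomes VMULL.U16; a v2i64 multiply with no extension pattern has to
  // be expanded instead.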
6952 EVT VT = Op.getValueType(); 6953 assert(VT.is128BitVector() && VT.isInteger() && 6954 "unexpected type for custom-lowering ISD::MUL"); 6955 SDNode *N0 = Op.getOperand(0).getNode(); 6956 SDNode *N1 = Op.getOperand(1).getNode(); 6957 unsigned NewOpc = 0; 6958 bool isMLA = false; 6959 bool isN0SExt = isSignExtended(N0, DAG); 6960 bool isN1SExt = isSignExtended(N1, DAG); 6961 if (isN0SExt && isN1SExt) 6962 NewOpc = ARMISD::VMULLs; 6963 else { 6964 bool isN0ZExt = isZeroExtended(N0, DAG); 6965 bool isN1ZExt = isZeroExtended(N1, DAG); 6966 if (isN0ZExt && isN1ZExt) 6967 NewOpc = ARMISD::VMULLu; 6968 else if (isN1SExt || isN1ZExt) { 6969 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 6970 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 6971 if (isN1SExt && isAddSubSExt(N0, DAG)) { 6972 NewOpc = ARMISD::VMULLs; 6973 isMLA = true; 6974 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 6975 NewOpc = ARMISD::VMULLu; 6976 isMLA = true; 6977 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 6978 std::swap(N0, N1); 6979 NewOpc = ARMISD::VMULLu; 6980 isMLA = true; 6981 } 6982 } 6983 6984 if (!NewOpc) { 6985 if (VT == MVT::v2i64) 6986 // Fall through to expand this. It is not legal. 6987 return SDValue(); 6988 else 6989 // Other vector multiplications are legal. 6990 return Op; 6991 } 6992 } 6993 6994 // Legalize to a VMULL instruction. 6995 SDLoc DL(Op); 6996 SDValue Op0; 6997 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 6998 if (!isMLA) { 6999 Op0 = SkipExtensionForVMULL(N0, DAG); 7000 assert(Op0.getValueType().is64BitVector() && 7001 Op1.getValueType().is64BitVector() && 7002 "unexpected types for extended operands to VMULL"); 7003 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 7004 } 7005 7006 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 7007 // isel lowering to take advantage of no-stall back to back vmul + vmla. 7008 // vmull q0, d4, d6 7009 // vmlal q0, d5, d6 7010 // is faster than 7011 // vaddl q0, d4, d5 7012 // vmovl q1, d6 7013 // vmul q0, q0, q1 7014 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 7015 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 7016 EVT Op1VT = Op1.getValueType(); 7017 return DAG.getNode(N0->getOpcode(), DL, VT, 7018 DAG.getNode(NewOpc, DL, VT, 7019 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 7020 DAG.getNode(NewOpc, DL, VT, 7021 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 7022 } 7023 7024 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 7025 SelectionDAG &DAG) { 7026 // TODO: Should this propagate fast-math-flags? 7027 7028 // Convert to float 7029 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 7030 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 7031 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 7032 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 7033 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 7034 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 7035 // Get reciprocal estimate. 7036 // float4 recip = vrecpeq_f32(yf); 7037 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7038 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7039 Y); 7040 // Because char has a smaller range than uchar, we can actually get away 7041 // without any newton steps. This requires that we use a weird bias 7042 // of 0xb000, however (again, this has been exhaustively tested). 
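  // Adding 0xb000 to the raw float bits nudges the magnitude of xf*recip up
  // by a fixed number of ulps, enough (per the exhaustive testing mentioned
  // above) that the truncating float->int conversion at the end recovers the
  // exact round-toward-zero quotient.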
7043 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 7044 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 7045 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 7046 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 7047 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 7048 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 7049 // Convert back to short. 7050 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 7051 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 7052 return X; 7053 } 7054 7055 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 7056 SelectionDAG &DAG) { 7057 // TODO: Should this propagate fast-math-flags? 7058 7059 SDValue N2; 7060 // Convert to float. 7061 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 7062 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 7063 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 7064 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 7065 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 7066 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 7067 7068 // Use reciprocal estimate and one refinement step. 7069 // float4 recip = vrecpeq_f32(yf); 7070 // recip *= vrecpsq_f32(yf, recip); 7071 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7072 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7073 N1); 7074 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7075 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7076 N1, N2); 7077 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7078 // Because short has a smaller range than ushort, we can actually get away 7079 // with only a single newton step. This requires that we use a weird bias 7080 // of 89, however (again, this has been exhaustively tested). 7081 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 7082 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 7083 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 7084 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 7085 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 7086 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 7087 // Convert back to integer and return. 
7088 // return vmovn_s32(vcvt_s32_f32(result)); 7089 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7090 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7091 return N0; 7092 } 7093 7094 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 7095 EVT VT = Op.getValueType(); 7096 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7097 "unexpected type for custom-lowering ISD::SDIV"); 7098 7099 SDLoc dl(Op); 7100 SDValue N0 = Op.getOperand(0); 7101 SDValue N1 = Op.getOperand(1); 7102 SDValue N2, N3; 7103 7104 if (VT == MVT::v8i8) { 7105 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 7106 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 7107 7108 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7109 DAG.getIntPtrConstant(4, dl)); 7110 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7111 DAG.getIntPtrConstant(4, dl)); 7112 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7113 DAG.getIntPtrConstant(0, dl)); 7114 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7115 DAG.getIntPtrConstant(0, dl)); 7116 7117 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 7118 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 7119 7120 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7121 N0 = LowerCONCAT_VECTORS(N0, DAG); 7122 7123 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 7124 return N0; 7125 } 7126 return LowerSDIV_v4i16(N0, N1, dl, DAG); 7127 } 7128 7129 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 7130 // TODO: Should this propagate fast-math-flags? 7131 EVT VT = Op.getValueType(); 7132 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7133 "unexpected type for custom-lowering ISD::UDIV"); 7134 7135 SDLoc dl(Op); 7136 SDValue N0 = Op.getOperand(0); 7137 SDValue N1 = Op.getOperand(1); 7138 SDValue N2, N3; 7139 7140 if (VT == MVT::v8i8) { 7141 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 7142 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 7143 7144 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7145 DAG.getIntPtrConstant(4, dl)); 7146 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7147 DAG.getIntPtrConstant(4, dl)); 7148 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7149 DAG.getIntPtrConstant(0, dl)); 7150 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7151 DAG.getIntPtrConstant(0, dl)); 7152 7153 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 7154 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 7155 7156 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7157 N0 = LowerCONCAT_VECTORS(N0, DAG); 7158 7159 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 7160 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 7161 MVT::i32), 7162 N0); 7163 return N0; 7164 } 7165 7166 // v4i16 sdiv ... Convert to float. 7167 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 7168 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 7169 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 7170 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 7171 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 7172 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 7173 7174 // Use reciprocal estimate and two refinement steps. 
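  // Each refinement is a Newton-Raphson step: vrecpsq_f32(y, r) computes
  // (2.0 - y*r), so r *= (2.0 - y*r) roughly doubles the number of correct
  // bits in the estimate; two steps leave the product accurate enough for an
  // exact 16-bit quotient once the +2 fixup below is applied.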
7175 // float4 recip = vrecpeq_f32(yf); 7176 // recip *= vrecpsq_f32(yf, recip); 7177 // recip *= vrecpsq_f32(yf, recip); 7178 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7179 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7180 BN1); 7181 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7182 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7183 BN1, N2); 7184 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7185 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7186 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7187 BN1, N2); 7188 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7189 // Simply multiplying by the reciprocal estimate can leave us a few ulps 7190 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 7191 // and that it will never cause us to return an answer too large). 7192 // float4 result = as_float4(as_int4(xf*recip) + 2); 7193 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 7194 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 7195 N1 = DAG.getConstant(2, dl, MVT::v4i32); 7196 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 7197 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 7198 // Convert back to integer and return. 7199 // return vmovn_u32(vcvt_s32_f32(result)); 7200 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7201 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7202 return N0; 7203 } 7204 7205 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 7206 EVT VT = Op.getNode()->getValueType(0); 7207 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 7208 7209 unsigned Opc; 7210 bool ExtraOp = false; 7211 switch (Op.getOpcode()) { 7212 default: llvm_unreachable("Invalid code"); 7213 case ISD::ADDC: Opc = ARMISD::ADDC; break; 7214 case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; 7215 case ISD::SUBC: Opc = ARMISD::SUBC; break; 7216 case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; 7217 } 7218 7219 if (!ExtraOp) 7220 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 7221 Op.getOperand(1)); 7222 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 7223 Op.getOperand(1), Op.getOperand(2)); 7224 } 7225 7226 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 7227 assert(Subtarget->isTargetDarwin()); 7228 7229 // For iOS, we want to call an alternative entry point: __sincos_stret, 7230 // return values are passed via sret. 7231 SDLoc dl(Op); 7232 SDValue Arg = Op.getOperand(0); 7233 EVT ArgVT = Arg.getValueType(); 7234 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7235 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7236 7237 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7238 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 7239 7240 // Pair of floats / doubles used to pass the result. 7241 Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 7242 auto &DL = DAG.getDataLayout(); 7243 7244 ArgListTy Args; 7245 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 7246 SDValue SRet; 7247 if (ShouldUseSRet) { 7248 // Create stack object for sret. 
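    // Sketch of the effective call in the sret case (names illustrative):
    //   struct { float Sin, Cos; } Tmp;   // the stack object created below
    //   __sincosf_stret(&Tmp, X);         // sret pointer passed first
    //   Sin = Tmp.Sin; Cos = Tmp.Cos;     // the two loads emitted further on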
7249 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 7250 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 7251 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 7252 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 7253 7254 ArgListEntry Entry; 7255 Entry.Node = SRet; 7256 Entry.Ty = RetTy->getPointerTo(); 7257 Entry.isSExt = false; 7258 Entry.isZExt = false; 7259 Entry.isSRet = true; 7260 Args.push_back(Entry); 7261 RetTy = Type::getVoidTy(*DAG.getContext()); 7262 } 7263 7264 ArgListEntry Entry; 7265 Entry.Node = Arg; 7266 Entry.Ty = ArgTy; 7267 Entry.isSExt = false; 7268 Entry.isZExt = false; 7269 Args.push_back(Entry); 7270 7271 const char *LibcallName = 7272 (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; 7273 RTLIB::Libcall LC = 7274 (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; 7275 CallingConv::ID CC = getLibcallCallingConv(LC); 7276 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 7277 7278 TargetLowering::CallLoweringInfo CLI(DAG); 7279 CLI.setDebugLoc(dl) 7280 .setChain(DAG.getEntryNode()) 7281 .setCallee(CC, RetTy, Callee, std::move(Args)) 7282 .setDiscardResult(ShouldUseSRet); 7283 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 7284 7285 if (!ShouldUseSRet) 7286 return CallResult.first; 7287 7288 SDValue LoadSin = 7289 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 7290 7291 // Address of cos field. 7292 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 7293 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 7294 SDValue LoadCos = 7295 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 7296 7297 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 7298 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 7299 LoadSin.getValue(0), LoadCos.getValue(0)); 7300 } 7301 7302 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 7303 bool Signed, 7304 SDValue &Chain) const { 7305 EVT VT = Op.getValueType(); 7306 assert((VT == MVT::i32 || VT == MVT::i64) && 7307 "unexpected type for custom lowering DIV"); 7308 SDLoc dl(Op); 7309 7310 const auto &DL = DAG.getDataLayout(); 7311 const auto &TLI = DAG.getTargetLoweringInfo(); 7312 7313 const char *Name = nullptr; 7314 if (Signed) 7315 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 7316 else 7317 Name = (VT == MVT::i32) ? 
"__rt_udiv" : "__rt_udiv64"; 7318 7319 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 7320 7321 ARMTargetLowering::ArgListTy Args; 7322 7323 for (auto AI : {1, 0}) { 7324 ArgListEntry Arg; 7325 Arg.Node = Op.getOperand(AI); 7326 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 7327 Args.push_back(Arg); 7328 } 7329 7330 CallLoweringInfo CLI(DAG); 7331 CLI.setDebugLoc(dl) 7332 .setChain(Chain) 7333 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 7334 ES, std::move(Args)); 7335 7336 return LowerCallTo(CLI).first; 7337 } 7338 7339 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 7340 bool Signed) const { 7341 assert(Op.getValueType() == MVT::i32 && 7342 "unexpected type for custom lowering DIV"); 7343 SDLoc dl(Op); 7344 7345 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 7346 DAG.getEntryNode(), Op.getOperand(1)); 7347 7348 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7349 } 7350 7351 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 7352 SDLoc DL(N); 7353 SDValue Op = N->getOperand(1); 7354 if (N->getValueType(0) == MVT::i32) 7355 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 7356 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7357 DAG.getConstant(0, DL, MVT::i32)); 7358 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7359 DAG.getConstant(1, DL, MVT::i32)); 7360 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 7361 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 7362 } 7363 7364 void ARMTargetLowering::ExpandDIV_Windows( 7365 SDValue Op, SelectionDAG &DAG, bool Signed, 7366 SmallVectorImpl<SDValue> &Results) const { 7367 const auto &DL = DAG.getDataLayout(); 7368 const auto &TLI = DAG.getTargetLoweringInfo(); 7369 7370 assert(Op.getValueType() == MVT::i64 && 7371 "unexpected type for custom lowering DIV"); 7372 SDLoc dl(Op); 7373 7374 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 7375 7376 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7377 7378 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 7379 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 7380 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 7381 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 7382 7383 Results.push_back(Lower); 7384 Results.push_back(Upper); 7385 } 7386 7387 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 7388 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 7389 // Acquire/Release load/store is not legal for targets without a dmb or 7390 // equivalent available. 7391 return SDValue(); 7392 7393 // Monotonic load/store is legal for all targets. 
7394 return Op; 7395 } 7396 7397 static void ReplaceREADCYCLECOUNTER(SDNode *N, 7398 SmallVectorImpl<SDValue> &Results, 7399 SelectionDAG &DAG, 7400 const ARMSubtarget *Subtarget) { 7401 SDLoc DL(N); 7402 // Under Power Management extensions, the cycle-count is: 7403 // mrc p15, #0, <Rt>, c9, c13, #0 7404 SDValue Ops[] = { N->getOperand(0), // Chain 7405 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 7406 DAG.getConstant(15, DL, MVT::i32), 7407 DAG.getConstant(0, DL, MVT::i32), 7408 DAG.getConstant(9, DL, MVT::i32), 7409 DAG.getConstant(13, DL, MVT::i32), 7410 DAG.getConstant(0, DL, MVT::i32) 7411 }; 7412 7413 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 7414 DAG.getVTList(MVT::i32, MVT::Other), Ops); 7415 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 7416 DAG.getConstant(0, DL, MVT::i32))); 7417 Results.push_back(Cycles32.getValue(1)); 7418 } 7419 7420 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 7421 SDLoc dl(V.getNode()); 7422 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 7423 SDValue VHi = DAG.getAnyExtOrTrunc( 7424 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 7425 dl, MVT::i32); 7426 SDValue RegClass = 7427 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 7428 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 7429 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 7430 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 7431 return SDValue( 7432 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 7433 } 7434 7435 static void ReplaceCMP_SWAP_64Results(SDNode *N, 7436 SmallVectorImpl<SDValue> & Results, 7437 SelectionDAG &DAG) { 7438 assert(N->getValueType(0) == MVT::i64 && 7439 "AtomicCmpSwap on types less than 64 should be legal"); 7440 SDValue Ops[] = {N->getOperand(1), 7441 createGPRPairNode(DAG, N->getOperand(2)), 7442 createGPRPairNode(DAG, N->getOperand(3)), 7443 N->getOperand(0)}; 7444 SDNode *CmpSwap = DAG.getMachineNode( 7445 ARM::CMP_SWAP_64, SDLoc(N), 7446 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 7447 7448 MachineFunction &MF = DAG.getMachineFunction(); 7449 MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); 7450 MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); 7451 cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); 7452 7453 Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, 7454 SDValue(CmpSwap, 0))); 7455 Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, 7456 SDValue(CmpSwap, 0))); 7457 Results.push_back(SDValue(CmpSwap, 2)); 7458 } 7459 7460 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, 7461 SelectionDAG &DAG) { 7462 const auto &TLI = DAG.getTargetLoweringInfo(); 7463 7464 assert(Subtarget.getTargetTriple().isOSMSVCRT() && 7465 "Custom lowering is MSVCRT specific!"); 7466 7467 SDLoc dl(Op); 7468 SDValue Val = Op.getOperand(0); 7469 MVT Ty = Val->getSimpleValueType(0); 7470 SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); 7471 SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? 
"powf" : "pow", 7472 TLI.getPointerTy(DAG.getDataLayout())); 7473 7474 TargetLowering::ArgListTy Args; 7475 TargetLowering::ArgListEntry Entry; 7476 7477 Entry.Node = Val; 7478 Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); 7479 Entry.isZExt = true; 7480 Args.push_back(Entry); 7481 7482 Entry.Node = Exponent; 7483 Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); 7484 Entry.isZExt = true; 7485 Args.push_back(Entry); 7486 7487 Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); 7488 7489 // In the in-chain to the call is the entry node If we are emitting a 7490 // tailcall, the chain will be mutated if the node has a non-entry input 7491 // chain. 7492 SDValue InChain = DAG.getEntryNode(); 7493 SDValue TCChain = InChain; 7494 7495 const auto *F = DAG.getMachineFunction().getFunction(); 7496 bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && 7497 F->getReturnType() == LCRTy; 7498 if (IsTC) 7499 InChain = TCChain; 7500 7501 TargetLowering::CallLoweringInfo CLI(DAG); 7502 CLI.setDebugLoc(dl) 7503 .setChain(InChain) 7504 .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) 7505 .setTailCall(IsTC); 7506 std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI); 7507 7508 // Return the chain (the DAG root) if it is a tail call 7509 return !CI.second.getNode() ? DAG.getRoot() : CI.first; 7510 } 7511 7512 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7513 switch (Op.getOpcode()) { 7514 default: llvm_unreachable("Don't know how to custom lower this!"); 7515 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 7516 case ISD::ConstantPool: 7517 if (Subtarget->genExecuteOnly()) 7518 llvm_unreachable("execute-only should not generate constant pools"); 7519 return LowerConstantPool(Op, DAG); 7520 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7521 case ISD::GlobalAddress: 7522 switch (Subtarget->getTargetTriple().getObjectFormat()) { 7523 default: llvm_unreachable("unknown object format"); 7524 case Triple::COFF: 7525 return LowerGlobalAddressWindows(Op, DAG); 7526 case Triple::ELF: 7527 return LowerGlobalAddressELF(Op, DAG); 7528 case Triple::MachO: 7529 return LowerGlobalAddressDarwin(Op, DAG); 7530 } 7531 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7532 case ISD::SELECT: return LowerSELECT(Op, DAG); 7533 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 7534 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 7535 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 7536 case ISD::VASTART: return LowerVASTART(Op, DAG); 7537 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 7538 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 7539 case ISD::SINT_TO_FP: 7540 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 7541 case ISD::FP_TO_SINT: 7542 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 7543 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7544 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7545 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7546 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 7547 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 7548 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 7549 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 7550 Subtarget); 7551 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); 7552 case ISD::SHL: 7553 case ISD::SRL: 7554 case ISD::SRA: return 
LowerShift(Op.getNode(), DAG, Subtarget); 7555 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 7556 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 7557 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 7558 case ISD::SRL_PARTS: 7559 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 7560 case ISD::CTTZ: 7561 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 7562 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 7563 case ISD::SETCC: return LowerVSETCC(Op, DAG); 7564 case ISD::SETCCE: return LowerSETCCE(Op, DAG); 7565 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 7566 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 7567 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7568 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7569 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7570 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7571 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7572 case ISD::MUL: return LowerMUL(Op, DAG); 7573 case ISD::SDIV: 7574 if (Subtarget->isTargetWindows()) 7575 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 7576 return LowerSDIV(Op, DAG); 7577 case ISD::UDIV: 7578 if (Subtarget->isTargetWindows()) 7579 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 7580 return LowerUDIV(Op, DAG); 7581 case ISD::ADDC: 7582 case ISD::ADDE: 7583 case ISD::SUBC: 7584 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 7585 case ISD::SADDO: 7586 case ISD::UADDO: 7587 case ISD::SSUBO: 7588 case ISD::USUBO: 7589 return LowerXALUO(Op, DAG); 7590 case ISD::ATOMIC_LOAD: 7591 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 7592 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 7593 case ISD::SDIVREM: 7594 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 7595 case ISD::DYNAMIC_STACKALLOC: 7596 if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) 7597 return LowerDYNAMIC_STACKALLOC(Op, DAG); 7598 llvm_unreachable("Don't know how to custom lower this!"); 7599 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 7600 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 7601 case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); 7602 case ARMISD::WIN__DBZCHK: return SDValue(); 7603 } 7604 } 7605 7606 /// ReplaceNodeResults - Replace the results of node with an illegal result 7607 /// type with new values built out of custom code. 
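/// For example, i64 SDIV/UDIV on Windows is handled below by calling
/// __rt_sdiv64/__rt_udiv64 and splitting the i64 result back into the two
/// i32 values the legalizer expects.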
7608 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 7609 SmallVectorImpl<SDValue> &Results, 7610 SelectionDAG &DAG) const { 7611 SDValue Res; 7612 switch (N->getOpcode()) { 7613 default: 7614 llvm_unreachable("Don't know how to custom expand this!"); 7615 case ISD::READ_REGISTER: 7616 ExpandREAD_REGISTER(N, Results, DAG); 7617 break; 7618 case ISD::BITCAST: 7619 Res = ExpandBITCAST(N, DAG); 7620 break; 7621 case ISD::SRL: 7622 case ISD::SRA: 7623 Res = Expand64BitShift(N, DAG, Subtarget); 7624 break; 7625 case ISD::SREM: 7626 case ISD::UREM: 7627 Res = LowerREM(N, DAG); 7628 break; 7629 case ISD::SDIVREM: 7630 case ISD::UDIVREM: 7631 Res = LowerDivRem(SDValue(N, 0), DAG); 7632 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 7633 Results.push_back(Res.getValue(0)); 7634 Results.push_back(Res.getValue(1)); 7635 return; 7636 case ISD::READCYCLECOUNTER: 7637 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 7638 return; 7639 case ISD::UDIV: 7640 case ISD::SDIV: 7641 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 7642 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 7643 Results); 7644 case ISD::ATOMIC_CMP_SWAP: 7645 ReplaceCMP_SWAP_64Results(N, Results, DAG); 7646 return; 7647 } 7648 if (Res.getNode()) 7649 Results.push_back(Res); 7650 } 7651 7652 //===----------------------------------------------------------------------===// 7653 // ARM Scheduler Hooks 7654 //===----------------------------------------------------------------------===// 7655 7656 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 7657 /// registers the function context. 7658 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 7659 MachineBasicBlock *MBB, 7660 MachineBasicBlock *DispatchBB, 7661 int FI) const { 7662 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 7663 "ROPI/RWPI not currently supported with SjLj"); 7664 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7665 DebugLoc dl = MI.getDebugLoc(); 7666 MachineFunction *MF = MBB->getParent(); 7667 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7668 MachineConstantPool *MCP = MF->getConstantPool(); 7669 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 7670 const Function *F = MF->getFunction(); 7671 7672 bool isThumb = Subtarget->isThumb(); 7673 bool isThumb2 = Subtarget->isThumb2(); 7674 7675 unsigned PCLabelId = AFI->createPICLabelUId(); 7676 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 7677 ARMConstantPoolValue *CPV = 7678 ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); 7679 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 7680 7681 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 7682 : &ARM::GPRRegClass; 7683 7684 // Grab constant pool and fixed stack memory operands. 7685 MachineMemOperand *CPMMO = 7686 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 7687 MachineMemOperand::MOLoad, 4, 4); 7688 7689 MachineMemOperand *FIMMOSt = 7690 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 7691 MachineMemOperand::MOStore, 4, 4); 7692 7693 // Load the address of the dispatch MBB into the jump buffer. 
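  // In each of the three variants below the address ends up in the function
  // context slot at byte offset 36 (the "&jbuf[1] :: pc" field), and in the
  // Thumb variants bit 0 of the address is set first so the later indirect
  // branch resumes in Thumb state.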
7694 if (isThumb2) { 7695 // Incoming value: jbuf 7696 // ldr.n r5, LCPI1_1 7697 // orr r5, r5, #1 7698 // add r5, pc 7699 // str r5, [$jbuf, #+4] ; &jbuf[1] 7700 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7701 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 7702 .addConstantPoolIndex(CPI) 7703 .addMemOperand(CPMMO) 7704 .add(predOps(ARMCC::AL)); 7705 // Set the low bit because of thumb mode. 7706 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7707 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 7708 .addReg(NewVReg1, RegState::Kill) 7709 .addImm(0x01) 7710 .add(predOps(ARMCC::AL)) 7711 .add(condCodeOp()); 7712 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7713 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 7714 .addReg(NewVReg2, RegState::Kill) 7715 .addImm(PCLabelId); 7716 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 7717 .addReg(NewVReg3, RegState::Kill) 7718 .addFrameIndex(FI) 7719 .addImm(36) // &jbuf[1] :: pc 7720 .addMemOperand(FIMMOSt) 7721 .add(predOps(ARMCC::AL)); 7722 } else if (isThumb) { 7723 // Incoming value: jbuf 7724 // ldr.n r1, LCPI1_4 7725 // add r1, pc 7726 // mov r2, #1 7727 // orrs r1, r2 7728 // add r2, $jbuf, #+4 ; &jbuf[1] 7729 // str r1, [r2] 7730 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7731 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 7732 .addConstantPoolIndex(CPI) 7733 .addMemOperand(CPMMO) 7734 .add(predOps(ARMCC::AL)); 7735 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7736 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 7737 .addReg(NewVReg1, RegState::Kill) 7738 .addImm(PCLabelId); 7739 // Set the low bit because of thumb mode. 7740 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7741 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 7742 .addReg(ARM::CPSR, RegState::Define) 7743 .addImm(1) 7744 .add(predOps(ARMCC::AL)); 7745 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7746 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 7747 .addReg(ARM::CPSR, RegState::Define) 7748 .addReg(NewVReg2, RegState::Kill) 7749 .addReg(NewVReg3, RegState::Kill) 7750 .add(predOps(ARMCC::AL)); 7751 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 7752 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 7753 .addFrameIndex(FI) 7754 .addImm(36); // &jbuf[1] :: pc 7755 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 7756 .addReg(NewVReg4, RegState::Kill) 7757 .addReg(NewVReg5, RegState::Kill) 7758 .addImm(0) 7759 .addMemOperand(FIMMOSt) 7760 .add(predOps(ARMCC::AL)); 7761 } else { 7762 // Incoming value: jbuf 7763 // ldr r1, LCPI1_1 7764 // add r1, pc, r1 7765 // str r1, [$jbuf, #+4] ; &jbuf[1] 7766 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7767 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 7768 .addConstantPoolIndex(CPI) 7769 .addImm(0) 7770 .addMemOperand(CPMMO) 7771 .add(predOps(ARMCC::AL)); 7772 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7773 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 7774 .addReg(NewVReg1, RegState::Kill) 7775 .addImm(PCLabelId) 7776 .add(predOps(ARMCC::AL)); 7777 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 7778 .addReg(NewVReg2, RegState::Kill) 7779 .addFrameIndex(FI) 7780 .addImm(36) // &jbuf[1] :: pc 7781 .addMemOperand(FIMMOSt) 7782 .add(predOps(ARMCC::AL)); 7783 } 7784 } 7785 7786 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 7787 MachineBasicBlock *MBB) const { 7788 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 7789 DebugLoc dl = MI.getDebugLoc(); 7790 MachineFunction 
*MF = MBB->getParent(); 7791 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7792 MachineFrameInfo &MFI = MF->getFrameInfo(); 7793 int FI = MFI.getFunctionContextIndex(); 7794 7795 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 7796 : &ARM::GPRnopcRegClass; 7797 7798 // Get a mapping of the call site numbers to all of the landing pads they're 7799 // associated with. 7800 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; 7801 unsigned MaxCSNum = 0; 7802 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 7803 ++BB) { 7804 if (!BB->isEHPad()) continue; 7805 7806 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 7807 // pad. 7808 for (MachineBasicBlock::iterator 7809 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 7810 if (!II->isEHLabel()) continue; 7811 7812 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 7813 if (!MF->hasCallSiteLandingPad(Sym)) continue; 7814 7815 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 7816 for (SmallVectorImpl<unsigned>::iterator 7817 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 7818 CSI != CSE; ++CSI) { 7819 CallSiteNumToLPad[*CSI].push_back(&*BB); 7820 MaxCSNum = std::max(MaxCSNum, *CSI); 7821 } 7822 break; 7823 } 7824 } 7825 7826 // Get an ordered list of the machine basic blocks for the jump table. 7827 std::vector<MachineBasicBlock*> LPadList; 7828 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 7829 LPadList.reserve(CallSiteNumToLPad.size()); 7830 for (unsigned I = 1; I <= MaxCSNum; ++I) { 7831 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 7832 for (SmallVectorImpl<MachineBasicBlock*>::iterator 7833 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 7834 LPadList.push_back(*II); 7835 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 7836 } 7837 } 7838 7839 assert(!LPadList.empty() && 7840 "No landing pad destinations for the dispatch jump table!"); 7841 7842 // Create the jump table and associated information. 7843 MachineJumpTableInfo *JTI = 7844 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 7845 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 7846 7847 // Create the MBBs for the dispatch code. 7848 7849 // Shove the dispatch's address into the return slot in the function context. 7850 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 7851 DispatchBB->setIsEHPad(); 7852 7853 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 7854 unsigned trap_opcode; 7855 if (Subtarget->isThumb()) 7856 trap_opcode = ARM::tTRAP; 7857 else 7858 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 7859 7860 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 7861 DispatchBB->addSuccessor(TrapBB); 7862 7863 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 7864 DispatchBB->addSuccessor(DispContBB); 7865 7866 // Insert and MBBs. 7867 MF->insert(MF->end(), DispatchBB); 7868 MF->insert(MF->end(), DispContBB); 7869 MF->insert(MF->end(), TrapBB); 7870 7871 // Insert code into the entry block that creates and registers the function 7872 // context. 
7873 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 7874 7875 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 7876 MachinePointerInfo::getFixedStack(*MF, FI), 7877 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 7878 7879 MachineInstrBuilder MIB; 7880 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 7881 7882 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 7883 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 7884 7885 // Add a register mask with no preserved registers. This results in all 7886 // registers being marked as clobbered. This can't work if the dispatch block 7887 // is in a Thumb1 function and is linked with ARM code which uses the FP 7888 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 7889 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 7890 7891 bool IsPositionIndependent = isPositionIndependent(); 7892 unsigned NumLPads = LPadList.size(); 7893 if (Subtarget->isThumb2()) { 7894 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7895 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 7896 .addFrameIndex(FI) 7897 .addImm(4) 7898 .addMemOperand(FIMMOLd) 7899 .add(predOps(ARMCC::AL)); 7900 7901 if (NumLPads < 256) { 7902 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 7903 .addReg(NewVReg1) 7904 .addImm(LPadList.size()) 7905 .add(predOps(ARMCC::AL)); 7906 } else { 7907 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7908 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 7909 .addImm(NumLPads & 0xFFFF) 7910 .add(predOps(ARMCC::AL)); 7911 7912 unsigned VReg2 = VReg1; 7913 if ((NumLPads & 0xFFFF0000) != 0) { 7914 VReg2 = MRI->createVirtualRegister(TRC); 7915 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 7916 .addReg(VReg1) 7917 .addImm(NumLPads >> 16) 7918 .add(predOps(ARMCC::AL)); 7919 } 7920 7921 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 7922 .addReg(NewVReg1) 7923 .addReg(VReg2) 7924 .add(predOps(ARMCC::AL)); 7925 } 7926 7927 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 7928 .addMBB(TrapBB) 7929 .addImm(ARMCC::HI) 7930 .addReg(ARM::CPSR); 7931 7932 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7933 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 7934 .addJumpTableIndex(MJTI) 7935 .add(predOps(ARMCC::AL)); 7936 7937 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 7938 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 7939 .addReg(NewVReg3, RegState::Kill) 7940 .addReg(NewVReg1) 7941 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 7942 .add(predOps(ARMCC::AL)) 7943 .add(condCodeOp()); 7944 7945 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 7946 .addReg(NewVReg4, RegState::Kill) 7947 .addReg(NewVReg1) 7948 .addJumpTableIndex(MJTI); 7949 } else if (Subtarget->isThumb()) { 7950 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 7951 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 7952 .addFrameIndex(FI) 7953 .addImm(1) 7954 .addMemOperand(FIMMOLd) 7955 .add(predOps(ARMCC::AL)); 7956 7957 if (NumLPads < 256) { 7958 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 7959 .addReg(NewVReg1) 7960 .addImm(NumLPads) 7961 .add(predOps(ARMCC::AL)); 7962 } else { 7963 MachineConstantPool *ConstantPool = MF->getConstantPool(); 7964 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 7965 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 7966 7967 // MachineConstantPool wants an explicit alignment. 
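      // If the preferred alignment is unknown (reported as 0), the code below
      // falls back to the alloc size of the i32 constant so the pool entry
      // stays naturally aligned.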
7968 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 7969 if (Align == 0) 7970 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 7971 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 7972 7973 unsigned VReg1 = MRI->createVirtualRegister(TRC); 7974 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 7975 .addReg(VReg1, RegState::Define) 7976 .addConstantPoolIndex(Idx) 7977 .add(predOps(ARMCC::AL)); 7978 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 7979 .addReg(NewVReg1) 7980 .addReg(VReg1) 7981 .add(predOps(ARMCC::AL)); 7982 } 7983 7984 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 7985 .addMBB(TrapBB) 7986 .addImm(ARMCC::HI) 7987 .addReg(ARM::CPSR); 7988 7989 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 7990 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 7991 .addReg(ARM::CPSR, RegState::Define) 7992 .addReg(NewVReg1) 7993 .addImm(2) 7994 .add(predOps(ARMCC::AL)); 7995 7996 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 7997 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 7998 .addJumpTableIndex(MJTI) 7999 .add(predOps(ARMCC::AL)); 8000 8001 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8002 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 8003 .addReg(ARM::CPSR, RegState::Define) 8004 .addReg(NewVReg2, RegState::Kill) 8005 .addReg(NewVReg3) 8006 .add(predOps(ARMCC::AL)); 8007 8008 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8009 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8010 8011 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8012 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 8013 .addReg(NewVReg4, RegState::Kill) 8014 .addImm(0) 8015 .addMemOperand(JTMMOLd) 8016 .add(predOps(ARMCC::AL)); 8017 8018 unsigned NewVReg6 = NewVReg5; 8019 if (IsPositionIndependent) { 8020 NewVReg6 = MRI->createVirtualRegister(TRC); 8021 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 8022 .addReg(ARM::CPSR, RegState::Define) 8023 .addReg(NewVReg5, RegState::Kill) 8024 .addReg(NewVReg3) 8025 .add(predOps(ARMCC::AL)); 8026 } 8027 8028 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 8029 .addReg(NewVReg6, RegState::Kill) 8030 .addJumpTableIndex(MJTI); 8031 } else { 8032 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8033 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 8034 .addFrameIndex(FI) 8035 .addImm(4) 8036 .addMemOperand(FIMMOLd) 8037 .add(predOps(ARMCC::AL)); 8038 8039 if (NumLPads < 256) { 8040 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 8041 .addReg(NewVReg1) 8042 .addImm(NumLPads) 8043 .add(predOps(ARMCC::AL)); 8044 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 8045 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8046 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 8047 .addImm(NumLPads & 0xFFFF) 8048 .add(predOps(ARMCC::AL)); 8049 8050 unsigned VReg2 = VReg1; 8051 if ((NumLPads & 0xFFFF0000) != 0) { 8052 VReg2 = MRI->createVirtualRegister(TRC); 8053 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 8054 .addReg(VReg1) 8055 .addImm(NumLPads >> 16) 8056 .add(predOps(ARMCC::AL)); 8057 } 8058 8059 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8060 .addReg(NewVReg1) 8061 .addReg(VReg2) 8062 .add(predOps(ARMCC::AL)); 8063 } else { 8064 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8065 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 8066 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 8067 8068 // MachineConstantPool wants an explicit alignment. 
8069 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8070 if (Align == 0) 8071 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8072 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8073 8074 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8075 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 8076 .addReg(VReg1, RegState::Define) 8077 .addConstantPoolIndex(Idx) 8078 .addImm(0) 8079 .add(predOps(ARMCC::AL)); 8080 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8081 .addReg(NewVReg1) 8082 .addReg(VReg1, RegState::Kill) 8083 .add(predOps(ARMCC::AL)); 8084 } 8085 8086 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 8087 .addMBB(TrapBB) 8088 .addImm(ARMCC::HI) 8089 .addReg(ARM::CPSR); 8090 8091 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8092 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 8093 .addReg(NewVReg1) 8094 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 8095 .add(predOps(ARMCC::AL)) 8096 .add(condCodeOp()); 8097 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8098 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 8099 .addJumpTableIndex(MJTI) 8100 .add(predOps(ARMCC::AL)); 8101 8102 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8103 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8104 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8105 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 8106 .addReg(NewVReg3, RegState::Kill) 8107 .addReg(NewVReg4) 8108 .addImm(0) 8109 .addMemOperand(JTMMOLd) 8110 .add(predOps(ARMCC::AL)); 8111 8112 if (IsPositionIndependent) { 8113 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 8114 .addReg(NewVReg5, RegState::Kill) 8115 .addReg(NewVReg4) 8116 .addJumpTableIndex(MJTI); 8117 } else { 8118 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 8119 .addReg(NewVReg5, RegState::Kill) 8120 .addJumpTableIndex(MJTI); 8121 } 8122 } 8123 8124 // Add the jump table entries as successors to the MBB. 8125 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 8126 for (std::vector<MachineBasicBlock*>::iterator 8127 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 8128 MachineBasicBlock *CurMBB = *I; 8129 if (SeenMBBs.insert(CurMBB).second) 8130 DispContBB->addSuccessor(CurMBB); 8131 } 8132 8133 // N.B. the order the invoke BBs are processed in doesn't matter here. 8134 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 8135 SmallVector<MachineBasicBlock*, 64> MBBLPads; 8136 for (MachineBasicBlock *BB : InvokeBBs) { 8137 8138 // Remove the landing pad successor from the invoke block and replace it 8139 // with the new dispatch block. 8140 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 8141 BB->succ_end()); 8142 while (!Successors.empty()) { 8143 MachineBasicBlock *SMBB = Successors.pop_back_val(); 8144 if (SMBB->isEHPad()) { 8145 BB->removeSuccessor(SMBB); 8146 MBBLPads.push_back(SMBB); 8147 } 8148 } 8149 8150 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 8151 BB->normalizeSuccProbs(); 8152 8153 // Find the invoke call and mark all of the callee-saved registers as 8154 // 'implicit defined' so that they're spilled. This prevents code from 8155 // moving instructions to before the EH block, where they will never be 8156 // executed. 
8157 for (MachineBasicBlock::reverse_iterator 8158 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 8159 if (!II->isCall()) continue; 8160 8161 DenseMap<unsigned, bool> DefRegs; 8162 for (MachineInstr::mop_iterator 8163 OI = II->operands_begin(), OE = II->operands_end(); 8164 OI != OE; ++OI) { 8165 if (!OI->isReg()) continue; 8166 DefRegs[OI->getReg()] = true; 8167 } 8168 8169 MachineInstrBuilder MIB(*MF, &*II); 8170 8171 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 8172 unsigned Reg = SavedRegs[i]; 8173 if (Subtarget->isThumb2() && 8174 !ARM::tGPRRegClass.contains(Reg) && 8175 !ARM::hGPRRegClass.contains(Reg)) 8176 continue; 8177 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 8178 continue; 8179 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 8180 continue; 8181 if (!DefRegs[Reg]) 8182 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 8183 } 8184 8185 break; 8186 } 8187 } 8188 8189 // Mark all former landing pads as non-landing pads. The dispatch is the only 8190 // landing pad now. 8191 for (SmallVectorImpl<MachineBasicBlock*>::iterator 8192 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 8193 (*I)->setIsEHPad(false); 8194 8195 // The instruction is gone now. 8196 MI.eraseFromParent(); 8197 } 8198 8199 static 8200 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 8201 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 8202 E = MBB->succ_end(); I != E; ++I) 8203 if (*I != Succ) 8204 return *I; 8205 llvm_unreachable("Expecting a BB with two successors!"); 8206 } 8207 8208 /// Return the load opcode for a given load size. If load size >= 8, 8209 /// neon opcode will be returned. 8210 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 8211 if (LdSize >= 8) 8212 return LdSize == 16 ? ARM::VLD1q32wb_fixed 8213 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 8214 if (IsThumb1) 8215 return LdSize == 4 ? ARM::tLDRi 8216 : LdSize == 2 ? ARM::tLDRHi 8217 : LdSize == 1 ? ARM::tLDRBi : 0; 8218 if (IsThumb2) 8219 return LdSize == 4 ? ARM::t2LDR_POST 8220 : LdSize == 2 ? ARM::t2LDRH_POST 8221 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 8222 return LdSize == 4 ? ARM::LDR_POST_IMM 8223 : LdSize == 2 ? ARM::LDRH_POST 8224 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 8225 } 8226 8227 /// Return the store opcode for a given store size. If store size >= 8, 8228 /// neon opcode will be returned. 8229 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 8230 if (StSize >= 8) 8231 return StSize == 16 ? ARM::VST1q32wb_fixed 8232 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 8233 if (IsThumb1) 8234 return StSize == 4 ? ARM::tSTRi 8235 : StSize == 2 ? ARM::tSTRHi 8236 : StSize == 1 ? ARM::tSTRBi : 0; 8237 if (IsThumb2) 8238 return StSize == 4 ? ARM::t2STR_POST 8239 : StSize == 2 ? ARM::t2STRH_POST 8240 : StSize == 1 ? ARM::t2STRB_POST : 0; 8241 return StSize == 4 ? ARM::STR_POST_IMM 8242 : StSize == 2 ? ARM::STRH_POST 8243 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 8244 } 8245 8246 /// Emit a post-increment load operation with given size. The instructions 8247 /// will be added to BB at Pos. 
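/// For example, LdSize == 4 in ARM mode selects LDR_POST_IMM, i.e. a
/// post-indexed "ldr Data, [AddrIn], #4" whose incremented address is
/// written back to AddrOut.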
8248 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8249 const TargetInstrInfo *TII, const DebugLoc &dl, 8250 unsigned LdSize, unsigned Data, unsigned AddrIn, 8251 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8252 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 8253 assert(LdOpc != 0 && "Should have a load opcode"); 8254 if (LdSize >= 8) { 8255 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8256 .addReg(AddrOut, RegState::Define) 8257 .addReg(AddrIn) 8258 .addImm(0) 8259 .add(predOps(ARMCC::AL)); 8260 } else if (IsThumb1) { 8261 // load + update AddrIn 8262 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8263 .addReg(AddrIn) 8264 .addImm(0) 8265 .add(predOps(ARMCC::AL)); 8266 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8267 .add(t1CondCodeOp()) 8268 .addReg(AddrIn) 8269 .addImm(LdSize) 8270 .add(predOps(ARMCC::AL)); 8271 } else if (IsThumb2) { 8272 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8273 .addReg(AddrOut, RegState::Define) 8274 .addReg(AddrIn) 8275 .addImm(LdSize) 8276 .add(predOps(ARMCC::AL)); 8277 } else { // arm 8278 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8279 .addReg(AddrOut, RegState::Define) 8280 .addReg(AddrIn) 8281 .addReg(0) 8282 .addImm(LdSize) 8283 .add(predOps(ARMCC::AL)); 8284 } 8285 } 8286 8287 /// Emit a post-increment store operation with given size. The instructions 8288 /// will be added to BB at Pos. 8289 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8290 const TargetInstrInfo *TII, const DebugLoc &dl, 8291 unsigned StSize, unsigned Data, unsigned AddrIn, 8292 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8293 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 8294 assert(StOpc != 0 && "Should have a store opcode"); 8295 if (StSize >= 8) { 8296 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8297 .addReg(AddrIn) 8298 .addImm(0) 8299 .addReg(Data) 8300 .add(predOps(ARMCC::AL)); 8301 } else if (IsThumb1) { 8302 // store + update AddrIn 8303 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 8304 .addReg(Data) 8305 .addReg(AddrIn) 8306 .addImm(0) 8307 .add(predOps(ARMCC::AL)); 8308 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8309 .add(t1CondCodeOp()) 8310 .addReg(AddrIn) 8311 .addImm(StSize) 8312 .add(predOps(ARMCC::AL)); 8313 } else if (IsThumb2) { 8314 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8315 .addReg(Data) 8316 .addReg(AddrIn) 8317 .addImm(StSize) 8318 .add(predOps(ARMCC::AL)); 8319 } else { // arm 8320 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8321 .addReg(Data) 8322 .addReg(AddrIn) 8323 .addReg(0) 8324 .addImm(StSize) 8325 .add(predOps(ARMCC::AL)); 8326 } 8327 } 8328 8329 MachineBasicBlock * 8330 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 8331 MachineBasicBlock *BB) const { 8332 // This pseudo instruction has 3 operands: dst, src, size 8333 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 8334 // Otherwise, we will generate unrolled scalar copies. 
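  // E.g. a 16-byte copy with only 4-byte alignment and a size below the
  // threshold is emitted as four LDR_POST/STR_POST pairs directly in BB,
  // while larger copies fall through to the loop expansion further down.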
8335 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8336 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8337 MachineFunction::iterator It = ++BB->getIterator(); 8338 8339 unsigned dest = MI.getOperand(0).getReg(); 8340 unsigned src = MI.getOperand(1).getReg(); 8341 unsigned SizeVal = MI.getOperand(2).getImm(); 8342 unsigned Align = MI.getOperand(3).getImm(); 8343 DebugLoc dl = MI.getDebugLoc(); 8344 8345 MachineFunction *MF = BB->getParent(); 8346 MachineRegisterInfo &MRI = MF->getRegInfo(); 8347 unsigned UnitSize = 0; 8348 const TargetRegisterClass *TRC = nullptr; 8349 const TargetRegisterClass *VecTRC = nullptr; 8350 8351 bool IsThumb1 = Subtarget->isThumb1Only(); 8352 bool IsThumb2 = Subtarget->isThumb2(); 8353 bool IsThumb = Subtarget->isThumb(); 8354 8355 if (Align & 1) { 8356 UnitSize = 1; 8357 } else if (Align & 2) { 8358 UnitSize = 2; 8359 } else { 8360 // Check whether we can use NEON instructions. 8361 if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && 8362 Subtarget->hasNEON()) { 8363 if ((Align % 16 == 0) && SizeVal >= 16) 8364 UnitSize = 16; 8365 else if ((Align % 8 == 0) && SizeVal >= 8) 8366 UnitSize = 8; 8367 } 8368 // Can't use NEON instructions. 8369 if (UnitSize == 0) 8370 UnitSize = 4; 8371 } 8372 8373 // Select the correct opcode and register class for unit size load/store 8374 bool IsNeon = UnitSize >= 8; 8375 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 8376 if (IsNeon) 8377 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 8378 : UnitSize == 8 ? &ARM::DPRRegClass 8379 : nullptr; 8380 8381 unsigned BytesLeft = SizeVal % UnitSize; 8382 unsigned LoopSize = SizeVal - BytesLeft; 8383 8384 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 8385 // Use LDR and STR to copy. 8386 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 8387 // [destOut] = STR_POST(scratch, destIn, UnitSize) 8388 unsigned srcIn = src; 8389 unsigned destIn = dest; 8390 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 8391 unsigned srcOut = MRI.createVirtualRegister(TRC); 8392 unsigned destOut = MRI.createVirtualRegister(TRC); 8393 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 8394 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 8395 IsThumb1, IsThumb2); 8396 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 8397 IsThumb1, IsThumb2); 8398 srcIn = srcOut; 8399 destIn = destOut; 8400 } 8401 8402 // Handle the leftover bytes with LDRB and STRB. 8403 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 8404 // [destOut] = STRB_POST(scratch, destIn, 1) 8405 for (unsigned i = 0; i < BytesLeft; i++) { 8406 unsigned srcOut = MRI.createVirtualRegister(TRC); 8407 unsigned destOut = MRI.createVirtualRegister(TRC); 8408 unsigned scratch = MRI.createVirtualRegister(TRC); 8409 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 8410 IsThumb1, IsThumb2); 8411 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 8412 IsThumb1, IsThumb2); 8413 srcIn = srcOut; 8414 destIn = destOut; 8415 } 8416 MI.eraseFromParent(); // The instruction is gone now. 8417 return BB; 8418 } 8419 8420 // Expand the pseudo op to a loop. 8421 // thisMBB: 8422 // ... 
8423 // movw varEnd, # --> with thumb2 8424 // movt varEnd, # 8425 // ldrcp varEnd, idx --> without thumb2 8426 // fallthrough --> loopMBB 8427 // loopMBB: 8428 // PHI varPhi, varEnd, varLoop 8429 // PHI srcPhi, src, srcLoop 8430 // PHI destPhi, dst, destLoop 8431 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 8432 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 8433 // subs varLoop, varPhi, #UnitSize 8434 // bne loopMBB 8435 // fallthrough --> exitMBB 8436 // exitMBB: 8437 // epilogue to handle left-over bytes 8438 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 8439 // [destOut] = STRB_POST(scratch, destLoop, 1) 8440 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8441 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8442 MF->insert(It, loopMBB); 8443 MF->insert(It, exitMBB); 8444 8445 // Transfer the remainder of BB and its successor edges to exitMBB. 8446 exitMBB->splice(exitMBB->begin(), BB, 8447 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8448 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 8449 8450 // Load an immediate to varEnd. 8451 unsigned varEnd = MRI.createVirtualRegister(TRC); 8452 if (Subtarget->useMovt(*MF)) { 8453 unsigned Vtmp = varEnd; 8454 if ((LoopSize & 0xFFFF0000) != 0) 8455 Vtmp = MRI.createVirtualRegister(TRC); 8456 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 8457 .addImm(LoopSize & 0xFFFF) 8458 .add(predOps(ARMCC::AL)); 8459 8460 if ((LoopSize & 0xFFFF0000) != 0) 8461 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 8462 .addReg(Vtmp) 8463 .addImm(LoopSize >> 16) 8464 .add(predOps(ARMCC::AL)); 8465 } else { 8466 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8467 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 8468 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 8469 8470 // MachineConstantPool wants an explicit alignment. 
8471 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8472 if (Align == 0) 8473 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8474 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8475 8476 if (IsThumb) 8477 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 8478 .addReg(varEnd, RegState::Define) 8479 .addConstantPoolIndex(Idx) 8480 .add(predOps(ARMCC::AL)); 8481 else 8482 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 8483 .addReg(varEnd, RegState::Define) 8484 .addConstantPoolIndex(Idx) 8485 .addImm(0) 8486 .add(predOps(ARMCC::AL)); 8487 } 8488 BB->addSuccessor(loopMBB); 8489 8490 // Generate the loop body: 8491 // varPhi = PHI(varLoop, varEnd) 8492 // srcPhi = PHI(srcLoop, src) 8493 // destPhi = PHI(destLoop, dst) 8494 MachineBasicBlock *entryBB = BB; 8495 BB = loopMBB; 8496 unsigned varLoop = MRI.createVirtualRegister(TRC); 8497 unsigned varPhi = MRI.createVirtualRegister(TRC); 8498 unsigned srcLoop = MRI.createVirtualRegister(TRC); 8499 unsigned srcPhi = MRI.createVirtualRegister(TRC); 8500 unsigned destLoop = MRI.createVirtualRegister(TRC); 8501 unsigned destPhi = MRI.createVirtualRegister(TRC); 8502 8503 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 8504 .addReg(varLoop).addMBB(loopMBB) 8505 .addReg(varEnd).addMBB(entryBB); 8506 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 8507 .addReg(srcLoop).addMBB(loopMBB) 8508 .addReg(src).addMBB(entryBB); 8509 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 8510 .addReg(destLoop).addMBB(loopMBB) 8511 .addReg(dest).addMBB(entryBB); 8512 8513 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 8514 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 8515 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 8516 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 8517 IsThumb1, IsThumb2); 8518 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 8519 IsThumb1, IsThumb2); 8520 8521 // Decrement loop variable by UnitSize. 8522 if (IsThumb1) { 8523 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 8524 .add(t1CondCodeOp()) 8525 .addReg(varPhi) 8526 .addImm(UnitSize) 8527 .add(predOps(ARMCC::AL)); 8528 } else { 8529 MachineInstrBuilder MIB = 8530 BuildMI(*BB, BB->end(), dl, 8531 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 8532 MIB.addReg(varPhi) 8533 .addImm(UnitSize) 8534 .add(predOps(ARMCC::AL)) 8535 .add(condCodeOp()); 8536 MIB->getOperand(5).setReg(ARM::CPSR); 8537 MIB->getOperand(5).setIsDef(true); 8538 } 8539 BuildMI(*BB, BB->end(), dl, 8540 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 8541 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 8542 8543 // loopMBB can loop back to loopMBB or fall through to exitMBB. 8544 BB->addSuccessor(loopMBB); 8545 BB->addSuccessor(exitMBB); 8546 8547 // Add epilogue to handle BytesLeft. 
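  // (BytesLeft == SizeVal % UnitSize; e.g. an 18-byte copy with UnitSize 8
  // leaves two trailing bytes, copied by the byte-sized loads/stores below.)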
8548   BB = exitMBB;
8549   auto StartOfExit = exitMBB->begin();
8550 
8551   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8552   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8553   unsigned srcIn = srcLoop;
8554   unsigned destIn = destLoop;
8555   for (unsigned i = 0; i < BytesLeft; i++) {
8556     unsigned srcOut = MRI.createVirtualRegister(TRC);
8557     unsigned destOut = MRI.createVirtualRegister(TRC);
8558     unsigned scratch = MRI.createVirtualRegister(TRC);
8559     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
8560                IsThumb1, IsThumb2);
8561     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
8562                IsThumb1, IsThumb2);
8563     srcIn = srcOut;
8564     destIn = destOut;
8565   }
8566 
8567   MI.eraseFromParent(); // The instruction is gone now.
8568   return BB;
8569 }
8570 
8571 MachineBasicBlock *
8572 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
8573                                        MachineBasicBlock *MBB) const {
8574   const TargetMachine &TM = getTargetMachine();
8575   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
8576   DebugLoc DL = MI.getDebugLoc();
8577 
8578   assert(Subtarget->isTargetWindows() &&
8579          "__chkstk is only supported on Windows");
8580   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
8581 
8582   // __chkstk takes the number of words to allocate on the stack in R4, and
8583   // returns the stack adjustment in number of bytes in R4. This will not
8584   // clobber any other registers (other than the obvious lr).
8585   //
8586   // Although, technically, IP should be considered a register which may be
8587   // clobbered, the call itself will not touch it. Windows on ARM is a pure
8588   // Thumb-2 environment, so there is no interworking required. As a result, we
8589   // do not expect a veneer to be emitted by the linker, clobbering IP.
8590   //
8591   // Each module receives its own copy of __chkstk, so no import thunk is
8592   // required, again, ensuring that IP is not clobbered.
8593   //
8594   // Finally, although some linkers may theoretically provide a trampoline for
8595   // out-of-range calls (which is quite common due to the 32M range limitation of
8596   // branches for Thumb), we can generate the long-call version via
8597   // -mcmodel=large, alleviating the need for the trampoline which may clobber
8598   // IP.
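  //
  // Example of the convention (numbers chosen for illustration only): for a
  // 4096-byte allocation the word count, 1024, is expected in R4 on entry;
  // __chkstk hands back 4096 in R4, and the SUB from SP emitted after the
  // call applies that byte adjustment.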
8599 8600 switch (TM.getCodeModel()) { 8601 case CodeModel::Small: 8602 case CodeModel::Medium: 8603 case CodeModel::Default: 8604 case CodeModel::Kernel: 8605 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 8606 .add(predOps(ARMCC::AL)) 8607 .addExternalSymbol("__chkstk") 8608 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 8609 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 8610 .addReg(ARM::R12, 8611 RegState::Implicit | RegState::Define | RegState::Dead); 8612 break; 8613 case CodeModel::Large: 8614 case CodeModel::JITDefault: { 8615 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 8616 unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 8617 8618 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 8619 .addExternalSymbol("__chkstk"); 8620 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 8621 .add(predOps(ARMCC::AL)) 8622 .addReg(Reg, RegState::Kill) 8623 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 8624 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 8625 .addReg(ARM::R12, 8626 RegState::Implicit | RegState::Define | RegState::Dead); 8627 break; 8628 } 8629 } 8630 8631 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 8632 .addReg(ARM::SP, RegState::Kill) 8633 .addReg(ARM::R4, RegState::Kill) 8634 .setMIFlags(MachineInstr::FrameSetup) 8635 .add(predOps(ARMCC::AL)) 8636 .add(condCodeOp()); 8637 8638 MI.eraseFromParent(); 8639 return MBB; 8640 } 8641 8642 MachineBasicBlock * 8643 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 8644 MachineBasicBlock *MBB) const { 8645 DebugLoc DL = MI.getDebugLoc(); 8646 MachineFunction *MF = MBB->getParent(); 8647 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8648 8649 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 8650 MF->insert(++MBB->getIterator(), ContBB); 8651 ContBB->splice(ContBB->begin(), MBB, 8652 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 8653 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 8654 MBB->addSuccessor(ContBB); 8655 8656 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 8657 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 8658 MF->push_back(TrapBB); 8659 MBB->addSuccessor(TrapBB); 8660 8661 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 8662 .addReg(MI.getOperand(0).getReg()) 8663 .addImm(0) 8664 .add(predOps(ARMCC::AL)); 8665 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 8666 .addMBB(TrapBB) 8667 .addImm(ARMCC::EQ) 8668 .addReg(ARM::CPSR); 8669 8670 MI.eraseFromParent(); 8671 return ContBB; 8672 } 8673 8674 MachineBasicBlock * 8675 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 8676 MachineBasicBlock *BB) const { 8677 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8678 DebugLoc dl = MI.getDebugLoc(); 8679 bool isThumb2 = Subtarget->isThumb2(); 8680 switch (MI.getOpcode()) { 8681 default: { 8682 MI.dump(); 8683 llvm_unreachable("Unexpected instr type to insert"); 8684 } 8685 8686 // Thumb1 post-indexed loads are really just single-register LDMs. 8687 case ARM::tLDR_postidx: { 8688 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 8689 .add(MI.getOperand(1)) // Rn_wb 8690 .add(MI.getOperand(2)) // Rn 8691 .add(MI.getOperand(3)) // PredImm 8692 .add(MI.getOperand(4)) // PredReg 8693 .add(MI.getOperand(0)); // Rt 8694 MI.eraseFromParent(); 8695 return BB; 8696 } 8697 8698 // The Thumb2 pre-indexed stores have the same MI operands, they just 8699 // define them differently in the .td files from the isel patterns, so 8700 // they need pseudos. 
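  // Each of these cases simply swaps in the real opcode's MCInstrDesc (e.g.
  // t2STR_preidx becomes t2STR_PRE) and reuses the instruction in place; no
  // operands need to be rewritten.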
8701 case ARM::t2STR_preidx: 8702 MI.setDesc(TII->get(ARM::t2STR_PRE)); 8703 return BB; 8704 case ARM::t2STRB_preidx: 8705 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 8706 return BB; 8707 case ARM::t2STRH_preidx: 8708 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 8709 return BB; 8710 8711 case ARM::STRi_preidx: 8712 case ARM::STRBi_preidx: { 8713 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 8714 : ARM::STRB_PRE_IMM; 8715 // Decode the offset. 8716 unsigned Offset = MI.getOperand(4).getImm(); 8717 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 8718 Offset = ARM_AM::getAM2Offset(Offset); 8719 if (isSub) 8720 Offset = -Offset; 8721 8722 MachineMemOperand *MMO = *MI.memoperands_begin(); 8723 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 8724 .add(MI.getOperand(0)) // Rn_wb 8725 .add(MI.getOperand(1)) // Rt 8726 .add(MI.getOperand(2)) // Rn 8727 .addImm(Offset) // offset (skip GPR==zero_reg) 8728 .add(MI.getOperand(5)) // pred 8729 .add(MI.getOperand(6)) 8730 .addMemOperand(MMO); 8731 MI.eraseFromParent(); 8732 return BB; 8733 } 8734 case ARM::STRr_preidx: 8735 case ARM::STRBr_preidx: 8736 case ARM::STRH_preidx: { 8737 unsigned NewOpc; 8738 switch (MI.getOpcode()) { 8739 default: llvm_unreachable("unexpected opcode!"); 8740 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 8741 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 8742 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 8743 } 8744 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 8745 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 8746 MIB.add(MI.getOperand(i)); 8747 MI.eraseFromParent(); 8748 return BB; 8749 } 8750 8751 case ARM::tMOVCCr_pseudo: { 8752 // To "insert" a SELECT_CC instruction, we actually have to insert the 8753 // diamond control-flow pattern. The incoming instruction knows the 8754 // destination vreg to set, the condition code register to branch on, the 8755 // true/false values to select between, and a branch opcode to use. 8756 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8757 MachineFunction::iterator It = ++BB->getIterator(); 8758 8759 // thisMBB: 8760 // ... 8761 // TrueVal = ... 8762 // cmpTY ccX, r1, r2 8763 // bCC copy1MBB 8764 // fallthrough --> copy0MBB 8765 MachineBasicBlock *thisMBB = BB; 8766 MachineFunction *F = BB->getParent(); 8767 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8768 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8769 F->insert(It, copy0MBB); 8770 F->insert(It, sinkMBB); 8771 8772 // Transfer the remainder of BB and its successor edges to sinkMBB. 8773 sinkMBB->splice(sinkMBB->begin(), BB, 8774 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8775 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8776 8777 BB->addSuccessor(copy0MBB); 8778 BB->addSuccessor(sinkMBB); 8779 8780 BuildMI(BB, dl, TII->get(ARM::tBcc)) 8781 .addMBB(sinkMBB) 8782 .addImm(MI.getOperand(3).getImm()) 8783 .addReg(MI.getOperand(4).getReg()); 8784 8785 // copy0MBB: 8786 // %FalseValue = ... 8787 // # fallthrough to sinkMBB 8788 BB = copy0MBB; 8789 8790 // Update machine-CFG edges 8791 BB->addSuccessor(sinkMBB); 8792 8793 // sinkMBB: 8794 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8795 // ... 8796 BB = sinkMBB; 8797 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 8798 .addReg(MI.getOperand(1).getReg()) 8799 .addMBB(copy0MBB) 8800 .addReg(MI.getOperand(2).getReg()) 8801 .addMBB(thisMBB); 8802 8803 MI.eraseFromParent(); // The pseudo instruction is gone now. 
8804 return BB; 8805 } 8806 8807 case ARM::BCCi64: 8808 case ARM::BCCZi64: { 8809 // If there is an unconditional branch to the other successor, remove it. 8810 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8811 8812 // Compare both parts that make up the double comparison separately for 8813 // equality. 8814 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 8815 8816 unsigned LHS1 = MI.getOperand(1).getReg(); 8817 unsigned LHS2 = MI.getOperand(2).getReg(); 8818 if (RHSisZero) { 8819 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8820 .addReg(LHS1) 8821 .addImm(0) 8822 .add(predOps(ARMCC::AL)); 8823 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8824 .addReg(LHS2).addImm(0) 8825 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 8826 } else { 8827 unsigned RHS1 = MI.getOperand(3).getReg(); 8828 unsigned RHS2 = MI.getOperand(4).getReg(); 8829 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 8830 .addReg(LHS1) 8831 .addReg(RHS1) 8832 .add(predOps(ARMCC::AL)); 8833 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 8834 .addReg(LHS2).addReg(RHS2) 8835 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 8836 } 8837 8838 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 8839 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 8840 if (MI.getOperand(0).getImm() == ARMCC::NE) 8841 std::swap(destMBB, exitMBB); 8842 8843 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 8844 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 8845 if (isThumb2) 8846 BuildMI(BB, dl, TII->get(ARM::t2B)) 8847 .addMBB(exitMBB) 8848 .add(predOps(ARMCC::AL)); 8849 else 8850 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 8851 8852 MI.eraseFromParent(); // The pseudo instruction is gone now. 8853 return BB; 8854 } 8855 8856 case ARM::Int_eh_sjlj_setjmp: 8857 case ARM::Int_eh_sjlj_setjmp_nofp: 8858 case ARM::tInt_eh_sjlj_setjmp: 8859 case ARM::t2Int_eh_sjlj_setjmp: 8860 case ARM::t2Int_eh_sjlj_setjmp_nofp: 8861 return BB; 8862 8863 case ARM::Int_eh_sjlj_setup_dispatch: 8864 EmitSjLjDispatchBlock(MI, BB); 8865 return BB; 8866 8867 case ARM::ABS: 8868 case ARM::t2ABS: { 8869 // To insert an ABS instruction, we have to insert the 8870 // diamond control-flow pattern. The incoming instruction knows the 8871 // source vreg to test against 0, the destination vreg to set, 8872 // the condition code register to branch on, the 8873 // true/false values to select between, and a branch opcode to use. 8874 // It transforms 8875 // V1 = ABS V0 8876 // into 8877 // V2 = MOVS V0 8878 // BCC (branch to SinkBB if V0 >= 0) 8879 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 8880 // SinkBB: V1 = PHI(V2, V3) 8881 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8882 MachineFunction::iterator BBI = ++BB->getIterator(); 8883 MachineFunction *Fn = BB->getParent(); 8884 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8885 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 8886 Fn->insert(BBI, RSBBB); 8887 Fn->insert(BBI, SinkBB); 8888 8889 unsigned int ABSSrcReg = MI.getOperand(1).getReg(); 8890 unsigned int ABSDstReg = MI.getOperand(0).getReg(); 8891 bool ABSSrcKIll = MI.getOperand(1).isKill(); 8892 bool isThumb2 = Subtarget->isThumb2(); 8893 MachineRegisterInfo &MRI = Fn->getRegInfo(); 8894 // In Thumb mode S must not be specified if source register is the SP or 8895 // PC and if destination register is the SP, so restrict register class 8896 unsigned NewRsbDstReg = 8897 MRI.createVirtualRegister(isThumb2 ? 
&ARM::rGPRRegClass : &ARM::GPRRegClass); 8898 8899 // Transfer the remainder of BB and its successor edges to sinkMBB. 8900 SinkBB->splice(SinkBB->begin(), BB, 8901 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8902 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 8903 8904 BB->addSuccessor(RSBBB); 8905 BB->addSuccessor(SinkBB); 8906 8907 // fall through to SinkMBB 8908 RSBBB->addSuccessor(SinkBB); 8909 8910 // insert a cmp at the end of BB 8911 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 8912 .addReg(ABSSrcReg) 8913 .addImm(0) 8914 .add(predOps(ARMCC::AL)); 8915 8916 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 8917 BuildMI(BB, dl, 8918 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 8919 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 8920 8921 // insert rsbri in RSBBB 8922 // Note: BCC and rsbri will be converted into predicated rsbmi 8923 // by if-conversion pass 8924 BuildMI(*RSBBB, RSBBB->begin(), dl, 8925 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 8926 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 8927 .addImm(0) 8928 .add(predOps(ARMCC::AL)) 8929 .add(condCodeOp()); 8930 8931 // insert PHI in SinkBB, 8932 // reuse ABSDstReg to not change uses of ABS instruction 8933 BuildMI(*SinkBB, SinkBB->begin(), dl, 8934 TII->get(ARM::PHI), ABSDstReg) 8935 .addReg(NewRsbDstReg).addMBB(RSBBB) 8936 .addReg(ABSSrcReg).addMBB(BB); 8937 8938 // remove ABS instruction 8939 MI.eraseFromParent(); 8940 8941 // return last added BB 8942 return SinkBB; 8943 } 8944 case ARM::COPY_STRUCT_BYVAL_I32: 8945 ++NumLoopByVals; 8946 return EmitStructByval(MI, BB); 8947 case ARM::WIN__CHKSTK: 8948 return EmitLowered__chkstk(MI, BB); 8949 case ARM::WIN__DBZCHK: 8950 return EmitLowered__dbzchk(MI, BB); 8951 } 8952 } 8953 8954 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers 8955 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 8956 /// instead of as a custom inserter because we need the use list from the SDNode. 8957 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 8958 MachineInstr &MI, const SDNode *Node) { 8959 bool isThumb1 = Subtarget->isThumb1Only(); 8960 8961 DebugLoc DL = MI.getDebugLoc(); 8962 MachineFunction *MF = MI.getParent()->getParent(); 8963 MachineRegisterInfo &MRI = MF->getRegInfo(); 8964 MachineInstrBuilder MIB(*MF, MI); 8965 8966 // If the new dst/src is unused mark it as dead. 8967 if (!Node->hasAnyUseOfValue(0)) { 8968 MI.getOperand(0).setIsDead(true); 8969 } 8970 if (!Node->hasAnyUseOfValue(1)) { 8971 MI.getOperand(1).setIsDead(true); 8972 } 8973 8974 // The MEMCPY both defines and kills the scratch registers. 8975 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 8976 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 8977 : &ARM::GPRRegClass); 8978 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 8979 } 8980 } 8981 8982 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 8983 SDNode *Node) const { 8984 if (MI.getOpcode() == ARM::MEMCPY) { 8985 attachMEMCPYScratchRegs(Subtarget, MI, Node); 8986 return; 8987 } 8988 8989 const MCInstrDesc *MCID = &MI.getDesc(); 8990 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 8991 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 8992 // operand is still set to noreg. If needed, set the optional operand's 8993 // register to CPSR, and remove the redundant implicit def. 
8994 // 8995 // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>). 8996 8997 // Rename pseudo opcodes. 8998 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 8999 if (NewOpc) { 9000 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 9001 MCID = &TII->get(NewOpc); 9002 9003 assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 && 9004 "converted opcode should be the same except for cc_out"); 9005 9006 MI.setDesc(*MCID); 9007 9008 // Add the optional cc_out operand 9009 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 9010 } 9011 unsigned ccOutIdx = MCID->getNumOperands() - 1; 9012 9013 // Any ARM instruction that sets the 's' bit should specify an optional 9014 // "cc_out" operand in the last operand position. 9015 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 9016 assert(!NewOpc && "Optional cc_out operand required"); 9017 return; 9018 } 9019 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 9020 // since we already have an optional CPSR def. 9021 bool definesCPSR = false; 9022 bool deadCPSR = false; 9023 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 9024 ++i) { 9025 const MachineOperand &MO = MI.getOperand(i); 9026 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 9027 definesCPSR = true; 9028 if (MO.isDead()) 9029 deadCPSR = true; 9030 MI.RemoveOperand(i); 9031 break; 9032 } 9033 } 9034 if (!definesCPSR) { 9035 assert(!NewOpc && "Optional cc_out operand required"); 9036 return; 9037 } 9038 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 9039 if (deadCPSR) { 9040 assert(!MI.getOperand(ccOutIdx).getReg() && 9041 "expect uninitialized optional cc_out operand"); 9042 return; 9043 } 9044 9045 // If this instruction was defined with an optional CPSR def and its dag node 9046 // had a live implicit CPSR def, then activate the optional CPSR def. 9047 MachineOperand &MO = MI.getOperand(ccOutIdx); 9048 MO.setReg(ARM::CPSR); 9049 MO.setIsDef(true); 9050 } 9051 9052 //===----------------------------------------------------------------------===// 9053 // ARM Optimization Hooks 9054 //===----------------------------------------------------------------------===// 9055 9056 // Helper function that checks if N is a null or all ones constant. 9057 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 9058 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 9059 } 9060 9061 // Return true if N is conditionally 0 or all ones. 9062 // Detects these expressions where cc is an i1 value: 9063 // 9064 // (select cc 0, y) [AllOnes=0] 9065 // (select cc y, 0) [AllOnes=0] 9066 // (zext cc) [AllOnes=0] 9067 // (sext cc) [AllOnes=0/1] 9068 // (select cc -1, y) [AllOnes=1] 9069 // (select cc y, -1) [AllOnes=1] 9070 // 9071 // Invert is set when N is the null/all ones constant when CC is false. 9072 // OtherOp is set to the alternative value of N. 
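//
// Illustrative outcomes (sketch):
//   (select cc, 0, y)  [AllOnes=0]  ->  Invert=false, OtherOp=y
//   (select cc, y, 0)  [AllOnes=0]  ->  Invert=true,  OtherOp=y
//   (zext cc)          [AllOnes=0]  ->  Invert=true,  OtherOp=1
//   (sext cc)          [AllOnes=1]  ->  Invert=false, OtherOp=0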
9073 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 9074 SDValue &CC, bool &Invert, 9075 SDValue &OtherOp, 9076 SelectionDAG &DAG) { 9077 switch (N->getOpcode()) { 9078 default: return false; 9079 case ISD::SELECT: { 9080 CC = N->getOperand(0); 9081 SDValue N1 = N->getOperand(1); 9082 SDValue N2 = N->getOperand(2); 9083 if (isZeroOrAllOnes(N1, AllOnes)) { 9084 Invert = false; 9085 OtherOp = N2; 9086 return true; 9087 } 9088 if (isZeroOrAllOnes(N2, AllOnes)) { 9089 Invert = true; 9090 OtherOp = N1; 9091 return true; 9092 } 9093 return false; 9094 } 9095 case ISD::ZERO_EXTEND: 9096 // (zext cc) can never be the all ones value. 9097 if (AllOnes) 9098 return false; 9099 LLVM_FALLTHROUGH; 9100 case ISD::SIGN_EXTEND: { 9101 SDLoc dl(N); 9102 EVT VT = N->getValueType(0); 9103 CC = N->getOperand(0); 9104 if (CC.getValueType() != MVT::i1) 9105 return false; 9106 Invert = !AllOnes; 9107 if (AllOnes) 9108 // When looking for an AllOnes constant, N is an sext, and the 'other' 9109 // value is 0. 9110 OtherOp = DAG.getConstant(0, dl, VT); 9111 else if (N->getOpcode() == ISD::ZERO_EXTEND) 9112 // When looking for a 0 constant, N can be zext or sext. 9113 OtherOp = DAG.getConstant(1, dl, VT); 9114 else 9115 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 9116 VT); 9117 return true; 9118 } 9119 } 9120 } 9121 9122 // Combine a constant select operand into its use: 9123 // 9124 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9125 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9126 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 9127 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9128 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9129 // 9130 // The transform is rejected if the select doesn't have a constant operand that 9131 // is null, or all ones when AllOnes is set. 9132 // 9133 // Also recognize sext/zext from i1: 9134 // 9135 // (add (zext cc), x) -> (select cc (add x, 1), x) 9136 // (add (sext cc), x) -> (select cc (add x, -1), x) 9137 // 9138 // These transformations eventually create predicated instructions. 9139 // 9140 // @param N The node to transform. 9141 // @param Slct The N operand that is a select. 9142 // @param OtherOp The other N operand (x above). 9143 // @param DCI Context. 9144 // @param AllOnes Require the select constant to be all ones instead of null. 9145 // @returns The new node, or SDValue() on failure. 9146 static 9147 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 9148 TargetLowering::DAGCombinerInfo &DCI, 9149 bool AllOnes = false) { 9150 SelectionDAG &DAG = DCI.DAG; 9151 EVT VT = N->getValueType(0); 9152 SDValue NonConstantVal; 9153 SDValue CCOp; 9154 bool SwapSelectOps; 9155 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 9156 NonConstantVal, DAG)) 9157 return SDValue(); 9158 9159 // Slct is now know to be the desired identity constant when CC is true. 9160 SDValue TrueVal = OtherOp; 9161 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 9162 OtherOp, NonConstantVal); 9163 // Unless SwapSelectOps says CC should be false. 9164 if (SwapSelectOps) 9165 std::swap(TrueVal, FalseVal); 9166 9167 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 9168 CCOp, TrueVal, FalseVal); 9169 } 9170 9171 // Attempt combineSelectAndUse on each operand of a commutative operator N. 
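// For example (a sketch): given (and (select cc, -1, c), x) with AllOnes=1,
// the helper below first tries the select in operand 0 and then the one in
// operand 1, in each case only when that operand has a single use so no
// extra selects are left behind.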
9172 static 9173 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 9174 TargetLowering::DAGCombinerInfo &DCI) { 9175 SDValue N0 = N->getOperand(0); 9176 SDValue N1 = N->getOperand(1); 9177 if (N0.getNode()->hasOneUse()) 9178 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 9179 return Result; 9180 if (N1.getNode()->hasOneUse()) 9181 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 9182 return Result; 9183 return SDValue(); 9184 } 9185 9186 static bool IsVUZPShuffleNode(SDNode *N) { 9187 // VUZP shuffle node. 9188 if (N->getOpcode() == ARMISD::VUZP) 9189 return true; 9190 9191 // "VUZP" on i32 is an alias for VTRN. 9192 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 9193 return true; 9194 9195 return false; 9196 } 9197 9198 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 9199 TargetLowering::DAGCombinerInfo &DCI, 9200 const ARMSubtarget *Subtarget) { 9201 // Look for ADD(VUZP.0, VUZP.1). 9202 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 9203 N0 == N1) 9204 return SDValue(); 9205 9206 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 9207 if (!N->getValueType(0).is64BitVector()) 9208 return SDValue(); 9209 9210 // Generate vpadd. 9211 SelectionDAG &DAG = DCI.DAG; 9212 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9213 SDLoc dl(N); 9214 SDNode *Unzip = N0.getNode(); 9215 EVT VT = N->getValueType(0); 9216 9217 SmallVector<SDValue, 8> Ops; 9218 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 9219 TLI.getPointerTy(DAG.getDataLayout()))); 9220 Ops.push_back(Unzip->getOperand(0)); 9221 Ops.push_back(Unzip->getOperand(1)); 9222 9223 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 9224 } 9225 9226 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 9227 TargetLowering::DAGCombinerInfo &DCI, 9228 const ARMSubtarget *Subtarget) { 9229 // Check for two extended operands. 9230 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 9231 N1.getOpcode() == ISD::SIGN_EXTEND) && 9232 !(N0.getOpcode() == ISD::ZERO_EXTEND && 9233 N1.getOpcode() == ISD::ZERO_EXTEND)) 9234 return SDValue(); 9235 9236 SDValue N00 = N0.getOperand(0); 9237 SDValue N10 = N1.getOperand(0); 9238 9239 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 9240 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 9241 N00 == N10) 9242 return SDValue(); 9243 9244 // We only recognize Q register paddl here; this can't be reached until 9245 // after type legalization. 9246 if (!N00.getValueType().is64BitVector() || 9247 !N0.getValueType().is128BitVector()) 9248 return SDValue(); 9249 9250 // Generate vpaddl. 9251 SelectionDAG &DAG = DCI.DAG; 9252 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9253 SDLoc dl(N); 9254 EVT VT = N->getValueType(0); 9255 9256 SmallVector<SDValue, 8> Ops; 9257 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
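// (Illustrative note: a matched sign_extend pair selects
// Intrinsic::arm_neon_vpaddls and a zero_extend pair selects
// Intrinsic::arm_neon_vpaddlu, mirroring the opcode choice below.)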
9258 unsigned Opcode; 9259 if (N0.getOpcode() == ISD::SIGN_EXTEND) 9260 Opcode = Intrinsic::arm_neon_vpaddls; 9261 else 9262 Opcode = Intrinsic::arm_neon_vpaddlu; 9263 Ops.push_back(DAG.getConstant(Opcode, dl, 9264 TLI.getPointerTy(DAG.getDataLayout()))); 9265 EVT ElemTy = N00.getValueType().getVectorElementType(); 9266 unsigned NumElts = VT.getVectorNumElements(); 9267 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 9268 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 9269 N00.getOperand(0), N00.getOperand(1)); 9270 Ops.push_back(Concat); 9271 9272 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 9273 } 9274 9275 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 9276 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 9277 // much easier to match. 9278 static SDValue 9279 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 9280 TargetLowering::DAGCombinerInfo &DCI, 9281 const ARMSubtarget *Subtarget) { 9282 // Only perform optimization if after legalize, and if NEON is available. We 9283 // also expected both operands to be BUILD_VECTORs. 9284 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 9285 || N0.getOpcode() != ISD::BUILD_VECTOR 9286 || N1.getOpcode() != ISD::BUILD_VECTOR) 9287 return SDValue(); 9288 9289 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 9290 EVT VT = N->getValueType(0); 9291 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 9292 return SDValue(); 9293 9294 // Check that the vector operands are of the right form. 9295 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 9296 // operands, where N is the size of the formed vector. 9297 // Each EXTRACT_VECTOR should have the same input vector and odd or even 9298 // index such that we have a pair wise add pattern. 9299 9300 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. 9301 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9302 return SDValue(); 9303 SDValue Vec = N0->getOperand(0)->getOperand(0); 9304 SDNode *V = Vec.getNode(); 9305 unsigned nextIndex = 0; 9306 9307 // For each operands to the ADD which are BUILD_VECTORs, 9308 // check to see if each of their operands are an EXTRACT_VECTOR with 9309 // the same vector and appropriate index. 9310 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { 9311 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT 9312 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 9313 9314 SDValue ExtVec0 = N0->getOperand(i); 9315 SDValue ExtVec1 = N1->getOperand(i); 9316 9317 // First operand is the vector, verify its the same. 9318 if (V != ExtVec0->getOperand(0).getNode() || 9319 V != ExtVec1->getOperand(0).getNode()) 9320 return SDValue(); 9321 9322 // Second is the constant, verify its correct. 9323 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); 9324 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); 9325 9326 // For the constant, we want to see all the even or all the odd. 9327 if (!C0 || !C1 || C0->getZExtValue() != nextIndex 9328 || C1->getZExtValue() != nextIndex+1) 9329 return SDValue(); 9330 9331 // Increment index. 9332 nextIndex+=2; 9333 } else 9334 return SDValue(); 9335 } 9336 9337 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. 9338 if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 9339 return SDValue(); 9340 9341 // Create VPADDL node. 
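// Schematically, the pattern recognized above is (illustrative sketch):
//   N0 = build_vector (extract_elt v, 0), (extract_elt v, 2), ...
//   N1 = build_vector (extract_elt v, 1), (extract_elt v, 3), ...
//   add N0, N1  ==>  vpaddl of v in a widened type, then extended or
//                    truncated back to VT.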
9342 SelectionDAG &DAG = DCI.DAG; 9343 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9344 9345 SDLoc dl(N); 9346 9347 // Build operand list. 9348 SmallVector<SDValue, 8> Ops; 9349 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 9350 TLI.getPointerTy(DAG.getDataLayout()))); 9351 9352 // Input is the vector. 9353 Ops.push_back(Vec); 9354 9355 // Get widened type and narrowed type. 9356 MVT widenType; 9357 unsigned numElem = VT.getVectorNumElements(); 9358 9359 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 9360 switch (inputLaneType.getSimpleVT().SimpleTy) { 9361 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 9362 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 9363 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 9364 default: 9365 llvm_unreachable("Invalid vector element type for padd optimization."); 9366 } 9367 9368 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 9369 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 9370 return DAG.getNode(ExtOp, dl, VT, tmp); 9371 } 9372 9373 static SDValue findMUL_LOHI(SDValue V) { 9374 if (V->getOpcode() == ISD::UMUL_LOHI || 9375 V->getOpcode() == ISD::SMUL_LOHI) 9376 return V; 9377 return SDValue(); 9378 } 9379 9380 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, 9381 TargetLowering::DAGCombinerInfo &DCI, 9382 const ARMSubtarget *Subtarget) { 9383 9384 // Look for multiply add opportunities. 9385 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 9386 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 9387 // a glue link from the first add to the second add. 9388 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 9389 // a S/UMLAL instruction. 9390 // UMUL_LOHI 9391 // / :lo \ :hi 9392 // / \ [no multiline comment] 9393 // loAdd -> ADDE | 9394 // \ :glue / 9395 // \ / 9396 // ADDC <- hiAdd 9397 // 9398 assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); 9399 SDValue AddcOp0 = AddcNode->getOperand(0); 9400 SDValue AddcOp1 = AddcNode->getOperand(1); 9401 9402 // Check if the two operands are from the same mul_lohi node. 9403 if (AddcOp0.getNode() == AddcOp1.getNode()) 9404 return SDValue(); 9405 9406 assert(AddcNode->getNumValues() == 2 && 9407 AddcNode->getValueType(0) == MVT::i32 && 9408 "Expect ADDC with two result values. First: i32"); 9409 9410 // Check that we have a glued ADDC node. 9411 if (AddcNode->getValueType(1) != MVT::Glue) 9412 return SDValue(); 9413 9414 // Check that the ADDC adds the low result of the S/UMUL_LOHI. 9415 if (AddcOp0->getOpcode() != ISD::UMUL_LOHI && 9416 AddcOp0->getOpcode() != ISD::SMUL_LOHI && 9417 AddcOp1->getOpcode() != ISD::UMUL_LOHI && 9418 AddcOp1->getOpcode() != ISD::SMUL_LOHI) 9419 return SDValue(); 9420 9421 // Look for the glued ADDE. 9422 SDNode* AddeNode = AddcNode->getGluedUser(); 9423 if (!AddeNode) 9424 return SDValue(); 9425 9426 // Make sure it is really an ADDE. 9427 if (AddeNode->getOpcode() != ISD::ADDE) 9428 return SDValue(); 9429 9430 assert(AddeNode->getNumOperands() == 3 && 9431 AddeNode->getOperand(2).getValueType() == MVT::Glue && 9432 "ADDE node has the wrong inputs"); 9433 9434 // Check for the triangle shape. 9435 SDValue AddeOp0 = AddeNode->getOperand(0); 9436 SDValue AddeOp1 = AddeNode->getOperand(1); 9437 9438 // Make sure that the ADDE operands are not coming from the same node. 
9439 if (AddeOp0.getNode() == AddeOp1.getNode()) 9440 return SDValue(); 9441 9442 // Find the MUL_LOHI node walking up ADDE's operands. 9443 bool IsLeftOperandMUL = false; 9444 SDValue MULOp = findMUL_LOHI(AddeOp0); 9445 if (MULOp == SDValue()) 9446 MULOp = findMUL_LOHI(AddeOp1); 9447 else 9448 IsLeftOperandMUL = true; 9449 if (MULOp == SDValue()) 9450 return SDValue(); 9451 9452 // Figure out the right opcode. 9453 unsigned Opc = MULOp->getOpcode(); 9454 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 9455 9456 // Figure out the high and low input values to the MLAL node. 9457 SDValue* HiAdd = nullptr; 9458 SDValue* LoMul = nullptr; 9459 SDValue* LowAdd = nullptr; 9460 9461 // Ensure that ADDE is from high result of ISD::SMUL_LOHI. 9462 if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) 9463 return SDValue(); 9464 9465 if (IsLeftOperandMUL) 9466 HiAdd = &AddeOp1; 9467 else 9468 HiAdd = &AddeOp0; 9469 9470 9471 // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node 9472 // whose low result is fed to the ADDC we are checking. 9473 9474 if (AddcOp0 == MULOp.getValue(0)) { 9475 LoMul = &AddcOp0; 9476 LowAdd = &AddcOp1; 9477 } 9478 if (AddcOp1 == MULOp.getValue(0)) { 9479 LoMul = &AddcOp1; 9480 LowAdd = &AddcOp0; 9481 } 9482 9483 if (!LoMul) 9484 return SDValue(); 9485 9486 // Create the merged node. 9487 SelectionDAG &DAG = DCI.DAG; 9488 9489 // Build operand list. 9490 SmallVector<SDValue, 8> Ops; 9491 Ops.push_back(LoMul->getOperand(0)); 9492 Ops.push_back(LoMul->getOperand(1)); 9493 Ops.push_back(*LowAdd); 9494 Ops.push_back(*HiAdd); 9495 9496 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), 9497 DAG.getVTList(MVT::i32, MVT::i32), Ops); 9498 9499 // Replace the ADDs' nodes uses by the MLA node's values. 9500 SDValue HiMLALResult(MLALNode.getNode(), 1); 9501 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 9502 9503 SDValue LoMLALResult(MLALNode.getNode(), 0); 9504 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 9505 9506 // Return original node to notify the driver to stop replacing. 9507 SDValue resNode(AddcNode, 0); 9508 return resNode; 9509 } 9510 9511 static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, 9512 TargetLowering::DAGCombinerInfo &DCI, 9513 const ARMSubtarget *Subtarget) { 9514 // UMAAL is similar to UMLAL except that it adds two unsigned values. 9515 // While trying to combine for the other MLAL nodes, first search for the 9516 // chance to use UMAAL. Check if Addc uses another addc node which can first 9517 // be combined into a UMLAL. The other pattern is AddcNode being combined 9518 // into an UMLAL and then using another addc is handled in ISelDAGToDAG. 9519 9520 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || 9521 (Subtarget->isThumb() && !Subtarget->hasThumb2())) 9522 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); 9523 9524 SDNode *PrevAddc = nullptr; 9525 if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC) 9526 PrevAddc = AddcNode->getOperand(0).getNode(); 9527 else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC) 9528 PrevAddc = AddcNode->getOperand(1).getNode(); 9529 9530 // If there's no addc chains, just return a search for any MLAL. 9531 if (PrevAddc == nullptr) 9532 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); 9533 9534 // Try to convert the addc operand to an MLAL and if that fails try to 9535 // combine AddcNode. 
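// Note: AddCombineTo64bitMLAL returns the original ADDC value on success, so
// any result below other than SDValue(PrevAddc, 0) means PrevAddc was not
// turned into a UMLAL and we simply try a plain MLAL combine on AddcNode.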
9536 SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget); 9537 if (MLAL != SDValue(PrevAddc, 0)) 9538 return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); 9539 9540 // Find the converted UMAAL or quit if it doesn't exist. 9541 SDNode *UmlalNode = nullptr; 9542 SDValue AddHi; 9543 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 9544 UmlalNode = AddcNode->getOperand(0).getNode(); 9545 AddHi = AddcNode->getOperand(1); 9546 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 9547 UmlalNode = AddcNode->getOperand(1).getNode(); 9548 AddHi = AddcNode->getOperand(0); 9549 } else { 9550 return SDValue(); 9551 } 9552 9553 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 9554 // the ADDC as well as Zero. 9555 auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3)); 9556 9557 if (!Zero || Zero->getZExtValue() != 0) 9558 return SDValue(); 9559 9560 // Check that we have a glued ADDC node. 9561 if (AddcNode->getValueType(1) != MVT::Glue) 9562 return SDValue(); 9563 9564 // Look for the glued ADDE. 9565 SDNode* AddeNode = AddcNode->getGluedUser(); 9566 if (!AddeNode) 9567 return SDValue(); 9568 9569 if ((AddeNode->getOperand(0).getNode() == Zero && 9570 AddeNode->getOperand(1).getNode() == UmlalNode) || 9571 (AddeNode->getOperand(0).getNode() == UmlalNode && 9572 AddeNode->getOperand(1).getNode() == Zero)) { 9573 9574 SelectionDAG &DAG = DCI.DAG; 9575 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 9576 UmlalNode->getOperand(2), AddHi }; 9577 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 9578 DAG.getVTList(MVT::i32, MVT::i32), Ops); 9579 9580 // Replace the ADDs' nodes uses by the UMAAL node's values. 9581 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 9582 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 9583 9584 // Return original node to notify the driver to stop replacing. 9585 return SDValue(AddcNode, 0); 9586 } 9587 return SDValue(); 9588 } 9589 9590 /// PerformADDCCombine - Target-specific dag combine transform from 9591 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or 9592 /// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 9593 static SDValue PerformADDCCombine(SDNode *N, 9594 TargetLowering::DAGCombinerInfo &DCI, 9595 const ARMSubtarget *Subtarget) { 9596 9597 if (Subtarget->isThumb1Only()) return SDValue(); 9598 9599 // Only perform the checks after legalize when the pattern is available. 9600 if (DCI.isBeforeLegalize()) return SDValue(); 9601 9602 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 9603 } 9604 9605 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 9606 /// operands N0 and N1. This is a helper for PerformADDCombine that is 9607 /// called with the default operands, and if that fails, with commuted 9608 /// operands. 9609 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 9610 TargetLowering::DAGCombinerInfo &DCI, 9611 const ARMSubtarget *Subtarget){ 9612 // Attempt to create vpadd for this add. 9613 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 9614 return Result; 9615 9616 // Attempt to create vpaddl for this add. 
9617 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 9618 return Result; 9619 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 9620 Subtarget)) 9621 return Result; 9622 9623 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9624 if (N0.getNode()->hasOneUse()) 9625 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 9626 return Result; 9627 return SDValue(); 9628 } 9629 9630 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 9631 /// 9632 static SDValue PerformADDCombine(SDNode *N, 9633 TargetLowering::DAGCombinerInfo &DCI, 9634 const ARMSubtarget *Subtarget) { 9635 SDValue N0 = N->getOperand(0); 9636 SDValue N1 = N->getOperand(1); 9637 9638 // First try with the default operand order. 9639 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 9640 return Result; 9641 9642 // If that didn't work, try again with the operands commuted. 9643 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 9644 } 9645 9646 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 9647 /// 9648 static SDValue PerformSUBCombine(SDNode *N, 9649 TargetLowering::DAGCombinerInfo &DCI) { 9650 SDValue N0 = N->getOperand(0); 9651 SDValue N1 = N->getOperand(1); 9652 9653 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9654 if (N1.getNode()->hasOneUse()) 9655 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 9656 return Result; 9657 9658 return SDValue(); 9659 } 9660 9661 /// PerformVMULCombine 9662 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 9663 /// special multiplier accumulator forwarding. 9664 /// vmul d3, d0, d2 9665 /// vmla d3, d1, d2 9666 /// is faster than 9667 /// vadd d3, d0, d1 9668 /// vmul d3, d3, d2 9669 // However, for (A + B) * (A + B), 9670 // vadd d2, d0, d1 9671 // vmul d3, d0, d2 9672 // vmla d3, d1, d2 9673 // is slower than 9674 // vadd d2, d0, d1 9675 // vmul d3, d2, d2 9676 static SDValue PerformVMULCombine(SDNode *N, 9677 TargetLowering::DAGCombinerInfo &DCI, 9678 const ARMSubtarget *Subtarget) { 9679 if (!Subtarget->hasVMLxForwarding()) 9680 return SDValue(); 9681 9682 SelectionDAG &DAG = DCI.DAG; 9683 SDValue N0 = N->getOperand(0); 9684 SDValue N1 = N->getOperand(1); 9685 unsigned Opcode = N0.getOpcode(); 9686 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 9687 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 9688 Opcode = N1.getOpcode(); 9689 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 9690 Opcode != ISD::FADD && Opcode != ISD::FSUB) 9691 return SDValue(); 9692 std::swap(N0, N1); 9693 } 9694 9695 if (N0 == N1) 9696 return SDValue(); 9697 9698 EVT VT = N->getValueType(0); 9699 SDLoc DL(N); 9700 SDValue N00 = N0->getOperand(0); 9701 SDValue N01 = N0->getOperand(1); 9702 return DAG.getNode(Opcode, DL, VT, 9703 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 9704 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 9705 } 9706 9707 static SDValue PerformMULCombine(SDNode *N, 9708 TargetLowering::DAGCombinerInfo &DCI, 9709 const ARMSubtarget *Subtarget) { 9710 SelectionDAG &DAG = DCI.DAG; 9711 9712 if (Subtarget->isThumb1Only()) 9713 return SDValue(); 9714 9715 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9716 return SDValue(); 9717 9718 EVT VT = N->getValueType(0); 9719 if (VT.is64BitVector() || VT.is128BitVector()) 9720 return PerformVMULCombine(N, DCI, Subtarget); 9721 if (VT != MVT::i32) 9722 return SDValue(); 9723 9724 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9725 if (!C) 9726 
return SDValue(); 9727 9728 int64_t MulAmt = C->getSExtValue(); 9729 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 9730 9731 ShiftAmt = ShiftAmt & (32 - 1); 9732 SDValue V = N->getOperand(0); 9733 SDLoc DL(N); 9734 9735 SDValue Res; 9736 MulAmt >>= ShiftAmt; 9737 9738 if (MulAmt >= 0) { 9739 if (isPowerOf2_32(MulAmt - 1)) { 9740 // (mul x, 2^N + 1) => (add (shl x, N), x) 9741 Res = DAG.getNode(ISD::ADD, DL, VT, 9742 V, 9743 DAG.getNode(ISD::SHL, DL, VT, 9744 V, 9745 DAG.getConstant(Log2_32(MulAmt - 1), DL, 9746 MVT::i32))); 9747 } else if (isPowerOf2_32(MulAmt + 1)) { 9748 // (mul x, 2^N - 1) => (sub (shl x, N), x) 9749 Res = DAG.getNode(ISD::SUB, DL, VT, 9750 DAG.getNode(ISD::SHL, DL, VT, 9751 V, 9752 DAG.getConstant(Log2_32(MulAmt + 1), DL, 9753 MVT::i32)), 9754 V); 9755 } else 9756 return SDValue(); 9757 } else { 9758 uint64_t MulAmtAbs = -MulAmt; 9759 if (isPowerOf2_32(MulAmtAbs + 1)) { 9760 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 9761 Res = DAG.getNode(ISD::SUB, DL, VT, 9762 V, 9763 DAG.getNode(ISD::SHL, DL, VT, 9764 V, 9765 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 9766 MVT::i32))); 9767 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 9768 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 9769 Res = DAG.getNode(ISD::ADD, DL, VT, 9770 V, 9771 DAG.getNode(ISD::SHL, DL, VT, 9772 V, 9773 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 9774 MVT::i32))); 9775 Res = DAG.getNode(ISD::SUB, DL, VT, 9776 DAG.getConstant(0, DL, MVT::i32), Res); 9777 9778 } else 9779 return SDValue(); 9780 } 9781 9782 if (ShiftAmt != 0) 9783 Res = DAG.getNode(ISD::SHL, DL, VT, 9784 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 9785 9786 // Do not add new nodes to DAG combiner worklist. 9787 DCI.CombineTo(N, Res, false); 9788 return SDValue(); 9789 } 9790 9791 static SDValue PerformANDCombine(SDNode *N, 9792 TargetLowering::DAGCombinerInfo &DCI, 9793 const ARMSubtarget *Subtarget) { 9794 9795 // Attempt to use immediate-form VBIC 9796 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 9797 SDLoc dl(N); 9798 EVT VT = N->getValueType(0); 9799 SelectionDAG &DAG = DCI.DAG; 9800 9801 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9802 return SDValue(); 9803 9804 APInt SplatBits, SplatUndef; 9805 unsigned SplatBitSize; 9806 bool HasAnyUndefs; 9807 if (BVN && 9808 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 9809 if (SplatBitSize <= 64) { 9810 EVT VbicVT; 9811 SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), 9812 SplatUndef.getZExtValue(), SplatBitSize, 9813 DAG, dl, VbicVT, VT.is128BitVector(), 9814 OtherModImm); 9815 if (Val.getNode()) { 9816 SDValue Input = 9817 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 9818 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 9819 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 9820 } 9821 } 9822 } 9823 9824 if (!Subtarget->isThumb1Only()) { 9825 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 9826 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 9827 return Result; 9828 } 9829 9830 return SDValue(); 9831 } 9832 9833 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 9834 static SDValue PerformORCombine(SDNode *N, 9835 TargetLowering::DAGCombinerInfo &DCI, 9836 const ARMSubtarget *Subtarget) { 9837 // Attempt to use immediate-form VORR 9838 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 9839 SDLoc dl(N); 9840 EVT VT = N->getValueType(0); 9841 SelectionDAG &DAG = DCI.DAG; 9842 9843 
if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9844 return SDValue(); 9845 9846 APInt SplatBits, SplatUndef; 9847 unsigned SplatBitSize; 9848 bool HasAnyUndefs; 9849 if (BVN && Subtarget->hasNEON() && 9850 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 9851 if (SplatBitSize <= 64) { 9852 EVT VorrVT; 9853 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 9854 SplatUndef.getZExtValue(), SplatBitSize, 9855 DAG, dl, VorrVT, VT.is128BitVector(), 9856 OtherModImm); 9857 if (Val.getNode()) { 9858 SDValue Input = 9859 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 9860 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 9861 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 9862 } 9863 } 9864 } 9865 9866 if (!Subtarget->isThumb1Only()) { 9867 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9868 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 9869 return Result; 9870 } 9871 9872 // The code below optimizes (or (and X, Y), Z). 9873 // The AND operand needs to have a single user to make these optimizations 9874 // profitable. 9875 SDValue N0 = N->getOperand(0); 9876 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 9877 return SDValue(); 9878 SDValue N1 = N->getOperand(1); 9879 9880 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 9881 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 9882 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 9883 APInt SplatUndef; 9884 unsigned SplatBitSize; 9885 bool HasAnyUndefs; 9886 9887 APInt SplatBits0, SplatBits1; 9888 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 9889 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 9890 // Ensure that the second operand of both ands are constants 9891 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 9892 HasAnyUndefs) && !HasAnyUndefs) { 9893 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 9894 HasAnyUndefs) && !HasAnyUndefs) { 9895 // Ensure that the bit width of the constants are the same and that 9896 // the splat arguments are logical inverses as per the pattern we 9897 // are trying to simplify. 9898 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 9899 SplatBits0 == ~SplatBits1) { 9900 // Canonicalize the vector type to make instruction selection 9901 // simpler. 9902 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 9903 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 9904 N0->getOperand(1), 9905 N0->getOperand(0), 9906 N1->getOperand(0)); 9907 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 9908 } 9909 } 9910 } 9911 } 9912 9913 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 9914 // reasonable. 
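// Illustrative example for pattern (2a) below (with made-up masks):
//   or (and A, 0xffff00ff), (and B, 0x0000ff00)
// becomes roughly "lsr tmp, B, #8; bfi A, tmp, #8, #8", copying bits 8..15
// of B into bits 8..15 of A.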
9915 9916 // BFI is only available on V6T2+ 9917 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 9918 return SDValue(); 9919 9920 SDLoc DL(N); 9921 // 1) or (and A, mask), val => ARMbfi A, val, mask 9922 // iff (val & mask) == val 9923 // 9924 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 9925 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 9926 // && mask == ~mask2 9927 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 9928 // && ~mask == mask2 9929 // (i.e., copy a bitfield value into another bitfield of the same width) 9930 9931 if (VT != MVT::i32) 9932 return SDValue(); 9933 9934 SDValue N00 = N0.getOperand(0); 9935 9936 // The value and the mask need to be constants so we can verify this is 9937 // actually a bitfield set. If the mask is 0xffff, we can do better 9938 // via a movt instruction, so don't use BFI in that case. 9939 SDValue MaskOp = N0.getOperand(1); 9940 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 9941 if (!MaskC) 9942 return SDValue(); 9943 unsigned Mask = MaskC->getZExtValue(); 9944 if (Mask == 0xffff) 9945 return SDValue(); 9946 SDValue Res; 9947 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 9948 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9949 if (N1C) { 9950 unsigned Val = N1C->getZExtValue(); 9951 if ((Val & ~Mask) != Val) 9952 return SDValue(); 9953 9954 if (ARM::isBitFieldInvertedMask(Mask)) { 9955 Val >>= countTrailingZeros(~Mask); 9956 9957 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 9958 DAG.getConstant(Val, DL, MVT::i32), 9959 DAG.getConstant(Mask, DL, MVT::i32)); 9960 9961 // Do not add new nodes to DAG combiner worklist. 9962 DCI.CombineTo(N, Res, false); 9963 return SDValue(); 9964 } 9965 } else if (N1.getOpcode() == ISD::AND) { 9966 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 9967 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 9968 if (!N11C) 9969 return SDValue(); 9970 unsigned Mask2 = N11C->getZExtValue(); 9971 9972 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 9973 // as is to match. 9974 if (ARM::isBitFieldInvertedMask(Mask) && 9975 (Mask == ~Mask2)) { 9976 // The pack halfword instruction works better for masks that fit it, 9977 // so use that when it's available. 9978 if (Subtarget->hasT2ExtractPack() && 9979 (Mask == 0xffff || Mask == 0xffff0000)) 9980 return SDValue(); 9981 // 2a 9982 unsigned amt = countTrailingZeros(Mask2); 9983 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 9984 DAG.getConstant(amt, DL, MVT::i32)); 9985 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 9986 DAG.getConstant(Mask, DL, MVT::i32)); 9987 // Do not add new nodes to DAG combiner worklist. 9988 DCI.CombineTo(N, Res, false); 9989 return SDValue(); 9990 } else if (ARM::isBitFieldInvertedMask(~Mask) && 9991 (~Mask == Mask2)) { 9992 // The pack halfword instruction works better for masks that fit it, 9993 // so use that when it's available. 9994 if (Subtarget->hasT2ExtractPack() && 9995 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 9996 return SDValue(); 9997 // 2b 9998 unsigned lsb = countTrailingZeros(Mask); 9999 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 10000 DAG.getConstant(lsb, DL, MVT::i32)); 10001 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 10002 DAG.getConstant(Mask2, DL, MVT::i32)); 10003 // Do not add new nodes to DAG combiner worklist. 
10004 DCI.CombineTo(N, Res, false); 10005 return SDValue(); 10006 } 10007 } 10008 10009 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 10010 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 10011 ARM::isBitFieldInvertedMask(~Mask)) { 10012 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 10013 // where lsb(mask) == #shamt and masked bits of B are known zero. 10014 SDValue ShAmt = N00.getOperand(1); 10015 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 10016 unsigned LSB = countTrailingZeros(Mask); 10017 if (ShAmtC != LSB) 10018 return SDValue(); 10019 10020 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 10021 DAG.getConstant(~Mask, DL, MVT::i32)); 10022 10023 // Do not add new nodes to DAG combiner worklist. 10024 DCI.CombineTo(N, Res, false); 10025 } 10026 10027 return SDValue(); 10028 } 10029 10030 static SDValue PerformXORCombine(SDNode *N, 10031 TargetLowering::DAGCombinerInfo &DCI, 10032 const ARMSubtarget *Subtarget) { 10033 EVT VT = N->getValueType(0); 10034 SelectionDAG &DAG = DCI.DAG; 10035 10036 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10037 return SDValue(); 10038 10039 if (!Subtarget->isThumb1Only()) { 10040 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 10041 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 10042 return Result; 10043 } 10044 10045 return SDValue(); 10046 } 10047 10048 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 10049 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 10050 // their position in "to" (Rd). 10051 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 10052 assert(N->getOpcode() == ARMISD::BFI); 10053 10054 SDValue From = N->getOperand(1); 10055 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 10056 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 10057 10058 // If the Base came from a SHR #C, we can deduce that it is really testing bit 10059 // #C in the base of the SHR. 10060 if (From->getOpcode() == ISD::SRL && 10061 isa<ConstantSDNode>(From->getOperand(1))) { 10062 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 10063 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 10064 FromMask <<= Shift.getLimitedValue(31); 10065 From = From->getOperand(0); 10066 } 10067 10068 return From; 10069 } 10070 10071 // If A and B contain one contiguous set of bits, does A | B == A . B? 10072 // 10073 // Neither A nor B must be zero. 10074 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 10075 unsigned LastActiveBitInA = A.countTrailingZeros(); 10076 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 10077 return LastActiveBitInA - 1 == FirstActiveBitInB; 10078 } 10079 10080 static SDValue FindBFIToCombineWith(SDNode *N) { 10081 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 10082 // if one exists. 10083 APInt ToMask, FromMask; 10084 SDValue From = ParseBFI(N, ToMask, FromMask); 10085 SDValue To = N->getOperand(0); 10086 10087 // Now check for a compatible BFI to merge with. We can pass through BFIs that 10088 // aren't compatible, but not if they set the same bit in their destination as 10089 // we do (or that of any BFI we're going to combine with). 
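// Illustrative chain (sketch): two BFIs that insert adjacent destination bit
// ranges taken from adjacent bits of the same 'From' value can be merged into
// a single, wider BFI; unrelated BFIs in between are stepped over as long as
// their destination bits do not overlap ours.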
10090 SDValue V = To; 10091 APInt CombinedToMask = ToMask; 10092 while (V.getOpcode() == ARMISD::BFI) { 10093 APInt NewToMask, NewFromMask; 10094 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 10095 if (NewFrom != From) { 10096 // This BFI has a different base. Keep going. 10097 CombinedToMask |= NewToMask; 10098 V = V.getOperand(0); 10099 continue; 10100 } 10101 10102 // Do the written bits conflict with any we've seen so far? 10103 if ((NewToMask & CombinedToMask).getBoolValue()) 10104 // Conflicting bits - bail out because going further is unsafe. 10105 return SDValue(); 10106 10107 // Are the new bits contiguous when combined with the old bits? 10108 if (BitsProperlyConcatenate(ToMask, NewToMask) && 10109 BitsProperlyConcatenate(FromMask, NewFromMask)) 10110 return V; 10111 if (BitsProperlyConcatenate(NewToMask, ToMask) && 10112 BitsProperlyConcatenate(NewFromMask, FromMask)) 10113 return V; 10114 10115 // We've seen a write to some bits, so track it. 10116 CombinedToMask |= NewToMask; 10117 // Keep going... 10118 V = V.getOperand(0); 10119 } 10120 10121 return SDValue(); 10122 } 10123 10124 static SDValue PerformBFICombine(SDNode *N, 10125 TargetLowering::DAGCombinerInfo &DCI) { 10126 SDValue N1 = N->getOperand(1); 10127 if (N1.getOpcode() == ISD::AND) { 10128 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 10129 // the bits being cleared by the AND are not demanded by the BFI. 10130 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 10131 if (!N11C) 10132 return SDValue(); 10133 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 10134 unsigned LSB = countTrailingZeros(~InvMask); 10135 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 10136 assert(Width < 10137 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 10138 "undefined behavior"); 10139 unsigned Mask = (1u << Width) - 1; 10140 unsigned Mask2 = N11C->getZExtValue(); 10141 if ((Mask & (~Mask2)) == 0) 10142 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 10143 N->getOperand(0), N1.getOperand(0), 10144 N->getOperand(2)); 10145 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 10146 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 10147 // Keep track of any consecutive bits set that all come from the same base 10148 // value. We can combine these together into a single BFI. 10149 SDValue CombineBFI = FindBFIToCombineWith(N); 10150 if (CombineBFI == SDValue()) 10151 return SDValue(); 10152 10153 // We've found a BFI. 10154 APInt ToMask1, FromMask1; 10155 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 10156 10157 APInt ToMask2, FromMask2; 10158 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 10159 assert(From1 == From2); 10160 (void)From2; 10161 10162 // First, unlink CombineBFI. 10163 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 10164 // Then create a new BFI, combining the two together. 
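// (Note: if the merged field does not start at bit 0 of From1, the code below
// first shifts From1 right by NewFromMask.countTrailingZeros() so the field
// lands in the low bits expected by BFI.)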
10165 APInt NewFromMask = FromMask1 | FromMask2; 10166 APInt NewToMask = ToMask1 | ToMask2; 10167 10168 EVT VT = N->getValueType(0); 10169 SDLoc dl(N); 10170 10171 if (NewFromMask[0] == 0) 10172 From1 = DCI.DAG.getNode( 10173 ISD::SRL, dl, VT, From1, 10174 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 10175 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 10176 DCI.DAG.getConstant(~NewToMask, dl, VT)); 10177 } 10178 return SDValue(); 10179 } 10180 10181 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 10182 /// ARMISD::VMOVRRD. 10183 static SDValue PerformVMOVRRDCombine(SDNode *N, 10184 TargetLowering::DAGCombinerInfo &DCI, 10185 const ARMSubtarget *Subtarget) { 10186 // vmovrrd(vmovdrr x, y) -> x,y 10187 SDValue InDouble = N->getOperand(0); 10188 if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) 10189 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 10190 10191 // vmovrrd(load f64) -> (load i32), (load i32) 10192 SDNode *InNode = InDouble.getNode(); 10193 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 10194 InNode->getValueType(0) == MVT::f64 && 10195 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 10196 !cast<LoadSDNode>(InNode)->isVolatile()) { 10197 // TODO: Should this be done for non-FrameIndex operands? 10198 LoadSDNode *LD = cast<LoadSDNode>(InNode); 10199 10200 SelectionDAG &DAG = DCI.DAG; 10201 SDLoc DL(LD); 10202 SDValue BasePtr = LD->getBasePtr(); 10203 SDValue NewLD1 = 10204 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 10205 LD->getAlignment(), LD->getMemOperand()->getFlags()); 10206 10207 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 10208 DAG.getConstant(4, DL, MVT::i32)); 10209 SDValue NewLD2 = DAG.getLoad( 10210 MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), 10211 std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); 10212 10213 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 10214 if (DCI.DAG.getDataLayout().isBigEndian()) 10215 std::swap (NewLD1, NewLD2); 10216 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 10217 return Result; 10218 } 10219 10220 return SDValue(); 10221 } 10222 10223 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 10224 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 10225 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 10226 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 10227 SDValue Op0 = N->getOperand(0); 10228 SDValue Op1 = N->getOperand(1); 10229 if (Op0.getOpcode() == ISD::BITCAST) 10230 Op0 = Op0.getOperand(0); 10231 if (Op1.getOpcode() == ISD::BITCAST) 10232 Op1 = Op1.getOperand(0); 10233 if (Op0.getOpcode() == ARMISD::VMOVRRD && 10234 Op0.getNode() == Op1.getNode() && 10235 Op0.getResNo() == 0 && Op1.getResNo() == 1) 10236 return DAG.getNode(ISD::BITCAST, SDLoc(N), 10237 N->getValueType(0), Op0.getOperand(0)); 10238 return SDValue(); 10239 } 10240 10241 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 10242 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 10243 /// i64 vector to have f64 elements, since the value can then be loaded 10244 /// directly into a VFP register. 
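/// For example (illustrative): a v2i64 build_vector whose elements are
/// ordinary loads can be rebuilt as a v2f64 build_vector of f64 loads and
/// bitcast back, instead of having each i64 element split into a pair of
/// i32 GPR loads during type legalization.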
10245 static bool hasNormalLoadOperand(SDNode *N) { 10246 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 10247 for (unsigned i = 0; i < NumElts; ++i) { 10248 SDNode *Elt = N->getOperand(i).getNode(); 10249 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 10250 return true; 10251 } 10252 return false; 10253 } 10254 10255 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 10256 /// ISD::BUILD_VECTOR. 10257 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 10258 TargetLowering::DAGCombinerInfo &DCI, 10259 const ARMSubtarget *Subtarget) { 10260 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 10261 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 10262 // into a pair of GPRs, which is fine when the value is used as a scalar, 10263 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 10264 SelectionDAG &DAG = DCI.DAG; 10265 if (N->getNumOperands() == 2) 10266 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 10267 return RV; 10268 10269 // Load i64 elements as f64 values so that type legalization does not split 10270 // them up into i32 values. 10271 EVT VT = N->getValueType(0); 10272 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 10273 return SDValue(); 10274 SDLoc dl(N); 10275 SmallVector<SDValue, 8> Ops; 10276 unsigned NumElts = VT.getVectorNumElements(); 10277 for (unsigned i = 0; i < NumElts; ++i) { 10278 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 10279 Ops.push_back(V); 10280 // Make the DAGCombiner fold the bitcast. 10281 DCI.AddToWorklist(V.getNode()); 10282 } 10283 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 10284 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 10285 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 10286 } 10287 10288 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 10289 static SDValue 10290 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 10291 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 10292 // At that time, we may have inserted bitcasts from integer to float. 10293 // If these bitcasts have survived DAGCombine, change the lowering of this 10294 // BUILD_VECTOR in something more vector friendly, i.e., that does not 10295 // force to use floating point types. 10296 10297 // Make sure we can change the type of the vector. 10298 // This is possible iff: 10299 // 1. The vector is only used in a bitcast to a integer type. I.e., 10300 // 1.1. Vector is used only once. 10301 // 1.2. Use is a bit convert to an integer type. 10302 // 2. The size of its operands are 32-bits (64-bits are not legal). 10303 EVT VT = N->getValueType(0); 10304 EVT EltVT = VT.getVectorElementType(); 10305 10306 // Check 1.1. and 2. 10307 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 10308 return SDValue(); 10309 10310 // By construction, the input type must be float. 10311 assert(EltVT == MVT::f32 && "Unexpected type!"); 10312 10313 // Check 1.2. 10314 SDNode *Use = *N->use_begin(); 10315 if (Use->getOpcode() != ISD::BITCAST || 10316 Use->getValueType(0).isFloatingPoint()) 10317 return SDValue(); 10318 10319 // Check profitability. 10320 // Model is, if more than half of the relevant operands are bitcast from 10321 // i32, turn the build_vector into a sequence of insert_vector_elt. 10322 // Relevant operands are everything that is not statically 10323 // (i.e., at compile time) bitcasted. 
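// E.g. (illustrative): in a 4-element vector with one constant lane and two
// lanes bitcast from i32, NumOfRelevantElts is 3 and NumOfBitCastedElts is 2,
// so more than half of the relevant lanes are bitcasts and the rewrite below
// proceeds.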
10324 unsigned NumOfBitCastedElts = 0; 10325 unsigned NumElts = VT.getVectorNumElements(); 10326 unsigned NumOfRelevantElts = NumElts; 10327 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 10328 SDValue Elt = N->getOperand(Idx); 10329 if (Elt->getOpcode() == ISD::BITCAST) { 10330 // Assume only bit cast to i32 will go away. 10331 if (Elt->getOperand(0).getValueType() == MVT::i32) 10332 ++NumOfBitCastedElts; 10333 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 10334 // Constants are statically casted, thus do not count them as 10335 // relevant operands. 10336 --NumOfRelevantElts; 10337 } 10338 10339 // Check if more than half of the elements require a non-free bitcast. 10340 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 10341 return SDValue(); 10342 10343 SelectionDAG &DAG = DCI.DAG; 10344 // Create the new vector type. 10345 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 10346 // Check if the type is legal. 10347 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10348 if (!TLI.isTypeLegal(VecVT)) 10349 return SDValue(); 10350 10351 // Combine: 10352 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 10353 // => BITCAST INSERT_VECTOR_ELT 10354 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 10355 // (BITCAST EN), N. 10356 SDValue Vec = DAG.getUNDEF(VecVT); 10357 SDLoc dl(N); 10358 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 10359 SDValue V = N->getOperand(Idx); 10360 if (V.isUndef()) 10361 continue; 10362 if (V.getOpcode() == ISD::BITCAST && 10363 V->getOperand(0).getValueType() == MVT::i32) 10364 // Fold obvious case. 10365 V = V.getOperand(0); 10366 else { 10367 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 10368 // Make the DAGCombiner fold the bitcasts. 10369 DCI.AddToWorklist(V.getNode()); 10370 } 10371 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 10372 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 10373 } 10374 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 10375 // Make the DAGCombiner fold the bitcasts. 10376 DCI.AddToWorklist(Vec.getNode()); 10377 return Vec; 10378 } 10379 10380 /// PerformInsertEltCombine - Target-specific dag combine xforms for 10381 /// ISD::INSERT_VECTOR_ELT. 10382 static SDValue PerformInsertEltCombine(SDNode *N, 10383 TargetLowering::DAGCombinerInfo &DCI) { 10384 // Bitcast an i64 load inserted into a vector to f64. 10385 // Otherwise, the i64 value will be legalized to a pair of i32 values. 10386 EVT VT = N->getValueType(0); 10387 SDNode *Elt = N->getOperand(1).getNode(); 10388 if (VT.getVectorElementType() != MVT::i64 || 10389 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 10390 return SDValue(); 10391 10392 SelectionDAG &DAG = DCI.DAG; 10393 SDLoc dl(N); 10394 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 10395 VT.getVectorNumElements()); 10396 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 10397 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 10398 // Make the DAGCombiner fold the bitcasts. 10399 DCI.AddToWorklist(Vec.getNode()); 10400 DCI.AddToWorklist(V.getNode()); 10401 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 10402 Vec, V, N->getOperand(2)); 10403 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 10404 } 10405 10406 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 10407 /// ISD::VECTOR_SHUFFLE. 
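/// For example (illustrative): in a 4-element shuffle, a mask entry of 4 (the
/// first lane of the second concatenated operand) is remapped to 2, which is
/// where v2 begins inside the single concatenated operand.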
10408 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 10409 // The LLVM shufflevector instruction does not require the shuffle mask 10410 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 10411 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 10412 // operands do not match the mask length, they are extended by concatenating 10413 // them with undef vectors. That is probably the right thing for other 10414 // targets, but for NEON it is better to concatenate two double-register 10415 // size vector operands into a single quad-register size vector. Do that 10416 // transformation here: 10417 // shuffle(concat(v1, undef), concat(v2, undef)) -> 10418 // shuffle(concat(v1, v2), undef) 10419 SDValue Op0 = N->getOperand(0); 10420 SDValue Op1 = N->getOperand(1); 10421 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 10422 Op1.getOpcode() != ISD::CONCAT_VECTORS || 10423 Op0.getNumOperands() != 2 || 10424 Op1.getNumOperands() != 2) 10425 return SDValue(); 10426 SDValue Concat0Op1 = Op0.getOperand(1); 10427 SDValue Concat1Op1 = Op1.getOperand(1); 10428 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 10429 return SDValue(); 10430 // Skip the transformation if any of the types are illegal. 10431 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10432 EVT VT = N->getValueType(0); 10433 if (!TLI.isTypeLegal(VT) || 10434 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 10435 !TLI.isTypeLegal(Concat1Op1.getValueType())) 10436 return SDValue(); 10437 10438 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 10439 Op0.getOperand(0), Op1.getOperand(0)); 10440 // Translate the shuffle mask. 10441 SmallVector<int, 16> NewMask; 10442 unsigned NumElts = VT.getVectorNumElements(); 10443 unsigned HalfElts = NumElts/2; 10444 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 10445 for (unsigned n = 0; n < NumElts; ++n) { 10446 int MaskElt = SVN->getMaskElt(n); 10447 int NewElt = -1; 10448 if (MaskElt < (int)HalfElts) 10449 NewElt = MaskElt; 10450 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 10451 NewElt = HalfElts + MaskElt - NumElts; 10452 NewMask.push_back(NewElt); 10453 } 10454 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 10455 DAG.getUNDEF(VT), NewMask); 10456 } 10457 10458 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 10459 /// NEON load/store intrinsics, and generic vector load/stores, to merge 10460 /// base address updates. 10461 /// For generic load/stores, the memory type is assumed to be a vector. 10462 /// The caller is assumed to have checked legality. 10463 static SDValue CombineBaseUpdate(SDNode *N, 10464 TargetLowering::DAGCombinerInfo &DCI) { 10465 SelectionDAG &DAG = DCI.DAG; 10466 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 10467 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 10468 const bool isStore = N->getOpcode() == ISD::STORE; 10469 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 10470 SDValue Addr = N->getOperand(AddrOpIdx); 10471 MemSDNode *MemN = cast<MemSDNode>(N); 10472 SDLoc dl(N); 10473 10474 // Search for a use of the address operand that is an increment. 10475 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 10476 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 10477 SDNode *User = *UI; 10478 if (User->getOpcode() != ISD::ADD || 10479 UI.getUse().getResNo() != Addr.getResNo()) 10480 continue; 10481 10482 // Check that the add is independent of the load/store. 
Otherwise, folding 10483 // it would create a cycle. 10484 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 10485 continue; 10486 10487 // Find the new opcode for the updating load/store. 10488 bool isLoadOp = true; 10489 bool isLaneOp = false; 10490 unsigned NewOpc = 0; 10491 unsigned NumVecs = 0; 10492 if (isIntrinsic) { 10493 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 10494 switch (IntNo) { 10495 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 10496 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 10497 NumVecs = 1; break; 10498 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 10499 NumVecs = 2; break; 10500 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 10501 NumVecs = 3; break; 10502 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 10503 NumVecs = 4; break; 10504 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 10505 NumVecs = 2; isLaneOp = true; break; 10506 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 10507 NumVecs = 3; isLaneOp = true; break; 10508 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 10509 NumVecs = 4; isLaneOp = true; break; 10510 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 10511 NumVecs = 1; isLoadOp = false; break; 10512 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 10513 NumVecs = 2; isLoadOp = false; break; 10514 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 10515 NumVecs = 3; isLoadOp = false; break; 10516 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 10517 NumVecs = 4; isLoadOp = false; break; 10518 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 10519 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 10520 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 10521 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 10522 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 10523 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 10524 } 10525 } else { 10526 isLaneOp = true; 10527 switch (N->getOpcode()) { 10528 default: llvm_unreachable("unexpected opcode for Neon base update"); 10529 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 10530 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 10531 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 10532 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 10533 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 10534 NumVecs = 1; isLaneOp = false; break; 10535 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 10536 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 10537 } 10538 } 10539 10540 // Find the size of memory referenced by the load/store. 10541 EVT VecTy; 10542 if (isLoadOp) { 10543 VecTy = N->getValueType(0); 10544 } else if (isIntrinsic) { 10545 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 10546 } else { 10547 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 10548 VecTy = N->getOperand(1).getValueType(); 10549 } 10550 10551 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 10552 if (isLaneOp) 10553 NumBytes /= VecTy.getVectorNumElements(); 10554 10555 // If the increment is a constant, it must match the memory ref size. 10556 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
        1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint64_t IncVal = CInc->getZExtValue();
      if (IncVal != NumBytes)
        continue;
    } else if (NumBytes >= 3 * 16) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-lane intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
10715 if (UI.getUse().getResNo() == NumVecs) 10716 continue; 10717 SDNode *User = *UI; 10718 if (User->getOpcode() != ARMISD::VDUPLANE || 10719 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 10720 return false; 10721 } 10722 10723 // Create the vldN-dup node. 10724 EVT Tys[5]; 10725 unsigned n; 10726 for (n = 0; n < NumVecs; ++n) 10727 Tys[n] = VT; 10728 Tys[n] = MVT::Other; 10729 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 10730 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 10731 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 10732 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 10733 Ops, VLDMemInt->getMemoryVT(), 10734 VLDMemInt->getMemOperand()); 10735 10736 // Update the uses. 10737 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 10738 UI != UE; ++UI) { 10739 unsigned ResNo = UI.getUse().getResNo(); 10740 // Ignore uses of the chain result. 10741 if (ResNo == NumVecs) 10742 continue; 10743 SDNode *User = *UI; 10744 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 10745 } 10746 10747 // Now the vldN-lane intrinsic is dead except for its chain result. 10748 // Update uses of the chain. 10749 std::vector<SDValue> VLDDupResults; 10750 for (unsigned n = 0; n < NumVecs; ++n) 10751 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 10752 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 10753 DCI.CombineTo(VLD, VLDDupResults); 10754 10755 return true; 10756 } 10757 10758 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 10759 /// ARMISD::VDUPLANE. 10760 static SDValue PerformVDUPLANECombine(SDNode *N, 10761 TargetLowering::DAGCombinerInfo &DCI) { 10762 SDValue Op = N->getOperand(0); 10763 10764 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 10765 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 10766 if (CombineVLDDUP(N, DCI)) 10767 return SDValue(N, 0); 10768 10769 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 10770 // redundant. Ignore bit_converts for now; element sizes are checked below. 10771 while (Op.getOpcode() == ISD::BITCAST) 10772 Op = Op.getOperand(0); 10773 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 10774 return SDValue(); 10775 10776 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 10777 unsigned EltSize = Op.getScalarValueSizeInBits(); 10778 // The canonical VMOV for a zero vector uses a 32-bit element size. 10779 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10780 unsigned EltBits; 10781 if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) 10782 EltSize = 8; 10783 EVT VT = N->getValueType(0); 10784 if (EltSize > VT.getScalarSizeInBits()) 10785 return SDValue(); 10786 10787 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 10788 } 10789 10790 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 10791 static SDValue PerformVDUPCombine(SDNode *N, 10792 TargetLowering::DAGCombinerInfo &DCI) { 10793 SelectionDAG &DAG = DCI.DAG; 10794 SDValue Op = N->getOperand(0); 10795 10796 // Match VDUP(LOAD) -> VLD1DUP. 10797 // We match this pattern here rather than waiting for isel because the 10798 // transform is only legal for unindexed loads. 
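  // Sketch of the pattern handled below (illustrative): a VDUP whose only
  // operand is a simple, unindexed scalar load of the vector element type
  // becomes a single VLD1DUP memory node, and the load's chain users are
  // redirected to the new node's chain result.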
10799 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 10800 if (LD && Op.hasOneUse() && LD->isUnindexed() && 10801 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 10802 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 10803 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 10804 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 10805 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 10806 Ops, LD->getMemoryVT(), 10807 LD->getMemOperand()); 10808 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 10809 return VLDDup; 10810 } 10811 10812 return SDValue(); 10813 } 10814 10815 static SDValue PerformLOADCombine(SDNode *N, 10816 TargetLowering::DAGCombinerInfo &DCI) { 10817 EVT VT = N->getValueType(0); 10818 10819 // If this is a legal vector load, try to combine it into a VLD1_UPD. 10820 if (ISD::isNormalLoad(N) && VT.isVector() && 10821 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10822 return CombineBaseUpdate(N, DCI); 10823 10824 return SDValue(); 10825 } 10826 10827 /// PerformSTORECombine - Target-specific dag combine xforms for 10828 /// ISD::STORE. 10829 static SDValue PerformSTORECombine(SDNode *N, 10830 TargetLowering::DAGCombinerInfo &DCI) { 10831 StoreSDNode *St = cast<StoreSDNode>(N); 10832 if (St->isVolatile()) 10833 return SDValue(); 10834 10835 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 10836 // pack all of the elements in one place. Next, store to memory in fewer 10837 // chunks. 10838 SDValue StVal = St->getValue(); 10839 EVT VT = StVal.getValueType(); 10840 if (St->isTruncatingStore() && VT.isVector()) { 10841 SelectionDAG &DAG = DCI.DAG; 10842 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10843 EVT StVT = St->getMemoryVT(); 10844 unsigned NumElems = VT.getVectorNumElements(); 10845 assert(StVT != VT && "Cannot truncate to the same type"); 10846 unsigned FromEltSz = VT.getScalarSizeInBits(); 10847 unsigned ToEltSz = StVT.getScalarSizeInBits(); 10848 10849 // From, To sizes and ElemCount must be pow of two 10850 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); 10851 10852 // We are going to use the original vector elt for storing. 10853 // Accumulated smaller vector elements must be a multiple of the store size. 10854 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); 10855 10856 unsigned SizeRatio = FromEltSz / ToEltSz; 10857 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 10858 10859 // Create a type on which we perform the shuffle. 10860 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 10861 NumElems*SizeRatio); 10862 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 10863 10864 SDLoc DL(St); 10865 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 10866 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 10867 for (unsigned i = 0; i < NumElems; ++i) 10868 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() 10869 ? (i + 1) * SizeRatio - 1 10870 : i * SizeRatio; 10871 10872 // Can't shuffle using an illegal type. 10873 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 10874 10875 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, 10876 DAG.getUNDEF(WideVec.getValueType()), 10877 ShuffleVec); 10878 // At this point all of the data is stored at the bottom of the 10879 // register. We now need to save it to mem. 
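    // Illustrative example: a truncating store of v4i32 to v4i16 is shuffled
    // so the four 16-bit lanes sit in the low half of a v8i16, and is then
    // emitted as a small number of wider integer stores (e.g. two i32 stores,
    // assuming i32 is the widest legal integer store type here) instead of
    // four element-wise truncating stores.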
10880 10881 // Find the largest store unit 10882 MVT StoreType = MVT::i8; 10883 for (MVT Tp : MVT::integer_valuetypes()) { 10884 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 10885 StoreType = Tp; 10886 } 10887 // Didn't find a legal store type. 10888 if (!TLI.isTypeLegal(StoreType)) 10889 return SDValue(); 10890 10891 // Bitcast the original vector into a vector of store-size units 10892 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 10893 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 10894 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 10895 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 10896 SmallVector<SDValue, 8> Chains; 10897 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 10898 TLI.getPointerTy(DAG.getDataLayout())); 10899 SDValue BasePtr = St->getBasePtr(); 10900 10901 // Perform one or more big stores into memory. 10902 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); 10903 for (unsigned I = 0; I < E; I++) { 10904 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 10905 StoreType, ShuffWide, 10906 DAG.getIntPtrConstant(I, DL)); 10907 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, 10908 St->getPointerInfo(), St->getAlignment(), 10909 St->getMemOperand()->getFlags()); 10910 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, 10911 Increment); 10912 Chains.push_back(Ch); 10913 } 10914 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 10915 } 10916 10917 if (!ISD::isNormalStore(St)) 10918 return SDValue(); 10919 10920 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 10921 // ARM stores of arguments in the same cache line. 10922 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 10923 StVal.getNode()->hasOneUse()) { 10924 SelectionDAG &DAG = DCI.DAG; 10925 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 10926 SDLoc DL(St); 10927 SDValue BasePtr = St->getBasePtr(); 10928 SDValue NewST1 = DAG.getStore( 10929 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 10930 BasePtr, St->getPointerInfo(), St->getAlignment(), 10931 St->getMemOperand()->getFlags()); 10932 10933 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 10934 DAG.getConstant(4, DL, MVT::i32)); 10935 return DAG.getStore(NewST1.getValue(0), DL, 10936 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 10937 OffsetPtr, St->getPointerInfo(), 10938 std::min(4U, St->getAlignment() / 2), 10939 St->getMemOperand()->getFlags()); 10940 } 10941 10942 if (StVal.getValueType() == MVT::i64 && 10943 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 10944 10945 // Bitcast an i64 store extracted from a vector to f64. 10946 // Otherwise, the i64 value will be legalized to a pair of i32 values. 10947 SelectionDAG &DAG = DCI.DAG; 10948 SDLoc dl(StVal); 10949 SDValue IntVec = StVal.getOperand(0); 10950 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 10951 IntVec.getValueType().getVectorNumElements()); 10952 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 10953 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 10954 Vec, StVal.getOperand(1)); 10955 dl = SDLoc(N); 10956 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 10957 // Make the DAGCombiner fold the bitcasts. 
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these instructions
    // only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these instructions
    // only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                      HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
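/// For example (illustrative), with 32-bit elements the immediate must lie in
/// [0, 31] for a regular left shift and in [0, 32] for a long left shift.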
11113 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 11114 assert(VT.isVector() && "vector shift count is not a vector type"); 11115 int64_t ElementBits = VT.getScalarSizeInBits(); 11116 if (! getVShiftImm(Op, ElementBits, Cnt)) 11117 return false; 11118 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); 11119 } 11120 11121 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 11122 /// operand of a vector shift right operation. For a shift opcode, the value 11123 /// is positive, but for an intrinsic the value count must be negative. The 11124 /// absolute value must be in the range: 11125 /// 1 <= |Value| <= ElementBits for a right shift; or 11126 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 11127 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 11128 int64_t &Cnt) { 11129 assert(VT.isVector() && "vector shift count is not a vector type"); 11130 int64_t ElementBits = VT.getScalarSizeInBits(); 11131 if (! getVShiftImm(Op, ElementBits, Cnt)) 11132 return false; 11133 if (!isIntrinsic) 11134 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); 11135 if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { 11136 Cnt = -Cnt; 11137 return true; 11138 } 11139 return false; 11140 } 11141 11142 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 11143 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 11144 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 11145 switch (IntNo) { 11146 default: 11147 // Don't do anything for most intrinsics. 11148 break; 11149 11150 // Vector shifts: check for immediate versions and lower them. 11151 // Note: This is done during DAG combining instead of DAG legalizing because 11152 // the build_vectors for 64-bit vector element shift counts are generally 11153 // not legal, and it is hard to see their values after they get legalized to 11154 // loads from a constant pool. 11155 case Intrinsic::arm_neon_vshifts: 11156 case Intrinsic::arm_neon_vshiftu: 11157 case Intrinsic::arm_neon_vrshifts: 11158 case Intrinsic::arm_neon_vrshiftu: 11159 case Intrinsic::arm_neon_vrshiftn: 11160 case Intrinsic::arm_neon_vqshifts: 11161 case Intrinsic::arm_neon_vqshiftu: 11162 case Intrinsic::arm_neon_vqshiftsu: 11163 case Intrinsic::arm_neon_vqshiftns: 11164 case Intrinsic::arm_neon_vqshiftnu: 11165 case Intrinsic::arm_neon_vqshiftnsu: 11166 case Intrinsic::arm_neon_vqrshiftns: 11167 case Intrinsic::arm_neon_vqrshiftnu: 11168 case Intrinsic::arm_neon_vqrshiftnsu: { 11169 EVT VT = N->getOperand(1).getValueType(); 11170 int64_t Cnt; 11171 unsigned VShiftOpc = 0; 11172 11173 switch (IntNo) { 11174 case Intrinsic::arm_neon_vshifts: 11175 case Intrinsic::arm_neon_vshiftu: 11176 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 11177 VShiftOpc = ARMISD::VSHL; 11178 break; 11179 } 11180 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 11181 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
11182 ARMISD::VSHRs : ARMISD::VSHRu); 11183 break; 11184 } 11185 return SDValue(); 11186 11187 case Intrinsic::arm_neon_vrshifts: 11188 case Intrinsic::arm_neon_vrshiftu: 11189 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 11190 break; 11191 return SDValue(); 11192 11193 case Intrinsic::arm_neon_vqshifts: 11194 case Intrinsic::arm_neon_vqshiftu: 11195 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 11196 break; 11197 return SDValue(); 11198 11199 case Intrinsic::arm_neon_vqshiftsu: 11200 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 11201 break; 11202 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 11203 11204 case Intrinsic::arm_neon_vrshiftn: 11205 case Intrinsic::arm_neon_vqshiftns: 11206 case Intrinsic::arm_neon_vqshiftnu: 11207 case Intrinsic::arm_neon_vqshiftnsu: 11208 case Intrinsic::arm_neon_vqrshiftns: 11209 case Intrinsic::arm_neon_vqrshiftnu: 11210 case Intrinsic::arm_neon_vqrshiftnsu: 11211 // Narrowing shifts require an immediate right shift. 11212 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 11213 break; 11214 llvm_unreachable("invalid shift count for narrowing vector shift " 11215 "intrinsic"); 11216 11217 default: 11218 llvm_unreachable("unhandled vector shift"); 11219 } 11220 11221 switch (IntNo) { 11222 case Intrinsic::arm_neon_vshifts: 11223 case Intrinsic::arm_neon_vshiftu: 11224 // Opcode already set above. 11225 break; 11226 case Intrinsic::arm_neon_vrshifts: 11227 VShiftOpc = ARMISD::VRSHRs; break; 11228 case Intrinsic::arm_neon_vrshiftu: 11229 VShiftOpc = ARMISD::VRSHRu; break; 11230 case Intrinsic::arm_neon_vrshiftn: 11231 VShiftOpc = ARMISD::VRSHRN; break; 11232 case Intrinsic::arm_neon_vqshifts: 11233 VShiftOpc = ARMISD::VQSHLs; break; 11234 case Intrinsic::arm_neon_vqshiftu: 11235 VShiftOpc = ARMISD::VQSHLu; break; 11236 case Intrinsic::arm_neon_vqshiftsu: 11237 VShiftOpc = ARMISD::VQSHLsu; break; 11238 case Intrinsic::arm_neon_vqshiftns: 11239 VShiftOpc = ARMISD::VQSHRNs; break; 11240 case Intrinsic::arm_neon_vqshiftnu: 11241 VShiftOpc = ARMISD::VQSHRNu; break; 11242 case Intrinsic::arm_neon_vqshiftnsu: 11243 VShiftOpc = ARMISD::VQSHRNsu; break; 11244 case Intrinsic::arm_neon_vqrshiftns: 11245 VShiftOpc = ARMISD::VQRSHRNs; break; 11246 case Intrinsic::arm_neon_vqrshiftnu: 11247 VShiftOpc = ARMISD::VQRSHRNu; break; 11248 case Intrinsic::arm_neon_vqrshiftnsu: 11249 VShiftOpc = ARMISD::VQRSHRNsu; break; 11250 } 11251 11252 SDLoc dl(N); 11253 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 11254 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 11255 } 11256 11257 case Intrinsic::arm_neon_vshiftins: { 11258 EVT VT = N->getOperand(1).getValueType(); 11259 int64_t Cnt; 11260 unsigned VShiftOpc = 0; 11261 11262 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 11263 VShiftOpc = ARMISD::VSLI; 11264 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 11265 VShiftOpc = ARMISD::VSRI; 11266 else { 11267 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 11268 } 11269 11270 SDLoc dl(N); 11271 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 11272 N->getOperand(1), N->getOperand(2), 11273 DAG.getConstant(Cnt, dl, MVT::i32)); 11274 } 11275 11276 case Intrinsic::arm_neon_vqrshifts: 11277 case Intrinsic::arm_neon_vqrshiftu: 11278 // No immediate versions of these to check for. 11279 break; 11280 } 11281 11282 return SDValue(); 11283 } 11284 11285 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 11286 /// lowers them. 
As with the vector shift intrinsics, this is done during DAG 11287 /// combining instead of DAG legalizing because the build_vectors for 64-bit 11288 /// vector element shift counts are generally not legal, and it is hard to see 11289 /// their values after they get legalized to loads from a constant pool. 11290 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, 11291 const ARMSubtarget *ST) { 11292 EVT VT = N->getValueType(0); 11293 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 11294 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 11295 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 11296 SDValue N1 = N->getOperand(1); 11297 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 11298 SDValue N0 = N->getOperand(0); 11299 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 11300 DAG.MaskedValueIsZero(N0.getOperand(0), 11301 APInt::getHighBitsSet(32, 16))) 11302 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 11303 } 11304 } 11305 11306 // Nothing to be done for scalar shifts. 11307 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11308 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 11309 return SDValue(); 11310 11311 assert(ST->hasNEON() && "unexpected vector shift"); 11312 int64_t Cnt; 11313 11314 switch (N->getOpcode()) { 11315 default: llvm_unreachable("unexpected shift opcode"); 11316 11317 case ISD::SHL: 11318 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 11319 SDLoc dl(N); 11320 return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), 11321 DAG.getConstant(Cnt, dl, MVT::i32)); 11322 } 11323 break; 11324 11325 case ISD::SRA: 11326 case ISD::SRL: 11327 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 11328 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? 11329 ARMISD::VSHRs : ARMISD::VSHRu); 11330 SDLoc dl(N); 11331 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 11332 DAG.getConstant(Cnt, dl, MVT::i32)); 11333 } 11334 } 11335 return SDValue(); 11336 } 11337 11338 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 11339 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 11340 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 11341 const ARMSubtarget *ST) { 11342 SDValue N0 = N->getOperand(0); 11343 11344 // Check for sign- and zero-extensions of vector extract operations of 8- 11345 // and 16-bit vector elements. NEON supports these directly. They are 11346 // handled during DAG combining because type legalization will promote them 11347 // to 32-bit types and it is messy to recognize the operations after that. 
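  // Illustrative example: (i32 (zext (extract_vector_elt v8i8 V, C))) is
  // matched below and rewritten to ARMISD::VGETLANEu, so the zero-extension
  // is folded into the lane move rather than surviving type legalization as
  // a separate extend.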
11348 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11349 SDValue Vec = N0.getOperand(0); 11350 SDValue Lane = N0.getOperand(1); 11351 EVT VT = N->getValueType(0); 11352 EVT EltVT = N0.getValueType(); 11353 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11354 11355 if (VT == MVT::i32 && 11356 (EltVT == MVT::i8 || EltVT == MVT::i16) && 11357 TLI.isTypeLegal(Vec.getValueType()) && 11358 isa<ConstantSDNode>(Lane)) { 11359 11360 unsigned Opc = 0; 11361 switch (N->getOpcode()) { 11362 default: llvm_unreachable("unexpected opcode"); 11363 case ISD::SIGN_EXTEND: 11364 Opc = ARMISD::VGETLANEs; 11365 break; 11366 case ISD::ZERO_EXTEND: 11367 case ISD::ANY_EXTEND: 11368 Opc = ARMISD::VGETLANEu; 11369 break; 11370 } 11371 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 11372 } 11373 } 11374 11375 return SDValue(); 11376 } 11377 11378 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, 11379 APInt &KnownOne) { 11380 if (Op.getOpcode() == ARMISD::BFI) { 11381 // Conservatively, we can recurse down the first operand 11382 // and just mask out all affected bits. 11383 computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); 11384 11385 // The operand to BFI is already a mask suitable for removing the bits it 11386 // sets. 11387 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 11388 const APInt &Mask = CI->getAPIntValue(); 11389 KnownZero &= Mask; 11390 KnownOne &= Mask; 11391 return; 11392 } 11393 if (Op.getOpcode() == ARMISD::CMOV) { 11394 APInt KZ2(KnownZero.getBitWidth(), 0); 11395 APInt KO2(KnownOne.getBitWidth(), 0); 11396 computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); 11397 computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); 11398 11399 KnownZero &= KZ2; 11400 KnownOne &= KO2; 11401 return; 11402 } 11403 return DAG.computeKnownBits(Op, KnownZero, KnownOne); 11404 } 11405 11406 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 11407 // If we have a CMOV, OR and AND combination such as: 11408 // if (x & CN) 11409 // y |= CM; 11410 // 11411 // And: 11412 // * CN is a single bit; 11413 // * All bits covered by CM are known zero in y 11414 // 11415 // Then we can convert this into a sequence of BFI instructions. This will 11416 // always be a win if CM is a single bit, will always be no worse than the 11417 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 11418 // three bits (due to the extra IT instruction). 11419 11420 SDValue Op0 = CMOV->getOperand(0); 11421 SDValue Op1 = CMOV->getOperand(1); 11422 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 11423 auto CC = CCNode->getAPIntValue().getLimitedValue(); 11424 SDValue CmpZ = CMOV->getOperand(4); 11425 11426 // The compare must be against zero. 11427 if (!isNullConstant(CmpZ->getOperand(1))) 11428 return SDValue(); 11429 11430 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 11431 SDValue And = CmpZ->getOperand(0); 11432 if (And->getOpcode() != ISD::AND) 11433 return SDValue(); 11434 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); 11435 if (!AndC || !AndC->getAPIntValue().isPowerOf2()) 11436 return SDValue(); 11437 SDValue X = And->getOperand(0); 11438 11439 if (CC == ARMCC::EQ) { 11440 // We're performing an "equal to zero" compare. Swap the operands so we 11441 // canonicalize on a "not equal to zero" compare. 
11442 std::swap(Op0, Op1); 11443 } else { 11444 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 11445 } 11446 11447 if (Op1->getOpcode() != ISD::OR) 11448 return SDValue(); 11449 11450 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 11451 if (!OrC) 11452 return SDValue(); 11453 SDValue Y = Op1->getOperand(0); 11454 11455 if (Op0 != Y) 11456 return SDValue(); 11457 11458 // Now, is it profitable to continue? 11459 APInt OrCI = OrC->getAPIntValue(); 11460 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 11461 if (OrCI.countPopulation() > Heuristic) 11462 return SDValue(); 11463 11464 // Lastly, can we determine that the bits defined by OrCI 11465 // are zero in Y? 11466 APInt KnownZero, KnownOne; 11467 computeKnownBits(DAG, Y, KnownZero, KnownOne); 11468 if ((OrCI & KnownZero) != OrCI) 11469 return SDValue(); 11470 11471 // OK, we can do the combine. 11472 SDValue V = Y; 11473 SDLoc dl(X); 11474 EVT VT = X.getValueType(); 11475 unsigned BitInX = AndC->getAPIntValue().logBase2(); 11476 11477 if (BitInX != 0) { 11478 // We must shift X first. 11479 X = DAG.getNode(ISD::SRL, dl, VT, X, 11480 DAG.getConstant(BitInX, dl, VT)); 11481 } 11482 11483 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 11484 BitInY < NumActiveBits; ++BitInY) { 11485 if (OrCI[BitInY] == 0) 11486 continue; 11487 APInt Mask(VT.getSizeInBits(), 0); 11488 Mask.setBit(BitInY); 11489 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 11490 // Confusingly, the operand is an *inverted* mask. 11491 DAG.getConstant(~Mask, dl, VT)); 11492 } 11493 11494 return V; 11495 } 11496 11497 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 11498 SDValue 11499 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 11500 SDValue Cmp = N->getOperand(4); 11501 if (Cmp.getOpcode() != ARMISD::CMPZ) 11502 // Only looking at NE cases. 11503 return SDValue(); 11504 11505 EVT VT = N->getValueType(0); 11506 SDLoc dl(N); 11507 SDValue LHS = Cmp.getOperand(0); 11508 SDValue RHS = Cmp.getOperand(1); 11509 SDValue Chain = N->getOperand(0); 11510 SDValue BB = N->getOperand(1); 11511 SDValue ARMcc = N->getOperand(2); 11512 ARMCC::CondCodes CC = 11513 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 11514 11515 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 11516 // -> (brcond Chain BB CC CPSR Cmp) 11517 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 11518 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 11519 LHS->getOperand(0)->hasOneUse()) { 11520 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 11521 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 11522 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 11523 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 11524 if ((LHS00C && LHS00C->getZExtValue() == 0) && 11525 (LHS01C && LHS01C->getZExtValue() == 1) && 11526 (LHS1C && LHS1C->getZExtValue() == 1) && 11527 (RHSC && RHSC->getZExtValue() == 0)) { 11528 return DAG.getNode( 11529 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 11530 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 11531 } 11532 } 11533 11534 return SDValue(); 11535 } 11536 11537 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 
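/// This covers (see the body below): forming BFI sequences via
/// PerformCMOVToBFICombine on v6t2+ targets, simplifying redundant
/// compare-and-conditional-move sequences, folding CMOV-of-CMPZ-of-CMOV
/// patterns, and attaching AssertZext when the known-zero bits of the result
/// allow it.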
11538 SDValue 11539 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 11540 SDValue Cmp = N->getOperand(4); 11541 if (Cmp.getOpcode() != ARMISD::CMPZ) 11542 // Only looking at EQ and NE cases. 11543 return SDValue(); 11544 11545 EVT VT = N->getValueType(0); 11546 SDLoc dl(N); 11547 SDValue LHS = Cmp.getOperand(0); 11548 SDValue RHS = Cmp.getOperand(1); 11549 SDValue FalseVal = N->getOperand(0); 11550 SDValue TrueVal = N->getOperand(1); 11551 SDValue ARMcc = N->getOperand(2); 11552 ARMCC::CondCodes CC = 11553 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 11554 11555 // BFI is only available on V6T2+. 11556 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 11557 SDValue R = PerformCMOVToBFICombine(N, DAG); 11558 if (R) 11559 return R; 11560 } 11561 11562 // Simplify 11563 // mov r1, r0 11564 // cmp r1, x 11565 // mov r0, y 11566 // moveq r0, x 11567 // to 11568 // cmp r0, x 11569 // movne r0, y 11570 // 11571 // mov r1, r0 11572 // cmp r1, x 11573 // mov r0, x 11574 // movne r0, y 11575 // to 11576 // cmp r0, x 11577 // movne r0, y 11578 /// FIXME: Turn this into a target neutral optimization? 11579 SDValue Res; 11580 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 11581 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 11582 N->getOperand(3), Cmp); 11583 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 11584 SDValue ARMcc; 11585 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 11586 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 11587 N->getOperand(3), NewCmp); 11588 } 11589 11590 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 11591 // -> (cmov F T CC CPSR Cmp) 11592 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 11593 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 11594 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 11595 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 11596 if ((LHS0C && LHS0C->getZExtValue() == 0) && 11597 (LHS1C && LHS1C->getZExtValue() == 1) && 11598 (RHSC && RHSC->getZExtValue() == 0)) { 11599 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 11600 LHS->getOperand(2), LHS->getOperand(3), 11601 LHS->getOperand(4)); 11602 } 11603 } 11604 11605 if (Res.getNode()) { 11606 APInt KnownZero, KnownOne; 11607 DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); 11608 // Capture demanded bits information that would be otherwise lost. 
11609 if (KnownZero == 0xfffffffe) 11610 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11611 DAG.getValueType(MVT::i1)); 11612 else if (KnownZero == 0xffffff00) 11613 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11614 DAG.getValueType(MVT::i8)); 11615 else if (KnownZero == 0xffff0000) 11616 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 11617 DAG.getValueType(MVT::i16)); 11618 } 11619 11620 return Res; 11621 } 11622 11623 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 11624 DAGCombinerInfo &DCI) const { 11625 switch (N->getOpcode()) { 11626 default: break; 11627 case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget); 11628 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 11629 case ISD::SUB: return PerformSUBCombine(N, DCI); 11630 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 11631 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 11632 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 11633 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 11634 case ARMISD::BFI: return PerformBFICombine(N, DCI); 11635 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 11636 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 11637 case ISD::STORE: return PerformSTORECombine(N, DCI); 11638 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 11639 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 11640 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 11641 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 11642 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); 11643 case ISD::FP_TO_SINT: 11644 case ISD::FP_TO_UINT: 11645 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 11646 case ISD::FDIV: 11647 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 11648 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 11649 case ISD::SHL: 11650 case ISD::SRA: 11651 case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); 11652 case ISD::SIGN_EXTEND: 11653 case ISD::ZERO_EXTEND: 11654 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 11655 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 11656 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 11657 case ISD::LOAD: return PerformLOADCombine(N, DCI); 11658 case ARMISD::VLD1DUP: 11659 case ARMISD::VLD2DUP: 11660 case ARMISD::VLD3DUP: 11661 case ARMISD::VLD4DUP: 11662 return PerformVLDCombine(N, DCI); 11663 case ARMISD::BUILD_VECTOR: 11664 return PerformARMBUILD_VECTORCombine(N, DCI); 11665 case ISD::INTRINSIC_VOID: 11666 case ISD::INTRINSIC_W_CHAIN: 11667 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11668 case Intrinsic::arm_neon_vld1: 11669 case Intrinsic::arm_neon_vld2: 11670 case Intrinsic::arm_neon_vld3: 11671 case Intrinsic::arm_neon_vld4: 11672 case Intrinsic::arm_neon_vld2lane: 11673 case Intrinsic::arm_neon_vld3lane: 11674 case Intrinsic::arm_neon_vld4lane: 11675 case Intrinsic::arm_neon_vst1: 11676 case Intrinsic::arm_neon_vst2: 11677 case Intrinsic::arm_neon_vst3: 11678 case Intrinsic::arm_neon_vst4: 11679 case Intrinsic::arm_neon_vst2lane: 11680 case Intrinsic::arm_neon_vst3lane: 11681 case Intrinsic::arm_neon_vst4lane: 11682 return PerformVLDCombine(N, DCI); 11683 default: break; 11684 } 11685 break; 11686 } 11687 return SDValue(); 11688 } 11689 11690 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 11691 EVT VT) const { 11692 return (VT 
          == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned,
                                                       unsigned,
                                                       bool *Fast) const {
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32: {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
    return false;
  }
  case MVT::f64:
  case MVT::v2f64: {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
    return false;
  }
  }
}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsMemset, bool ZeroMemset,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  const Function *F = MF.getFunction();

  // See if we can use NEON instructions for this...
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Lowering to i32/i16 if the size permits.
  if (Size >= 4)
    return MVT::i32;
  else if (Size >= 2)
    return MVT::i16;

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what. There can be two uses by the same instruction.
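  // Illustrative example: if the extending load's only user is a vector add,
  // the extension can typically be folded into a widening instruction (e.g. a
  // VADDL-style pattern), so the extending load is reported as undesirable in
  // that case by the code below.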
11801 if (ExtVal->use_empty() || 11802 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 11803 return true; 11804 11805 SDNode *U = *ExtVal->use_begin(); 11806 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 11807 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 11808 return false; 11809 11810 return true; 11811 } 11812 11813 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 11814 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11815 return false; 11816 11817 if (!isTypeLegal(EVT::getEVT(Ty1))) 11818 return false; 11819 11820 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 11821 11822 // Assuming the caller doesn't have a zeroext or signext return parameter, 11823 // truncation all the way down to i1 is valid. 11824 return true; 11825 } 11826 11827 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 11828 const AddrMode &AM, Type *Ty, 11829 unsigned AS) const { 11830 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 11831 if (Subtarget->hasFPAO()) 11832 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 11833 return 0; 11834 } 11835 return -1; 11836 } 11837 11838 11839 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 11840 if (V < 0) 11841 return false; 11842 11843 unsigned Scale = 1; 11844 switch (VT.getSimpleVT().SimpleTy) { 11845 default: return false; 11846 case MVT::i1: 11847 case MVT::i8: 11848 // Scale == 1; 11849 break; 11850 case MVT::i16: 11851 // Scale == 2; 11852 Scale = 2; 11853 break; 11854 case MVT::i32: 11855 // Scale == 4; 11856 Scale = 4; 11857 break; 11858 } 11859 11860 if ((V & (Scale - 1)) != 0) 11861 return false; 11862 V /= Scale; 11863 return V == (V & ((1LL << 5) - 1)); 11864 } 11865 11866 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 11867 const ARMSubtarget *Subtarget) { 11868 bool isNeg = false; 11869 if (V < 0) { 11870 isNeg = true; 11871 V = - V; 11872 } 11873 11874 switch (VT.getSimpleVT().SimpleTy) { 11875 default: return false; 11876 case MVT::i1: 11877 case MVT::i8: 11878 case MVT::i16: 11879 case MVT::i32: 11880 // + imm12 or - imm8 11881 if (isNeg) 11882 return V == (V & ((1LL << 8) - 1)); 11883 return V == (V & ((1LL << 12) - 1)); 11884 case MVT::f32: 11885 case MVT::f64: 11886 // Same as ARM mode. FIXME: NEON? 11887 if (!Subtarget->hasVFP2()) 11888 return false; 11889 if ((V & 3) != 0) 11890 return false; 11891 V >>= 2; 11892 return V == (V & ((1LL << 8) - 1)); 11893 } 11894 } 11895 11896 /// isLegalAddressImmediate - Return true if the integer value can be used 11897 /// as the offset of the target addressing mode for load / store of the 11898 /// given type. 11899 static bool isLegalAddressImmediate(int64_t V, EVT VT, 11900 const ARMSubtarget *Subtarget) { 11901 if (V == 0) 11902 return true; 11903 11904 if (!VT.isSimple()) 11905 return false; 11906 11907 if (Subtarget->isThumb1Only()) 11908 return isLegalT1AddressImmediate(V, VT); 11909 else if (Subtarget->isThumb2()) 11910 return isLegalT2AddressImmediate(V, VT, Subtarget); 11911 11912 // ARM mode. 11913 if (V < 0) 11914 V = - V; 11915 switch (VT.getSimpleVT().SimpleTy) { 11916 default: return false; 11917 case MVT::i1: 11918 case MVT::i8: 11919 case MVT::i32: 11920 // +- imm12 11921 return V == (V & ((1LL << 12) - 1)); 11922 case MVT::i16: 11923 // +- imm8 11924 return V == (V & ((1LL << 8) - 1)); 11925 case MVT::f32: 11926 case MVT::f64: 11927 if (!Subtarget->hasVFP2()) // FIXME: NEON? 
11928       return false;
11929     if ((V & 3) != 0)
11930       return false;
11931     V >>= 2;
11932     return V == (V & ((1LL << 8) - 1));
11933   }
11934 }
11935 
11936 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
11937                                                       EVT VT) const {
11938   int Scale = AM.Scale;
11939   if (Scale < 0)
11940     return false;
11941 
11942   switch (VT.getSimpleVT().SimpleTy) {
11943   default: return false;
11944   case MVT::i1:
11945   case MVT::i8:
11946   case MVT::i16:
11947   case MVT::i32:
11948     if (Scale == 1)
11949       return true;
11950     // r + r << imm
11951     Scale = Scale & ~1;
11952     return Scale == 2 || Scale == 4 || Scale == 8;
11953   case MVT::i64:
11954     // r + r
11955     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11956       return true;
11957     return false;
11958   case MVT::isVoid:
11959     // Note, we allow "void" uses (basically, uses that aren't loads or
11960     // stores), because ARM allows folding a scale into many arithmetic
11961     // operations. This should be made more precise and revisited later.
11962 
11963     // Allow r << imm, but the imm has to be a multiple of two.
11964     if (Scale & 1) return false;
11965     return isPowerOf2_32(Scale);
11966   }
11967 }
11968 
11969 /// isLegalAddressingMode - Return true if the addressing mode represented
11970 /// by AM is legal for this target, for a load/store of the specified type.
11971 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
11972                                               const AddrMode &AM, Type *Ty,
11973                                               unsigned AS) const {
11974   EVT VT = getValueType(DL, Ty, true);
11975   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
11976     return false;
11977 
11978   // Can never fold addr of global into load/store.
11979   if (AM.BaseGV)
11980     return false;
11981 
11982   switch (AM.Scale) {
11983   case 0:  // no scale reg, must be "r+i" or "r", or "i".
11984     break;
11985   case 1:
11986     if (Subtarget->isThumb1Only())
11987       return false;
11988     LLVM_FALLTHROUGH;
11989   default:
11990     // ARM doesn't support any R+R*scale+imm addr modes.
11991     if (AM.BaseOffs)
11992       return false;
11993 
11994     if (!VT.isSimple())
11995       return false;
11996 
11997     if (Subtarget->isThumb2())
11998       return isLegalT2ScaledAddressingMode(AM, VT);
11999 
12000     int Scale = AM.Scale;
12001     switch (VT.getSimpleVT().SimpleTy) {
12002     default: return false;
12003     case MVT::i1:
12004     case MVT::i8:
12005     case MVT::i32:
12006       if (Scale < 0) Scale = -Scale;
12007       if (Scale == 1)
12008         return true;
12009       // r + r << imm
12010       return isPowerOf2_32(Scale & ~1);
12011     case MVT::i16:
12012     case MVT::i64:
12013       // r + r
12014       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
12015         return true;
12016       return false;
12017 
12018     case MVT::isVoid:
12019       // Note, we allow "void" uses (basically, uses that aren't loads or
12020       // stores), because ARM allows folding a scale into many arithmetic
12021       // operations. This should be made more precise and revisited later.
12022 
12023       // Allow r << imm, but the imm has to be a multiple of two.
12024       if (Scale & 1) return false;
12025       return isPowerOf2_32(Scale);
12026     }
12027   }
12028   return true;
12029 }
12030 
12031 /// isLegalICmpImmediate - Return true if the specified immediate is a legal
12032 /// icmp immediate, that is, the target has icmp instructions which can compare
12033 /// a register against the immediate without having to materialize the
12034 /// immediate into a register.
12035 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
12036   // Thumb2 and ARM modes can use cmn for negative immediates.
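  // For example, "cmp r0, #-42" has no encoding, but the equivalent
  // "cmn r0, #42" does, so a negative immediate is legal whenever its
  // absolute value fits the (T2)SOImm encoding.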
12037 if (!Subtarget->isThumb()) 12038 return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; 12039 if (Subtarget->isThumb2()) 12040 return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; 12041 // Thumb1 doesn't have cmn, and only 8-bit immediates. 12042 return Imm >= 0 && Imm <= 255; 12043 } 12044 12045 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 12046 /// *or sub* immediate, that is the target has add or sub instructions which can 12047 /// add a register with the immediate without having to materialize the 12048 /// immediate into a register. 12049 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 12050 // Same encoding for add/sub, just flip the sign. 12051 int64_t AbsImm = std::abs(Imm); 12052 if (!Subtarget->isThumb()) 12053 return ARM_AM::getSOImmVal(AbsImm) != -1; 12054 if (Subtarget->isThumb2()) 12055 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 12056 // Thumb1 only has 8-bit unsigned immediate. 12057 return AbsImm >= 0 && AbsImm <= 255; 12058 } 12059 12060 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 12061 bool isSEXTLoad, SDValue &Base, 12062 SDValue &Offset, bool &isInc, 12063 SelectionDAG &DAG) { 12064 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 12065 return false; 12066 12067 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 12068 // AddressingMode 3 12069 Base = Ptr->getOperand(0); 12070 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 12071 int RHSC = (int)RHS->getZExtValue(); 12072 if (RHSC < 0 && RHSC > -256) { 12073 assert(Ptr->getOpcode() == ISD::ADD); 12074 isInc = false; 12075 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 12076 return true; 12077 } 12078 } 12079 isInc = (Ptr->getOpcode() == ISD::ADD); 12080 Offset = Ptr->getOperand(1); 12081 return true; 12082 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 12083 // AddressingMode 2 12084 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 12085 int RHSC = (int)RHS->getZExtValue(); 12086 if (RHSC < 0 && RHSC > -0x1000) { 12087 assert(Ptr->getOpcode() == ISD::ADD); 12088 isInc = false; 12089 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 12090 Base = Ptr->getOperand(0); 12091 return true; 12092 } 12093 } 12094 12095 if (Ptr->getOpcode() == ISD::ADD) { 12096 isInc = true; 12097 ARM_AM::ShiftOpc ShOpcVal= 12098 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 12099 if (ShOpcVal != ARM_AM::no_shift) { 12100 Base = Ptr->getOperand(1); 12101 Offset = Ptr->getOperand(0); 12102 } else { 12103 Base = Ptr->getOperand(0); 12104 Offset = Ptr->getOperand(1); 12105 } 12106 return true; 12107 } 12108 12109 isInc = (Ptr->getOpcode() == ISD::ADD); 12110 Base = Ptr->getOperand(0); 12111 Offset = Ptr->getOperand(1); 12112 return true; 12113 } 12114 12115 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 12116 return false; 12117 } 12118 12119 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 12120 bool isSEXTLoad, SDValue &Base, 12121 SDValue &Offset, bool &isInc, 12122 SelectionDAG &DAG) { 12123 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 12124 return false; 12125 12126 Base = Ptr->getOperand(0); 12127 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 12128 int RHSC = (int)RHS->getZExtValue(); 12129 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 
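    // A SUB with a constant operand has normally been canonicalized into an
    // ADD of the negated constant by this point, so a negative offset should
    // only be reachable through an ADD node; the assert below relies on that.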
12130 assert(Ptr->getOpcode() == ISD::ADD); 12131 isInc = false; 12132 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 12133 return true; 12134 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 12135 isInc = Ptr->getOpcode() == ISD::ADD; 12136 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 12137 return true; 12138 } 12139 } 12140 12141 return false; 12142 } 12143 12144 /// getPreIndexedAddressParts - returns true by value, base pointer and 12145 /// offset pointer and addressing mode by reference if the node's address 12146 /// can be legally represented as pre-indexed load / store address. 12147 bool 12148 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 12149 SDValue &Offset, 12150 ISD::MemIndexedMode &AM, 12151 SelectionDAG &DAG) const { 12152 if (Subtarget->isThumb1Only()) 12153 return false; 12154 12155 EVT VT; 12156 SDValue Ptr; 12157 bool isSEXTLoad = false; 12158 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 12159 Ptr = LD->getBasePtr(); 12160 VT = LD->getMemoryVT(); 12161 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 12162 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 12163 Ptr = ST->getBasePtr(); 12164 VT = ST->getMemoryVT(); 12165 } else 12166 return false; 12167 12168 bool isInc; 12169 bool isLegal = false; 12170 if (Subtarget->isThumb2()) 12171 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 12172 Offset, isInc, DAG); 12173 else 12174 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 12175 Offset, isInc, DAG); 12176 if (!isLegal) 12177 return false; 12178 12179 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 12180 return true; 12181 } 12182 12183 /// getPostIndexedAddressParts - returns true by value, base pointer and 12184 /// offset pointer and addressing mode by reference if this node can be 12185 /// combined with a load / store to form a post-indexed load / store. 12186 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 12187 SDValue &Base, 12188 SDValue &Offset, 12189 ISD::MemIndexedMode &AM, 12190 SelectionDAG &DAG) const { 12191 EVT VT; 12192 SDValue Ptr; 12193 bool isSEXTLoad = false, isNonExt; 12194 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 12195 VT = LD->getMemoryVT(); 12196 Ptr = LD->getBasePtr(); 12197 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 12198 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 12199 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 12200 VT = ST->getMemoryVT(); 12201 Ptr = ST->getBasePtr(); 12202 isNonExt = !ST->isTruncatingStore(); 12203 } else 12204 return false; 12205 12206 if (Subtarget->isThumb1Only()) { 12207 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 12208 // must be non-extending/truncating, i32, with an offset of 4. 
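    // For example, an i32 post-increment load with offset 4 can be selected
    // as an "ldmia rN!, {rM}"-style instruction that writes the updated base
    // back into rN.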
12209 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 12210 if (Op->getOpcode() != ISD::ADD || !isNonExt) 12211 return false; 12212 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12213 if (!RHS || RHS->getZExtValue() != 4) 12214 return false; 12215 12216 Offset = Op->getOperand(1); 12217 Base = Op->getOperand(0); 12218 AM = ISD::POST_INC; 12219 return true; 12220 } 12221 12222 bool isInc; 12223 bool isLegal = false; 12224 if (Subtarget->isThumb2()) 12225 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 12226 isInc, DAG); 12227 else 12228 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 12229 isInc, DAG); 12230 if (!isLegal) 12231 return false; 12232 12233 if (Ptr != Base) { 12234 // Swap base ptr and offset to catch more post-index load / store when 12235 // it's legal. In Thumb2 mode, offset must be an immediate. 12236 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 12237 !Subtarget->isThumb2()) 12238 std::swap(Base, Offset); 12239 12240 // Post-indexed load / store update the base pointer. 12241 if (Ptr != Base) 12242 return false; 12243 } 12244 12245 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 12246 return true; 12247 } 12248 12249 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 12250 APInt &KnownZero, 12251 APInt &KnownOne, 12252 const SelectionDAG &DAG, 12253 unsigned Depth) const { 12254 unsigned BitWidth = KnownOne.getBitWidth(); 12255 KnownZero = KnownOne = APInt(BitWidth, 0); 12256 switch (Op.getOpcode()) { 12257 default: break; 12258 case ARMISD::ADDC: 12259 case ARMISD::ADDE: 12260 case ARMISD::SUBC: 12261 case ARMISD::SUBE: 12262 // These nodes' second result is a boolean 12263 if (Op.getResNo() == 0) 12264 break; 12265 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 12266 break; 12267 case ARMISD::CMOV: { 12268 // Bits are known zero/one if known on the LHS and RHS. 12269 DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); 12270 if (KnownZero == 0 && KnownOne == 0) return; 12271 12272 APInt KnownZeroRHS, KnownOneRHS; 12273 DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); 12274 KnownZero &= KnownZeroRHS; 12275 KnownOne &= KnownOneRHS; 12276 return; 12277 } 12278 case ISD::INTRINSIC_W_CHAIN: { 12279 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 12280 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 12281 switch (IntID) { 12282 default: return; 12283 case Intrinsic::arm_ldaex: 12284 case Intrinsic::arm_ldrex: { 12285 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 12286 unsigned MemBits = VT.getScalarSizeInBits(); 12287 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 12288 return; 12289 } 12290 } 12291 } 12292 } 12293 } 12294 12295 //===----------------------------------------------------------------------===// 12296 // ARM Inline Assembly Support 12297 //===----------------------------------------------------------------------===// 12298 12299 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 12300 // Looking for "rev" which is V6+. 
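  // The only pattern handled below is inline asm roughly of the form
  //   asm("rev $0, $1" : "=l"(out) : "l"(in))
  // on an i32 value, which is rewritten to the generic bswap intrinsic.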
12301 if (!Subtarget->hasV6Ops()) 12302 return false; 12303 12304 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 12305 std::string AsmStr = IA->getAsmString(); 12306 SmallVector<StringRef, 4> AsmPieces; 12307 SplitString(AsmStr, AsmPieces, ";\n"); 12308 12309 switch (AsmPieces.size()) { 12310 default: return false; 12311 case 1: 12312 AsmStr = AsmPieces[0]; 12313 AsmPieces.clear(); 12314 SplitString(AsmStr, AsmPieces, " \t,"); 12315 12316 // rev $0, $1 12317 if (AsmPieces.size() == 3 && 12318 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 12319 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 12320 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12321 if (Ty && Ty->getBitWidth() == 32) 12322 return IntrinsicLowering::LowerToByteSwap(CI); 12323 } 12324 break; 12325 } 12326 12327 return false; 12328 } 12329 12330 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 12331 // At this point, we have to lower this constraint to something else, so we 12332 // lower it to an "r" or "w". However, by doing this we will force the result 12333 // to be in register, while the X constraint is much more permissive. 12334 // 12335 // Although we are correct (we are free to emit anything, without 12336 // constraints), we might break use cases that would expect us to be more 12337 // efficient and emit something else. 12338 if (!Subtarget->hasVFP2()) 12339 return "r"; 12340 if (ConstraintVT.isFloatingPoint()) 12341 return "w"; 12342 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 12343 (ConstraintVT.getSizeInBits() == 64 || 12344 ConstraintVT.getSizeInBits() == 128)) 12345 return "w"; 12346 12347 return "r"; 12348 } 12349 12350 /// getConstraintType - Given a constraint letter, return the type of 12351 /// constraint it is for this target. 12352 ARMTargetLowering::ConstraintType 12353 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 12354 if (Constraint.size() == 1) { 12355 switch (Constraint[0]) { 12356 default: break; 12357 case 'l': return C_RegisterClass; 12358 case 'w': return C_RegisterClass; 12359 case 'h': return C_RegisterClass; 12360 case 'x': return C_RegisterClass; 12361 case 't': return C_RegisterClass; 12362 case 'j': return C_Other; // Constant for movw. 12363 // An address with a single base register. Due to the way we 12364 // currently handle addresses it is the same as an 'r' memory constraint. 12365 case 'Q': return C_Memory; 12366 } 12367 } else if (Constraint.size() == 2) { 12368 switch (Constraint[0]) { 12369 default: break; 12370 // All 'U+' constraints are addresses. 12371 case 'U': return C_Memory; 12372 } 12373 } 12374 return TargetLowering::getConstraintType(Constraint); 12375 } 12376 12377 /// Examine constraint type and operand type and determine a weight value. 12378 /// This object must already have been set up with the operand type 12379 /// and the current alternative constraint selected. 12380 TargetLowering::ConstraintWeight 12381 ARMTargetLowering::getSingleConstraintMatchWeight( 12382 AsmOperandInfo &info, const char *constraint) const { 12383 ConstraintWeight weight = CW_Invalid; 12384 Value *CallOperandVal = info.CallOperandVal; 12385 // If we don't have a value, we can't do a match, 12386 // but allow it at the lowest weight. 12387 if (!CallOperandVal) 12388 return CW_Default; 12389 Type *type = CallOperandVal->getType(); 12390 // Look at the constraint type. 
12391 switch (*constraint) { 12392 default: 12393 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 12394 break; 12395 case 'l': 12396 if (type->isIntegerTy()) { 12397 if (Subtarget->isThumb()) 12398 weight = CW_SpecificReg; 12399 else 12400 weight = CW_Register; 12401 } 12402 break; 12403 case 'w': 12404 if (type->isFloatingPointTy()) 12405 weight = CW_Register; 12406 break; 12407 } 12408 return weight; 12409 } 12410 12411 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair; 12412 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 12413 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 12414 if (Constraint.size() == 1) { 12415 // GCC ARM Constraint Letters 12416 switch (Constraint[0]) { 12417 case 'l': // Low regs or general regs. 12418 if (Subtarget->isThumb()) 12419 return RCPair(0U, &ARM::tGPRRegClass); 12420 return RCPair(0U, &ARM::GPRRegClass); 12421 case 'h': // High regs or no regs. 12422 if (Subtarget->isThumb()) 12423 return RCPair(0U, &ARM::hGPRRegClass); 12424 break; 12425 case 'r': 12426 if (Subtarget->isThumb1Only()) 12427 return RCPair(0U, &ARM::tGPRRegClass); 12428 return RCPair(0U, &ARM::GPRRegClass); 12429 case 'w': 12430 if (VT == MVT::Other) 12431 break; 12432 if (VT == MVT::f32) 12433 return RCPair(0U, &ARM::SPRRegClass); 12434 if (VT.getSizeInBits() == 64) 12435 return RCPair(0U, &ARM::DPRRegClass); 12436 if (VT.getSizeInBits() == 128) 12437 return RCPair(0U, &ARM::QPRRegClass); 12438 break; 12439 case 'x': 12440 if (VT == MVT::Other) 12441 break; 12442 if (VT == MVT::f32) 12443 return RCPair(0U, &ARM::SPR_8RegClass); 12444 if (VT.getSizeInBits() == 64) 12445 return RCPair(0U, &ARM::DPR_8RegClass); 12446 if (VT.getSizeInBits() == 128) 12447 return RCPair(0U, &ARM::QPR_8RegClass); 12448 break; 12449 case 't': 12450 if (VT == MVT::f32) 12451 return RCPair(0U, &ARM::SPRRegClass); 12452 break; 12453 } 12454 } 12455 if (StringRef("{cc}").equals_lower(Constraint)) 12456 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 12457 12458 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 12459 } 12460 12461 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 12462 /// vector. If it is invalid, don't add anything to Ops. 12463 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 12464 std::string &Constraint, 12465 std::vector<SDValue>&Ops, 12466 SelectionDAG &DAG) const { 12467 SDValue Result; 12468 12469 // Currently only support length 1 constraints. 12470 if (Constraint.length() != 1) return; 12471 12472 char ConstraintLetter = Constraint[0]; 12473 switch (ConstraintLetter) { 12474 default: break; 12475 case 'j': 12476 case 'I': case 'J': case 'K': case 'L': 12477 case 'M': case 'N': case 'O': 12478 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 12479 if (!C) 12480 return; 12481 12482 int64_t CVal64 = C->getSExtValue(); 12483 int CVal = (int) CVal64; 12484 // None of these constraints allow values larger than 32 bits. Check 12485 // that the value fits in an int. 12486 if (CVal != CVal64) 12487 return; 12488 12489 switch (ConstraintLetter) { 12490 case 'j': 12491 // Constant suitable for movw, must be between 0 and 12492 // 65535. 12493 if (Subtarget->hasV6T2Ops()) 12494 if (CVal >= 0 && CVal <= 65535) 12495 break; 12496 return; 12497 case 'I': 12498 if (Subtarget->isThumb1Only()) { 12499 // This must be a constant between 0 and 255, for ADD 12500 // immediates. 
12501 if (CVal >= 0 && CVal <= 255) 12502 break; 12503 } else if (Subtarget->isThumb2()) { 12504 // A constant that can be used as an immediate value in a 12505 // data-processing instruction. 12506 if (ARM_AM::getT2SOImmVal(CVal) != -1) 12507 break; 12508 } else { 12509 // A constant that can be used as an immediate value in a 12510 // data-processing instruction. 12511 if (ARM_AM::getSOImmVal(CVal) != -1) 12512 break; 12513 } 12514 return; 12515 12516 case 'J': 12517 if (Subtarget->isThumb1Only()) { 12518 // This must be a constant between -255 and -1, for negated ADD 12519 // immediates. This can be used in GCC with an "n" modifier that 12520 // prints the negated value, for use with SUB instructions. It is 12521 // not useful otherwise but is implemented for compatibility. 12522 if (CVal >= -255 && CVal <= -1) 12523 break; 12524 } else { 12525 // This must be a constant between -4095 and 4095. It is not clear 12526 // what this constraint is intended for. Implemented for 12527 // compatibility with GCC. 12528 if (CVal >= -4095 && CVal <= 4095) 12529 break; 12530 } 12531 return; 12532 12533 case 'K': 12534 if (Subtarget->isThumb1Only()) { 12535 // A 32-bit value where only one byte has a nonzero value. Exclude 12536 // zero to match GCC. This constraint is used by GCC internally for 12537 // constants that can be loaded with a move/shift combination. 12538 // It is not useful otherwise but is implemented for compatibility. 12539 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 12540 break; 12541 } else if (Subtarget->isThumb2()) { 12542 // A constant whose bitwise inverse can be used as an immediate 12543 // value in a data-processing instruction. This can be used in GCC 12544 // with a "B" modifier that prints the inverted value, for use with 12545 // BIC and MVN instructions. It is not useful otherwise but is 12546 // implemented for compatibility. 12547 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 12548 break; 12549 } else { 12550 // A constant whose bitwise inverse can be used as an immediate 12551 // value in a data-processing instruction. This can be used in GCC 12552 // with a "B" modifier that prints the inverted value, for use with 12553 // BIC and MVN instructions. It is not useful otherwise but is 12554 // implemented for compatibility. 12555 if (ARM_AM::getSOImmVal(~CVal) != -1) 12556 break; 12557 } 12558 return; 12559 12560 case 'L': 12561 if (Subtarget->isThumb1Only()) { 12562 // This must be a constant between -7 and 7, 12563 // for 3-operand ADD/SUB immediate instructions. 12564 if (CVal >= -7 && CVal < 7) 12565 break; 12566 } else if (Subtarget->isThumb2()) { 12567 // A constant whose negation can be used as an immediate value in a 12568 // data-processing instruction. This can be used in GCC with an "n" 12569 // modifier that prints the negated value, for use with SUB 12570 // instructions. It is not useful otherwise but is implemented for 12571 // compatibility. 12572 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 12573 break; 12574 } else { 12575 // A constant whose negation can be used as an immediate value in a 12576 // data-processing instruction. This can be used in GCC with an "n" 12577 // modifier that prints the negated value, for use with SUB 12578 // instructions. It is not useful otherwise but is implemented for 12579 // compatibility. 12580 if (ARM_AM::getSOImmVal(-CVal) != -1) 12581 break; 12582 } 12583 return; 12584 12585 case 'M': 12586 if (Subtarget->isThumb1Only()) { 12587 // This must be a multiple of 4 between 0 and 1020, for 12588 // ADD sp + immediate. 
12589 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 12590 break; 12591 } else { 12592 // A power of two or a constant between 0 and 32. This is used in 12593 // GCC for the shift amount on shifted register operands, but it is 12594 // useful in general for any shift amounts. 12595 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 12596 break; 12597 } 12598 return; 12599 12600 case 'N': 12601 if (Subtarget->isThumb()) { // FIXME thumb2 12602 // This must be a constant between 0 and 31, for shift amounts. 12603 if (CVal >= 0 && CVal <= 31) 12604 break; 12605 } 12606 return; 12607 12608 case 'O': 12609 if (Subtarget->isThumb()) { // FIXME thumb2 12610 // This must be a multiple of 4 between -508 and 508, for 12611 // ADD/SUB sp = sp + immediate. 12612 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 12613 break; 12614 } 12615 return; 12616 } 12617 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 12618 break; 12619 } 12620 12621 if (Result.getNode()) { 12622 Ops.push_back(Result); 12623 return; 12624 } 12625 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12626 } 12627 12628 static RTLIB::Libcall getDivRemLibcall( 12629 const SDNode *N, MVT::SimpleValueType SVT) { 12630 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 12631 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 12632 "Unhandled Opcode in getDivRemLibcall"); 12633 bool isSigned = N->getOpcode() == ISD::SDIVREM || 12634 N->getOpcode() == ISD::SREM; 12635 RTLIB::Libcall LC; 12636 switch (SVT) { 12637 default: llvm_unreachable("Unexpected request for libcall!"); 12638 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 12639 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 12640 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 12641 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 12642 } 12643 return LC; 12644 } 12645 12646 static TargetLowering::ArgListTy getDivRemArgList( 12647 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 12648 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 12649 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 12650 "Unhandled Opcode in getDivRemArgList"); 12651 bool isSigned = N->getOpcode() == ISD::SDIVREM || 12652 N->getOpcode() == ISD::SREM; 12653 TargetLowering::ArgListTy Args; 12654 TargetLowering::ArgListEntry Entry; 12655 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 12656 EVT ArgVT = N->getOperand(i).getValueType(); 12657 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 12658 Entry.Node = N->getOperand(i); 12659 Entry.Ty = ArgTy; 12660 Entry.isSExt = isSigned; 12661 Entry.isZExt = !isSigned; 12662 Args.push_back(Entry); 12663 } 12664 if (Subtarget->isTargetWindows() && Args.size() >= 2) 12665 std::swap(Args[0], Args[1]); 12666 return Args; 12667 } 12668 12669 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 12670 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 12671 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 12672 Subtarget->isTargetWindows()) && 12673 "Register-based DivRem lowering only"); 12674 unsigned Opcode = Op->getOpcode(); 12675 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 12676 "Invalid opcode for Div/Rem lowering"); 12677 bool isSigned = (Opcode == ISD::SDIVREM); 12678 EVT VT = Op->getValueType(0); 12679 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 12680 SDLoc dl(Op); 12681 12682 // If the target has hardware divide, use divide + multiply + subtract: 12683 // div = a / b 12684 // rem = a - b * div 12685 // return {div, rem} 12686 // This should be lowered into UDIV/SDIV + MLS later on. 12687 if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() && 12688 Op->getSimpleValueType(0) == MVT::i32) { 12689 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 12690 const SDValue Dividend = Op->getOperand(0); 12691 const SDValue Divisor = Op->getOperand(1); 12692 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 12693 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 12694 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 12695 12696 SDValue Values[2] = {Div, Rem}; 12697 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 12698 } 12699 12700 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 12701 VT.getSimpleVT().SimpleTy); 12702 SDValue InChain = DAG.getEntryNode(); 12703 12704 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 12705 DAG.getContext(), 12706 Subtarget); 12707 12708 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 12709 getPointerTy(DAG.getDataLayout())); 12710 12711 Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); 12712 12713 if (Subtarget->isTargetWindows()) 12714 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 12715 12716 TargetLowering::CallLoweringInfo CLI(DAG); 12717 CLI.setDebugLoc(dl).setChain(InChain) 12718 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 12719 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 12720 12721 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 12722 return CallInfo.first; 12723 } 12724 12725 // Lowers REM using divmod helpers 12726 // see RTABI section 4.2/4.3 12727 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 12728 // Build return types (div and rem) 12729 std::vector<Type*> RetTyParams; 12730 Type *RetTyElement; 12731 12732 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 12733 default: llvm_unreachable("Unexpected request for libcall!"); 12734 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 12735 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 12736 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 12737 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 12738 } 12739 12740 RetTyParams.push_back(RetTyElement); 12741 RetTyParams.push_back(RetTyElement); 12742 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 12743 Type *RetTy = StructType::get(*DAG.getContext(), ret); 12744 12745 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
12746 SimpleTy); 12747 SDValue InChain = DAG.getEntryNode(); 12748 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 12749 Subtarget); 12750 bool isSigned = N->getOpcode() == ISD::SREM; 12751 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 12752 getPointerTy(DAG.getDataLayout())); 12753 12754 if (Subtarget->isTargetWindows()) 12755 InChain = WinDBZCheckDenominator(DAG, N, InChain); 12756 12757 // Lower call 12758 CallLoweringInfo CLI(DAG); 12759 CLI.setChain(InChain) 12760 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 12761 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 12762 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 12763 12764 // Return second (rem) result operand (first contains div) 12765 SDNode *ResNode = CallResult.first.getNode(); 12766 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 12767 return ResNode->getOperand(1); 12768 } 12769 12770 SDValue 12771 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 12772 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 12773 SDLoc DL(Op); 12774 12775 // Get the inputs. 12776 SDValue Chain = Op.getOperand(0); 12777 SDValue Size = Op.getOperand(1); 12778 12779 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 12780 DAG.getConstant(2, DL, MVT::i32)); 12781 12782 SDValue Flag; 12783 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 12784 Flag = Chain.getValue(1); 12785 12786 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 12787 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 12788 12789 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 12790 Chain = NewSP.getValue(1); 12791 12792 SDValue Ops[2] = { NewSP, Chain }; 12793 return DAG.getMergeValues(Ops, DL); 12794 } 12795 12796 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 12797 assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && 12798 "Unexpected type for custom-lowering FP_EXTEND"); 12799 12800 RTLIB::Libcall LC; 12801 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 12802 12803 SDValue SrcVal = Op.getOperand(0); 12804 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 12805 SDLoc(Op)).first; 12806 } 12807 12808 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 12809 assert(Op.getOperand(0).getValueType() == MVT::f64 && 12810 Subtarget->isFPOnlySP() && 12811 "Unexpected type for custom-lowering FP_ROUND"); 12812 12813 RTLIB::Libcall LC; 12814 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 12815 12816 SDValue SrcVal = Op.getOperand(0); 12817 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 12818 SDLoc(Op)).first; 12819 } 12820 12821 bool 12822 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 12823 // The ARM target isn't yet aware of offsets. 12824 return false; 12825 } 12826 12827 bool ARM::isBitFieldInvertedMask(unsigned v) { 12828 if (v == 0xffffffff) 12829 return false; 12830 12831 // there can be 1's on either or both "outsides", all the "inside" 12832 // bits must be 0's 12833 return isShiftedMask_32(~v); 12834 } 12835 12836 /// isFPImmLegal - Returns true if the target can instruction select the 12837 /// specified FP immediate natively. If false, the legalizer will 12838 /// materialize the FP immediate as a load from a constant pool. 
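/// (With VFPv3 this corresponds to the 8-bit VMOV immediate encoding; for
/// example, values such as 1.0, 0.5 or 31.0 are encodable, while 0.1 is not.)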
12839 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 12840 if (!Subtarget->hasVFP3()) 12841 return false; 12842 if (VT == MVT::f32) 12843 return ARM_AM::getFP32Imm(Imm) != -1; 12844 if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) 12845 return ARM_AM::getFP64Imm(Imm) != -1; 12846 return false; 12847 } 12848 12849 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 12850 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 12851 /// specified in the intrinsic calls. 12852 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 12853 const CallInst &I, 12854 unsigned Intrinsic) const { 12855 switch (Intrinsic) { 12856 case Intrinsic::arm_neon_vld1: 12857 case Intrinsic::arm_neon_vld2: 12858 case Intrinsic::arm_neon_vld3: 12859 case Intrinsic::arm_neon_vld4: 12860 case Intrinsic::arm_neon_vld2lane: 12861 case Intrinsic::arm_neon_vld3lane: 12862 case Intrinsic::arm_neon_vld4lane: { 12863 Info.opc = ISD::INTRINSIC_W_CHAIN; 12864 // Conservatively set memVT to the entire set of vectors loaded. 12865 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12866 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 12867 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 12868 Info.ptrVal = I.getArgOperand(0); 12869 Info.offset = 0; 12870 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 12871 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 12872 Info.vol = false; // volatile loads with NEON intrinsics not supported 12873 Info.readMem = true; 12874 Info.writeMem = false; 12875 return true; 12876 } 12877 case Intrinsic::arm_neon_vst1: 12878 case Intrinsic::arm_neon_vst2: 12879 case Intrinsic::arm_neon_vst3: 12880 case Intrinsic::arm_neon_vst4: 12881 case Intrinsic::arm_neon_vst2lane: 12882 case Intrinsic::arm_neon_vst3lane: 12883 case Intrinsic::arm_neon_vst4lane: { 12884 Info.opc = ISD::INTRINSIC_VOID; 12885 // Conservatively set memVT to the entire set of vectors stored. 
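    // For example, a vst3 of three <4 x i32> operands is recorded here as a
    // v6i64 memory access (384 bits in total).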
12886 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12887 unsigned NumElts = 0; 12888 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 12889 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 12890 if (!ArgTy->isVectorTy()) 12891 break; 12892 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 12893 } 12894 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 12895 Info.ptrVal = I.getArgOperand(0); 12896 Info.offset = 0; 12897 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 12898 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 12899 Info.vol = false; // volatile stores with NEON intrinsics not supported 12900 Info.readMem = false; 12901 Info.writeMem = true; 12902 return true; 12903 } 12904 case Intrinsic::arm_ldaex: 12905 case Intrinsic::arm_ldrex: { 12906 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12907 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 12908 Info.opc = ISD::INTRINSIC_W_CHAIN; 12909 Info.memVT = MVT::getVT(PtrTy->getElementType()); 12910 Info.ptrVal = I.getArgOperand(0); 12911 Info.offset = 0; 12912 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 12913 Info.vol = true; 12914 Info.readMem = true; 12915 Info.writeMem = false; 12916 return true; 12917 } 12918 case Intrinsic::arm_stlex: 12919 case Intrinsic::arm_strex: { 12920 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 12921 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 12922 Info.opc = ISD::INTRINSIC_W_CHAIN; 12923 Info.memVT = MVT::getVT(PtrTy->getElementType()); 12924 Info.ptrVal = I.getArgOperand(1); 12925 Info.offset = 0; 12926 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 12927 Info.vol = true; 12928 Info.readMem = false; 12929 Info.writeMem = true; 12930 return true; 12931 } 12932 case Intrinsic::arm_stlexd: 12933 case Intrinsic::arm_strexd: { 12934 Info.opc = ISD::INTRINSIC_W_CHAIN; 12935 Info.memVT = MVT::i64; 12936 Info.ptrVal = I.getArgOperand(2); 12937 Info.offset = 0; 12938 Info.align = 8; 12939 Info.vol = true; 12940 Info.readMem = false; 12941 Info.writeMem = true; 12942 return true; 12943 } 12944 case Intrinsic::arm_ldaexd: 12945 case Intrinsic::arm_ldrexd: { 12946 Info.opc = ISD::INTRINSIC_W_CHAIN; 12947 Info.memVT = MVT::i64; 12948 Info.ptrVal = I.getArgOperand(0); 12949 Info.offset = 0; 12950 Info.align = 8; 12951 Info.vol = true; 12952 Info.readMem = true; 12953 Info.writeMem = false; 12954 return true; 12955 } 12956 default: 12957 break; 12958 } 12959 12960 return false; 12961 } 12962 12963 /// \brief Returns true if it is beneficial to convert a load of a constant 12964 /// to just the constant itself. 12965 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 12966 Type *Ty) const { 12967 assert(Ty->isIntegerTy()); 12968 12969 unsigned Bits = Ty->getPrimitiveSizeInBits(); 12970 if (Bits == 0 || Bits > 32) 12971 return false; 12972 return true; 12973 } 12974 12975 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, 12976 unsigned Index) const { 12977 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 12978 return false; 12979 12980 return (Index == 0 || Index == ResVT.getVectorNumElements()); 12981 } 12982 12983 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 12984 ARM_MB::MemBOpt Domain) const { 12985 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 12986 12987 // First, if the target has no DMB, see what fallback we can use. 
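  // (The MCR fallback below encodes "mcr p15, 0, rX, c7, c10, 5", the ARMv6
  // CP15 data memory barrier operation.)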
12988 if (!Subtarget->hasDataBarrier()) { 12989 // Some ARMv6 cpus can support data barriers with an mcr instruction. 12990 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 12991 // here. 12992 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 12993 Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 12994 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 12995 Builder.getInt32(0), Builder.getInt32(7), 12996 Builder.getInt32(10), Builder.getInt32(5)}; 12997 return Builder.CreateCall(MCR, args); 12998 } else { 12999 // Instead of using barriers, atomic accesses on these subtargets use 13000 // libcalls. 13001 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 13002 } 13003 } else { 13004 Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 13005 // Only a full system barrier exists in the M-class architectures. 13006 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 13007 Constant *CDomain = Builder.getInt32(Domain); 13008 return Builder.CreateCall(DMB, CDomain); 13009 } 13010 } 13011 13012 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 13013 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 13014 AtomicOrdering Ord, bool IsStore, 13015 bool IsLoad) const { 13016 switch (Ord) { 13017 case AtomicOrdering::NotAtomic: 13018 case AtomicOrdering::Unordered: 13019 llvm_unreachable("Invalid fence: unordered/non-atomic"); 13020 case AtomicOrdering::Monotonic: 13021 case AtomicOrdering::Acquire: 13022 return nullptr; // Nothing to do 13023 case AtomicOrdering::SequentiallyConsistent: 13024 if (!IsStore) 13025 return nullptr; // Nothing to do 13026 /*FALLTHROUGH*/ 13027 case AtomicOrdering::Release: 13028 case AtomicOrdering::AcquireRelease: 13029 if (Subtarget->preferISHSTBarriers()) 13030 return makeDMB(Builder, ARM_MB::ISHST); 13031 // FIXME: add a comment with a link to documentation justifying this. 13032 else 13033 return makeDMB(Builder, ARM_MB::ISH); 13034 } 13035 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 13036 } 13037 13038 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 13039 AtomicOrdering Ord, bool IsStore, 13040 bool IsLoad) const { 13041 switch (Ord) { 13042 case AtomicOrdering::NotAtomic: 13043 case AtomicOrdering::Unordered: 13044 llvm_unreachable("Invalid fence: unordered/not-atomic"); 13045 case AtomicOrdering::Monotonic: 13046 case AtomicOrdering::Release: 13047 return nullptr; // Nothing to do 13048 case AtomicOrdering::Acquire: 13049 case AtomicOrdering::AcquireRelease: 13050 case AtomicOrdering::SequentiallyConsistent: 13051 return makeDMB(Builder, ARM_MB::ISH); 13052 } 13053 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 13054 } 13055 13056 // Loads and stores less than 64-bits are already atomic; ones above that 13057 // are doomed anyway, so defer to the default libcall and blame the OS when 13058 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 13059 // anything for those. 13060 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 13061 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 13062 return (Size == 64) && !Subtarget->isMClass(); 13063 } 13064 13065 // Loads and stores less than 64-bits are already atomic; ones above that 13066 // are doomed anyway, so defer to the default libcall and blame the OS when 13067 // things go wrong. 
Cortex M doesn't have ldrexd/strexd though, so don't emit
13068 // anything for those.
13069 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
13070 // guarantee, see DDI0406C ARM architecture reference manual,
13071 // sections A8.8.72-74 LDRD)
13072 TargetLowering::AtomicExpansionKind
13073 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
13074   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
13075   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
13076                                                   : AtomicExpansionKind::None;
13077 }
13078 
13079 // For the real atomic operations, we have ldrex/strex up to 32 bits,
13080 // and up to 64 bits on the non-M profiles.
13081 TargetLowering::AtomicExpansionKind
13082 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
13083   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
13084   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
13085   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
13086              ? AtomicExpansionKind::LLSC
13087              : AtomicExpansionKind::None;
13088 }
13089 
13090 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
13091     AtomicCmpXchgInst *AI) const {
13092   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
13093   // implement cmpxchg without spilling. If the address being exchanged is also
13094   // on the stack and close enough to the spill slot, this can lead to a
13095   // situation where the monitor always gets cleared and the atomic operation
13096   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
13097   bool hasAtomicCmpXchg =
13098       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
13099   return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
13100 }
13101 
13102 bool ARMTargetLowering::shouldInsertFencesForAtomic(
13103     const Instruction *I) const {
13104   return InsertFencesForAtomic;
13105 }
13106 
13107 // This has so far only been implemented for MachO.
13108 bool ARMTargetLowering::useLoadStackGuardNode() const {
13109   return Subtarget->isTargetMachO();
13110 }
13111 
13112 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
13113                                                   unsigned &Cost) const {
13114   // If we do not have NEON, vector types are not natively supported.
13115   if (!Subtarget->hasNEON())
13116     return false;
13117 
13118   // Floating point values and vector values map to the same register file.
13119   // Therefore, although we could do a store extract of a vector type, it is
13120   // better to leave such values as floats, since we have more freedom in the
13121   // addressing modes available for those.
13122   if (VectorTy->isFPOrFPVectorTy())
13123     return false;
13124 
13125   // If the index is unknown at compile time, this is very expensive to lower
13126   // and it is not possible to combine the store with the extract.
13127   if (!isa<ConstantInt>(Idx))
13128     return false;
13129 
13130   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
13131   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
13132   // We can do a store + vector extract on any vector that fits perfectly in a D
13133   // or Q register.
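  // For example, <8 x i8>, <4 x i16> or <2 x i32> fit a D register (64 bits),
  // and their double-length counterparts fit a Q register (128 bits).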
13134 if (BitWidth == 64 || BitWidth == 128) { 13135 Cost = 0; 13136 return true; 13137 } 13138 return false; 13139 } 13140 13141 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 13142 return Subtarget->hasV6T2Ops(); 13143 } 13144 13145 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 13146 return Subtarget->hasV6T2Ops(); 13147 } 13148 13149 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 13150 AtomicOrdering Ord) const { 13151 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 13152 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 13153 bool IsAcquire = isAcquireOrStronger(Ord); 13154 13155 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 13156 // intrinsic must return {i32, i32} and we have to recombine them into a 13157 // single i64 here. 13158 if (ValTy->getPrimitiveSizeInBits() == 64) { 13159 Intrinsic::ID Int = 13160 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 13161 Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); 13162 13163 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 13164 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 13165 13166 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 13167 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 13168 if (!Subtarget->isLittle()) 13169 std::swap (Lo, Hi); 13170 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 13171 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 13172 return Builder.CreateOr( 13173 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); 13174 } 13175 13176 Type *Tys[] = { Addr->getType() }; 13177 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 13178 Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); 13179 13180 return Builder.CreateTruncOrBitCast( 13181 Builder.CreateCall(Ldrex, Addr), 13182 cast<PointerType>(Addr->getType())->getElementType()); 13183 } 13184 13185 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 13186 IRBuilder<> &Builder) const { 13187 if (!Subtarget->hasV7Ops()) 13188 return; 13189 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 13190 Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 13191 } 13192 13193 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 13194 Value *Addr, 13195 AtomicOrdering Ord) const { 13196 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 13197 bool IsRelease = isReleaseOrStronger(Ord); 13198 13199 // Since the intrinsics must have legal type, the i64 intrinsics take two 13200 // parameters: "i32, i32". We must marshal Val into the appropriate form 13201 // before the call. 13202 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 13203 Intrinsic::ID Int = 13204 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 13205 Function *Strex = Intrinsic::getDeclaration(M, Int); 13206 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 13207 13208 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 13209 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 13210 if (!Subtarget->isLittle()) 13211 std::swap (Lo, Hi); 13212 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 13213 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 13214 } 13215 13216 Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; 13217 Type *Tys[] = { Addr->getType() }; 13218 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 13219 13220 return Builder.CreateCall( 13221 Strex, {Builder.CreateZExtOrBitCast( 13222 Val, Strex->getFunctionType()->getParamType(0)), 13223 Addr}); 13224 } 13225 13226 /// \brief Lower an interleaved load into a vldN intrinsic. 13227 /// 13228 /// E.g. Lower an interleaved load (Factor = 2): 13229 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 13230 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 13231 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 13232 /// 13233 /// Into: 13234 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 13235 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 13236 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 13237 bool ARMTargetLowering::lowerInterleavedLoad( 13238 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 13239 ArrayRef<unsigned> Indices, unsigned Factor) const { 13240 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 13241 "Invalid interleave factor"); 13242 assert(!Shuffles.empty() && "Empty shufflevector input"); 13243 assert(Shuffles.size() == Indices.size() && 13244 "Unmatched number of shufflevectors and indices"); 13245 13246 VectorType *VecTy = Shuffles[0]->getType(); 13247 Type *EltTy = VecTy->getVectorElementType(); 13248 13249 const DataLayout &DL = LI->getModule()->getDataLayout(); 13250 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 13251 bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; 13252 13253 // Skip if we do not have NEON and skip illegal vector types and vector types 13254 // with i64/f64 elements (vldN doesn't support i64/f64 elements). 13255 if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) 13256 return false; 13257 13258 // A pointer vector can not be the return type of the ldN intrinsics. Need to 13259 // load integer vectors first and then convert to pointer vectors. 13260 if (EltTy->isPointerTy()) 13261 VecTy = 13262 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 13263 13264 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 13265 Intrinsic::arm_neon_vld3, 13266 Intrinsic::arm_neon_vld4}; 13267 13268 IRBuilder<> Builder(LI); 13269 SmallVector<Value *, 2> Ops; 13270 13271 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 13272 Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); 13273 Ops.push_back(Builder.getInt32(LI->getAlignment())); 13274 13275 Type *Tys[] = { VecTy, Int8Ptr }; 13276 Function *VldnFunc = 13277 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 13278 CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); 13279 13280 // Replace uses of each shufflevector with the corresponding vector loaded 13281 // by ldN. 13282 for (unsigned i = 0; i < Shuffles.size(); i++) { 13283 ShuffleVectorInst *SV = Shuffles[i]; 13284 unsigned Index = Indices[i]; 13285 13286 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 13287 13288 // Convert the integer vector to pointer vector if the element is pointer. 13289 if (EltTy->isPointerTy()) 13290 SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); 13291 13292 SV->replaceAllUsesWith(SubVec); 13293 } 13294 13295 return true; 13296 } 13297 13298 /// \brief Get a mask consisting of sequential integers starting from \p Start. 13299 /// 13300 /// I.e. 
<Start, Start + 1, ..., Start + NumElts - 1> 13301 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, 13302 unsigned NumElts) { 13303 SmallVector<Constant *, 16> Mask; 13304 for (unsigned i = 0; i < NumElts; i++) 13305 Mask.push_back(Builder.getInt32(Start + i)); 13306 13307 return ConstantVector::get(Mask); 13308 } 13309 13310 /// \brief Lower an interleaved store into a vstN intrinsic. 13311 /// 13312 /// E.g. Lower an interleaved store (Factor = 3): 13313 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 13314 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 13315 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 13316 /// 13317 /// Into: 13318 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 13319 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 13320 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 13321 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 13322 /// 13323 /// Note that the new shufflevectors will be removed and we'll only generate one 13324 /// vst3 instruction in CodeGen. 13325 /// 13326 /// Example for a more general valid mask (Factor 3). Lower: 13327 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 13328 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 13329 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 13330 /// 13331 /// Into: 13332 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 13333 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 13334 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 13335 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 13336 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 13337 ShuffleVectorInst *SVI, 13338 unsigned Factor) const { 13339 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 13340 "Invalid interleave factor"); 13341 13342 VectorType *VecTy = SVI->getType(); 13343 assert(VecTy->getVectorNumElements() % Factor == 0 && 13344 "Invalid interleaved store"); 13345 13346 unsigned LaneLen = VecTy->getVectorNumElements() / Factor; 13347 Type *EltTy = VecTy->getVectorElementType(); 13348 VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); 13349 13350 const DataLayout &DL = SI->getModule()->getDataLayout(); 13351 unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); 13352 bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; 13353 13354 // Skip if we do not have NEON and skip illegal vector types and vector types 13355 // with i64/f64 elements (vstN doesn't support i64/f64 elements). 13356 if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || 13357 EltIs64Bits) 13358 return false; 13359 13360 Value *Op0 = SVI->getOperand(0); 13361 Value *Op1 = SVI->getOperand(1); 13362 IRBuilder<> Builder(SI); 13363 13364 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 13365 // vectors to integer vectors. 13366 if (EltTy->isPointerTy()) { 13367 Type *IntTy = DL.getIntPtrType(EltTy); 13368 13369 // Convert to the corresponding integer vector. 
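    // For example, on a 32-bit ARM target an <8 x i32*> operand becomes an
    // <8 x i32> vector here.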
13370 Type *IntVecTy = 13371 VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); 13372 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 13373 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 13374 13375 SubVecTy = VectorType::get(IntTy, LaneLen); 13376 } 13377 13378 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 13379 Intrinsic::arm_neon_vst3, 13380 Intrinsic::arm_neon_vst4}; 13381 SmallVector<Value *, 6> Ops; 13382 13383 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 13384 Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); 13385 13386 Type *Tys[] = { Int8Ptr, SubVecTy }; 13387 Function *VstNFunc = Intrinsic::getDeclaration( 13388 SI->getModule(), StoreInts[Factor - 2], Tys); 13389 13390 // Split the shufflevector operands into sub vectors for the new vstN call. 13391 auto Mask = SVI->getShuffleMask(); 13392 for (unsigned i = 0; i < Factor; i++) { 13393 if (Mask[i] >= 0) { 13394 Ops.push_back(Builder.CreateShuffleVector( 13395 Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); 13396 } else { 13397 unsigned StartMask = 0; 13398 for (unsigned j = 1; j < LaneLen; j++) { 13399 if (Mask[j*Factor + i] >= 0) { 13400 StartMask = Mask[j*Factor + i] - j; 13401 break; 13402 } 13403 } 13404 // Note: If all elements in a chunk are undefs, StartMask=0! 13405 // Note: Filling undef gaps with random elements is ok, since 13406 // those elements were being written anyway (with undefs). 13407 // In the case of all undefs we're defaulting to using elems from 0 13408 // Note: StartMask cannot be negative, it's checked in isReInterleaveMask 13409 Ops.push_back(Builder.CreateShuffleVector( 13410 Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); 13411 } 13412 } 13413 13414 Ops.push_back(Builder.getInt32(SI->getAlignment())); 13415 Builder.CreateCall(VstNFunc, Ops); 13416 return true; 13417 } 13418 13419 enum HABaseType { 13420 HA_UNKNOWN = 0, 13421 HA_FLOAT, 13422 HA_DOUBLE, 13423 HA_VECT64, 13424 HA_VECT128 13425 }; 13426 13427 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 13428 uint64_t &Members) { 13429 if (auto *ST = dyn_cast<StructType>(Ty)) { 13430 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 13431 uint64_t SubMembers = 0; 13432 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 13433 return false; 13434 Members += SubMembers; 13435 } 13436 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 13437 uint64_t SubMembers = 0; 13438 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 13439 return false; 13440 Members += SubMembers * AT->getNumElements(); 13441 } else if (Ty->isFloatTy()) { 13442 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 13443 return false; 13444 Members = 1; 13445 Base = HA_FLOAT; 13446 } else if (Ty->isDoubleTy()) { 13447 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 13448 return false; 13449 Members = 1; 13450 Base = HA_DOUBLE; 13451 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 13452 Members = 1; 13453 switch (Base) { 13454 case HA_FLOAT: 13455 case HA_DOUBLE: 13456 return false; 13457 case HA_VECT64: 13458 return VT->getBitWidth() == 64; 13459 case HA_VECT128: 13460 return VT->getBitWidth() == 128; 13461 case HA_UNKNOWN: 13462 switch (VT->getBitWidth()) { 13463 case 64: 13464 Base = HA_VECT64; 13465 return true; 13466 case 128: 13467 Base = HA_VECT128; 13468 return true; 13469 default: 13470 return false; 13471 } 13472 } 13473 } 13474 13475 return (Members > 0 && Members <= 4); 13476 } 13477 13478 /// \brief Return true if a type is an 
AAPCS-VFP homogeneous aggregate or one of 13479 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 13480 /// passing according to AAPCS rules. 13481 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 13482 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 13483 if (getEffectiveCallingConv(CallConv, isVarArg) != 13484 CallingConv::ARM_AAPCS_VFP) 13485 return false; 13486 13487 HABaseType Base = HA_UNKNOWN; 13488 uint64_t Members = 0; 13489 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 13490 DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 13491 13492 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 13493 return IsHA || IsIntArray; 13494 } 13495 13496 unsigned ARMTargetLowering::getExceptionPointerRegister( 13497 const Constant *PersonalityFn) const { 13498 // Platforms which do not use SjLj EH may return values in these registers 13499 // via the personality function. 13500 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; 13501 } 13502 13503 unsigned ARMTargetLowering::getExceptionSelectorRegister( 13504 const Constant *PersonalityFn) const { 13505 // Platforms which do not use SjLj EH may return values in these registers 13506 // via the personality function. 13507 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; 13508 } 13509 13510 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 13511 // Update IsSplitCSR in ARMFunctionInfo. 13512 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 13513 AFI->setIsSplitCSR(true); 13514 } 13515 13516 void ARMTargetLowering::insertCopiesSplitCSR( 13517 MachineBasicBlock *Entry, 13518 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 13519 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 13520 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 13521 if (!IStart) 13522 return; 13523 13524 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 13525 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 13526 MachineBasicBlock::iterator MBBI = Entry->begin(); 13527 for (const MCPhysReg *I = IStart; *I; ++I) { 13528 const TargetRegisterClass *RC = nullptr; 13529 if (ARM::GPRRegClass.contains(*I)) 13530 RC = &ARM::GPRRegClass; 13531 else if (ARM::DPRRegClass.contains(*I)) 13532 RC = &ARM::DPRRegClass; 13533 else 13534 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 13535 13536 unsigned NewVR = MRI->createVirtualRegister(RC); 13537 // Create copy from CSR to a virtual register. 13538 // FIXME: this currently does not emit CFI pseudo-instructions, it works 13539 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 13540 // nounwind. If we want to generalize this later, we may need to emit 13541 // CFI pseudo-instructions. 13542 assert(Entry->getParent()->getFunction()->hasFnAttribute( 13543 Attribute::NoUnwind) && 13544 "Function should be nounwind in insertCopiesSplitCSR!"); 13545 Entry->addLiveIn(*I); 13546 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 13547 .addReg(*I); 13548 13549 // Insert the copy-back instructions right before the terminator. 13550 for (auto *Exit : Exits) 13551 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 13552 TII->get(TargetOpcode::COPY), *I) 13553 .addReg(NewVR); 13554 } 13555 } 13556