1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the interfaces that ARM uses to lower LLVM code into a 11 // selection DAG. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #define DEBUG_TYPE "arm-isel" 16 #include "ARMISelLowering.h" 17 #include "ARM.h" 18 #include "ARMCallingConv.h" 19 #include "ARMConstantPoolValue.h" 20 #include "ARMMachineFunctionInfo.h" 21 #include "ARMPerfectShuffle.h" 22 #include "ARMSubtarget.h" 23 #include "ARMTargetMachine.h" 24 #include "ARMTargetObjectFile.h" 25 #include "MCTargetDesc/ARMAddressingModes.h" 26 #include "llvm/ADT/Statistic.h" 27 #include "llvm/ADT/StringExtras.h" 28 #include "llvm/CallingConv.h" 29 #include "llvm/CodeGen/CallingConvLower.h" 30 #include "llvm/CodeGen/IntrinsicLowering.h" 31 #include "llvm/CodeGen/MachineBasicBlock.h" 32 #include "llvm/CodeGen/MachineFrameInfo.h" 33 #include "llvm/CodeGen/MachineFunction.h" 34 #include "llvm/CodeGen/MachineInstrBuilder.h" 35 #include "llvm/CodeGen/MachineModuleInfo.h" 36 #include "llvm/CodeGen/MachineRegisterInfo.h" 37 #include "llvm/CodeGen/SelectionDAG.h" 38 #include "llvm/Constants.h" 39 #include "llvm/Function.h" 40 #include "llvm/GlobalValue.h" 41 #include "llvm/Instruction.h" 42 #include "llvm/Instructions.h" 43 #include "llvm/Intrinsics.h" 44 #include "llvm/MC/MCSectionMachO.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Support/raw_ostream.h" 49 #include "llvm/Target/TargetOptions.h" 50 #include "llvm/Type.h" 51 using namespace llvm; 52 53 STATISTIC(NumTailCalls, "Number of tail calls"); 54 
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 55 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 56 57 // This option should go away when tail calls fully work. 58 static cl::opt<bool> 59 EnableARMTailCalls("arm-tail-calls", cl::Hidden, 60 cl::desc("Generate tail calls (TEMPORARY OPTION)."), 61 cl::init(false)); 62 63 cl::opt<bool> 64 EnableARMLongCalls("arm-long-calls", cl::Hidden, 65 cl::desc("Generate calls via indirect call instructions"), 66 cl::init(false)); 67 68 static cl::opt<bool> 69 ARMInterworking("arm-interworking", cl::Hidden, 70 cl::desc("Enable / disable ARM interworking (for debugging only)"), 71 cl::init(true)); 72 73 namespace { 74 class ARMCCState : public CCState { 75 public: 76 ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, 77 const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs, 78 LLVMContext &C, ParmContext PC) 79 : CCState(CC, isVarArg, MF, TM, locs, C) { 80 assert(((PC == Call) || (PC == Prologue)) && 81 "ARMCCState users must specify whether their context is call" 82 "or prologue generation."); 83 CallOrPrologue = PC; 84 } 85 }; 86 } 87 88 // The APCS parameter registers. 
// The four integer argument registers defined by the APCS/AAPCS (r0-r3).
static const uint16_t GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

/// addTypeForNEON - Configure the operation actions shared by every NEON
/// vector type VT.  Loads/stores are promoted to PromotedLdStVT and bitwise
/// ops to PromotedBitwiseVT (when they differ from VT); shuffles, element
/// insert/extract and integer shifts get custom lowering; select and
/// divide/remainder are always expanded (NEON has no vector divide).
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  // No custom SETCC for 64-bit elements.
  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  // int<->fp conversions are only handled for i32 elements; everything
  // else is expanded.
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
}

/// addDRTypeForNEON - Register VT as a 64-bit NEON type living in the
/// D registers and give it the standard NEON operation actions.
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

/// addQRTypeForNEON - Register VT as a 128-bit NEON type living in the
/// Q registers and give it the standard NEON operation actions.
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::QPRRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

// Pick the object-file lowering: Mach-O on Darwin, ELF everywhere else.
static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
    return new TargetLoweringObjectFileMachO();

  return new ARMElfTargetObjectFile();
}

ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<ARMSubtarget>();
  RegInfo = TM.getRegisterInfo();
  Itins = TM.getInstrItineraryData();

  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (Subtarget->isTargetDarwin()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
      // Single-precision floating-point arithmetic.
      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");

      // Double-precision floating-point arithmetic.
      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");

      // Single-precision comparisons.
      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
      setLibcallName(RTLIB::UO_F32, "__unordsf2vfp");
      setLibcallName(RTLIB::O_F32, "__unordsf2vfp");

      // NOTE(review): the condition code registered here is how the
      // *vfp helper's result is tested against zero (SETNE = relation
      // holds when the helper returns nonzero).  O_* reuses the
      // "unordered" helper with SETEQ, i.e. ordered == not unordered.
      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ);

      // Double-precision comparisons.
      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
      setLibcallName(RTLIB::UO_F64, "__unorddf2vfp");
      setLibcallName(RTLIB::O_F64, "__unorddf2vfp");

      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ);

      // Floating-point to integer conversions.
      // i64 conversions are done via library routines even when generating VFP
      // instructions, so use the same ones.
      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");

      // Conversions between floating types.
      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
      setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp");

      // Integer to floating-point conversions.
      // i64 conversions are done via library routines even when generating VFP
      // instructions, so use the same ones.
      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
      // e.g., __floatunsidf vs. __floatunssidfvfp.
      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, 0);
  setLibcallName(RTLIB::SRL_I128, 0);
  setLibcallName(RTLIB::SRA_I128, 0);

  // AAPCS (non-Darwin): use the __aeabi_* runtime helpers from the ARM
  // RTABI, all of which take the ARM_AAPCS calling convention.
  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
    // Double-precision floating-point arithmetic helper functions
    // RTABI chapter 4.1.2, Table 2
    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);

    // Double-precision floating-point comparison helper functions
    // RTABI chapter 4.1.2, Table 3
    // NOTE(review): UNE reuses __aeabi_dcmpeq with the inverted test
    // (SETEQ, i.e. "not equal" == "cmpeq returned zero").
    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
    setLibcallName(RTLIB::UO_F64, "__aeabi_dcmpun");
    setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE);
    setLibcallName(RTLIB::O_F64, "__aeabi_dcmpun");
    setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ);
    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);

    // Single-precision floating-point arithmetic helper functions
    // RTABI chapter 4.1.2, Table 4
    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);

    // Single-precision floating-point comparison helper functions
    // RTABI chapter 4.1.2, Table 5
    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
    setLibcallName(RTLIB::UO_F32, "__aeabi_fcmpun");
    setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE);
    setLibcallName(RTLIB::O_F32, "__aeabi_fcmpun");
    setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ);
    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);

    // Floating-point to integer conversions.
    // RTABI chapter 4.1.2, Table 6
    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);

    // Conversions between floating types.
    // RTABI chapter 4.1.2, Table 7
    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
    setLibcallName(RTLIB::FPEXT_F32_F64, "__aeabi_f2d");
    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);

    // Integer to floating-point conversions.
    // RTABI chapter 4.1.2, Table 8
    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);

    // Long long helper functions
    // RTABI chapter 4.2, Table 9
    setLibcallName(RTLIB::MUL_I64, "__aeabi_lmul");
    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
    // NOTE(review): SDIV_I64/UDIV_I64 calling conventions are also set in
    // the integer-division section below; harmless duplication.
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);

    // Integer division functions
    // RTABI chapter 4.3.1
    setLibcallName(RTLIB::SDIV_I8, "__aeabi_idiv");
    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
    setLibcallName(RTLIB::UDIV_I8, "__aeabi_uidiv");
    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);

    // Memory operations
    // RTABI chapter 4.3.4
    setLibcallName(RTLIB::MEMCPY, "__aeabi_memcpy");
    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
    setLibcallName(RTLIB::MEMSET, "__aeabi_memset");
    setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
    setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // Thumb1 only has access to the low registers for i32.
  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    // Single-precision-only VFP cannot hold f64 in registers.
    if (!Subtarget->isFPOnlySP())
      addRegisterClass(MVT::f64, &ARM::DPRRegClass);

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  }

  // By default no vector type supports truncating stores or extending
  // loads; the NEON-legal cases are re-enabled further down.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);

  if (Subtarget->hasNEON()) {
    // Register the 64-bit (D-register) and 128-bit (Q-register) NEON types.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP support any arithmetic operations on it.
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In another words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);

    // v4f32: expand the transcendental and rounding operations (only the
    // basic arithmetic ops are native; see comment above).
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, and nor does
    // it have a FP_TO_[SU]INT instruction with a narrower destination than
    // source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);

    // DAG combines that are only worthwhile when NEON types are legal.
    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::SELECT_CC);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
                  MVT::v4i16, MVT::v2i16,
                  MVT::v2i32};
    for (unsigned i = 0; i < 6; ++i) {
      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
    }
  }

  // ARM and Thumb2 support UMLAL/SMLAL.
  if (!Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::ADDC);


  computeRegisterProperties();

  // ARM does not have f32 extending load.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);

  // ARM does not have i1 sign extending load.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    // Enable pre/post-inc and pre/post-dec addressing for integer types.
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);

  if (!Subtarget->isThumb1Only()) {
    // FIXME: We should do this for Thumb1 as well.
    setOperationAction(ISD::ADDC, MVT::i32, Custom);
    setOperationAction(ISD::ADDE, MVT::i32, Custom);
    setOperationAction(ISD::SUBC, MVT::i32, Custom);
    setOperationAction(ISD::SUBE, MVT::i32, Custom);
  }

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  // These just redirect to CTTZ and CTLZ on ARM.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, Expand);
    setOperationAction(ISD::UDIV, MVT::i32, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);

  // All address forms need custom lowering (constant pools, PIC, TLS...).
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (!Subtarget->isTargetDarwin()) {
    // Non-Darwin platforms may return values in these registers via the
    // personality function.
    setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
    setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
    setExceptionPointerRegister(ARM::R0);
    setExceptionSelectorRegister(ARM::R1);
  }

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  // FIXME: This should be checking for v6k, not just v6.
  if (Subtarget->hasDataBarrier() ||
      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
    // membarrier needs custom lowering; the rest are legal and handled
    // normally.
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    // Custom lowering for 64-bit ops
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
    // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
    setInsertFencesForAtomic(true);
  } else {
    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    // Since the libcalls include locking, fold in the fences
    setShouldFoldAtomicFences(true);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  if (Subtarget->isTargetDarwin()) {
    // Darwin uses SjLj exception handling.
    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  }

  // Comparisons, selects and conditional branches for scalar types.
  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  // Fused multiply-add requires VFP4.
  if (!Subtarget->hasVFP4()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!TM.Options.UseSoftFloat
&& !Subtarget->isThumb1Only()) { 801 // int <-> fp are custom expanded into bit_convert + ARMISD ops. 802 if (Subtarget->hasVFP2()) { 803 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 804 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 805 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 806 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 807 } 808 // Special handling for half-precision FP. 809 if (!Subtarget->hasFP16()) { 810 setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); 811 setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand); 812 } 813 } 814 815 // We have target-specific dag combine patterns for the following nodes: 816 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 817 setTargetDAGCombine(ISD::ADD); 818 setTargetDAGCombine(ISD::SUB); 819 setTargetDAGCombine(ISD::MUL); 820 setTargetDAGCombine(ISD::AND); 821 setTargetDAGCombine(ISD::OR); 822 setTargetDAGCombine(ISD::XOR); 823 824 if (Subtarget->hasV6Ops()) 825 setTargetDAGCombine(ISD::SRL); 826 827 setStackPointerRegisterToSaveRestore(ARM::SP); 828 829 if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() || 830 !Subtarget->hasVFP2()) 831 setSchedulingPreference(Sched::RegPressure); 832 else 833 setSchedulingPreference(Sched::Hybrid); 834 835 //// temporary - rewrite interface to use type 836 maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; 837 maxStoresPerMemset = 16; 838 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 839 840 // On ARM arguments smaller than 4 bytes are extended, so all arguments 841 // are at least 4 bytes aligned. 842 setMinStackArgumentAlignment(4); 843 844 benefitFromCodePlacementOpt = true; 845 846 // Prefer likely predicted branches to selects on out-of-order cores. 847 predictableSelectIsExpensive = Subtarget->isLikeA9(); 848 849 setMinFunctionAlignment(Subtarget->isThumb() ? 
1 : 2);
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
/// findRepresentativeClass - Return the register class (and a relative
/// register-pressure cost) used to represent VT when tracking pressure.
std::pair<const TargetRegisterClass*, uint8_t>
ARMTargetLowering::findRepresentativeClass(EVT VT) const{
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    // Scalar integer and other generic types use the target-independent logic.
    return TargetLowering::findRepresentativeClass(VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    // 128-bit (Q) types occupy two D registers.
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    // QQ-sized value: four D registers.
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    // QQQQ-sized value: eight D registers.
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

/// getTargetNodeName - Return the symbolic name for the given
/// target-specific (ARMISD) SelectionDAG opcode, or null if unknown.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return 0;
  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
  case ARMISD::CALL:          return "ARMISD::CALL";
  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
  case ARMISD::tCALL:         return "ARMISD::tCALL";
  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
  case ARMISD::CMP:           return "ARMISD::CMP";
  case ARMISD::CMN:           return "ARMISD::CMN";
  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";

  case ARMISD::CMOV:          return "ARMISD::CMOV";

  case ARMISD::RBIT:          return "ARMISD::RBIT";

  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
  case ARMISD::SITOF:         return "ARMISD::SITOF";
  case ARMISD::UITOF:         return "ARMISD::UITOF";

  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:           return "ARMISD::RRX";

  case ARMISD::ADDC:          return "ARMISD::ADDC";
  case ARMISD::ADDE:          return "ARMISD::ADDE";
  case ARMISD::SUBC:          return "ARMISD::SUBC";
  case ARMISD::SUBE:          return "ARMISD::SUBE";

  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";

  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";

  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
  case ARMISD::VCGE:          return "ARMISD::VCGE";
  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
  case ARMISD::VCGT:          return "ARMISD::VCGT";
  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
  case ARMISD::VTST:          return "ARMISD::VTST";

  case ARMISD::VSHL:          return "ARMISD::VSHL";
  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
  case ARMISD::VDUP:          return "ARMISD::VDUP";
  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
  case ARMISD::VEXT:          return "ARMISD::VEXT";
  case ARMISD::VREV64:        return "ARMISD::VREV64";
  case ARMISD::VREV32:        return "ARMISD::VREV32";
  case ARMISD::VREV16:        return "ARMISD::VREV16";
  case ARMISD::VZIP:          return "ARMISD::VZIP";
  case ARMISD::VUZP:          return "ARMISD::VUZP";
  case ARMISD::VTRN:          return "ARMISD::VTRN";
  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
  case ARMISD::FMAX:          return "ARMISD::FMAX";
  case ARMISD::FMIN:          return "ARMISD::FMIN";
  case ARMISD::BFI:           return "ARMISD::BFI";
  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
  case ARMISD::VBSL:          return "ARMISD::VBSL";
  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
  }
}

/// getSetCCResultType - Return the value type to use for ISD::SETCC;
/// pointer-sized integer for scalars, integer-element vector for vectors.
EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return getPointerTy();
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive D registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  // Delegate to the ARM-specific fast instruction selector factory.
  return ARM::createFastISel(funcInfo, libInfo);
}

/// getMaximalGlobalOffset - Returns the maximal possible offset which can
/// be used for loads / stores from the global.
unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
  // Thumb1 load/store immediates are much smaller than ARM/Thumb2 ones.
  return (Subtarget->isThumb1Only() ? 127 : 4095);
}

/// getSchedulingPreference - Pick a scheduling preference per node: prefer
/// ILP for FP/vector producers and high-latency defs, register pressure
/// otherwise.
Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  // Any FP or vector result favors ILP scheduling.
  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  // First-def latency above 2 cycles: schedule for ILP to hide the latency.
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}

/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  // CondCode2 == ARMCC::AL means only one predicate is needed.
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "ARMGenCallingConv.inc"

/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  switch (CC) {
  default:
    llvm_unreachable("Unsupported calling convention");
  case CallingConv::Fast:
    if (Subtarget->hasVFP2() && !isVarArg) {
      if (!Subtarget->isAAPCS_ABI())
        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
      // For AAPCS ABI targets, just use VFP variant of the calling convention.
      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
    }
    // Fallthrough
  case CallingConv::C: {
    // Use target triple & subtarget features to do actual dispatch.
    if (!Subtarget->isAAPCS_ABI())
      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
    else if (Subtarget->hasVFP2() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  }
  case CallingConv::ARM_AAPCS_VFP:
    // Variadic AAPCS-VFP calls fall back to plain AAPCS.
    if (!isVarArg)
      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
    // Fallthrough
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue
ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
  CCInfo.AnalyzeCallResult(Ins,
                           CCAssignFnForNode(CallConv, /* Return*/ true,
                                             isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64: the value is split over two i32
      // locations, so consume two (or four, for v2f64) RVLocs and rebuild it
      // with VMOVDRR.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // Insert the first f64 into lane 0, then rebuild and insert the
        // second f64 (two more i32 locations) into lane 1.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, MVT::i32));
      }
    } else {
      // Simple case: the whole value lives in one physical register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // Value was passed bit-cast to the location type; cast it back.
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// LowerMemOpCallTo - Store the argument to the stack.
1271 SDValue 1272 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, 1273 SDValue StackPtr, SDValue Arg, 1274 DebugLoc dl, SelectionDAG &DAG, 1275 const CCValAssign &VA, 1276 ISD::ArgFlagsTy Flags) const { 1277 unsigned LocMemOffset = VA.getLocMemOffset(); 1278 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1279 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1280 return DAG.getStore(Chain, dl, Arg, PtrOff, 1281 MachinePointerInfo::getStack(LocMemOffset), 1282 false, false, 0); 1283 } 1284 1285 void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, 1286 SDValue Chain, SDValue &Arg, 1287 RegsToPassVector &RegsToPass, 1288 CCValAssign &VA, CCValAssign &NextVA, 1289 SDValue &StackPtr, 1290 SmallVector<SDValue, 8> &MemOpChains, 1291 ISD::ArgFlagsTy Flags) const { 1292 1293 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1294 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1295 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); 1296 1297 if (NextVA.isRegLoc()) 1298 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1))); 1299 else { 1300 assert(NextVA.isMemLoc()); 1301 if (StackPtr.getNode() == 0) 1302 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); 1303 1304 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1), 1305 dl, DAG, NextVA, 1306 Flags)); 1307 } 1308 } 1309 1310 /// LowerCall - Lowering a call into a callseq_start <- 1311 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1312 /// nodes. 
1313 SDValue 1314 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1315 SmallVectorImpl<SDValue> &InVals) const { 1316 SelectionDAG &DAG = CLI.DAG; 1317 DebugLoc &dl = CLI.DL; 1318 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 1319 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 1320 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 1321 SDValue Chain = CLI.Chain; 1322 SDValue Callee = CLI.Callee; 1323 bool &isTailCall = CLI.IsTailCall; 1324 CallingConv::ID CallConv = CLI.CallConv; 1325 bool doesNotRet = CLI.DoesNotReturn; 1326 bool isVarArg = CLI.IsVarArg; 1327 1328 MachineFunction &MF = DAG.getMachineFunction(); 1329 bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1330 bool IsSibCall = false; 1331 // Disable tail calls if they're not supported. 1332 if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) 1333 isTailCall = false; 1334 if (isTailCall) { 1335 // Check if it's really possible to do a tail call. 1336 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1337 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1338 Outs, OutVals, Ins, DAG); 1339 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1340 // detected sibcalls. 1341 if (isTailCall) { 1342 ++NumTailCalls; 1343 IsSibCall = true; 1344 } 1345 } 1346 1347 // Analyze operands of the call, assigning locations to each operand. 1348 SmallVector<CCValAssign, 16> ArgLocs; 1349 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1350 getTargetMachine(), ArgLocs, *DAG.getContext(), Call); 1351 CCInfo.AnalyzeCallOperands(Outs, 1352 CCAssignFnForNode(CallConv, /* Return*/ false, 1353 isVarArg)); 1354 1355 // Get a count of how many bytes are to be pushed on the stack. 1356 unsigned NumBytes = CCInfo.getNextStackOffset(); 1357 1358 // For tail calls, memory operands are available in our caller's stack. 1359 if (IsSibCall) 1360 NumBytes = 0; 1361 1362 // Adjust the stack pointer for the new arguments... 
1363 // These operations are automatically eliminated by the prolog/epilog pass 1364 if (!IsSibCall) 1365 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1366 1367 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); 1368 1369 RegsToPassVector RegsToPass; 1370 SmallVector<SDValue, 8> MemOpChains; 1371 1372 // Walk the register/memloc assignments, inserting copies/loads. In the case 1373 // of tail call optimization, arguments are handled later. 1374 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1375 i != e; 1376 ++i, ++realArgIdx) { 1377 CCValAssign &VA = ArgLocs[i]; 1378 SDValue Arg = OutVals[realArgIdx]; 1379 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1380 bool isByVal = Flags.isByVal(); 1381 1382 // Promote the value if needed. 1383 switch (VA.getLocInfo()) { 1384 default: llvm_unreachable("Unknown loc info!"); 1385 case CCValAssign::Full: break; 1386 case CCValAssign::SExt: 1387 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1388 break; 1389 case CCValAssign::ZExt: 1390 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1391 break; 1392 case CCValAssign::AExt: 1393 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1394 break; 1395 case CCValAssign::BCvt: 1396 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1397 break; 1398 } 1399 1400 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 1401 if (VA.needsCustom()) { 1402 if (VA.getLocVT() == MVT::v2f64) { 1403 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1404 DAG.getConstant(0, MVT::i32)); 1405 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1406 DAG.getConstant(1, MVT::i32)); 1407 1408 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 1409 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1410 1411 VA = ArgLocs[++i]; // skip ahead to next loc 1412 if (VA.isRegLoc()) { 1413 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 1414 VA, 
ArgLocs[++i], StackPtr, MemOpChains, Flags); 1415 } else { 1416 assert(VA.isMemLoc()); 1417 1418 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 1419 dl, DAG, VA, Flags)); 1420 } 1421 } else { 1422 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 1423 StackPtr, MemOpChains, Flags); 1424 } 1425 } else if (VA.isRegLoc()) { 1426 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1427 } else if (isByVal) { 1428 assert(VA.isMemLoc()); 1429 unsigned offset = 0; 1430 1431 // True if this byval aggregate will be split between registers 1432 // and memory. 1433 if (CCInfo.isFirstByValRegValid()) { 1434 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); 1435 unsigned int i, j; 1436 for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { 1437 SDValue Const = DAG.getConstant(4*i, MVT::i32); 1438 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 1439 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 1440 MachinePointerInfo(), 1441 false, false, false, 0); 1442 MemOpChains.push_back(Load.getValue(1)); 1443 RegsToPass.push_back(std::make_pair(j, Load)); 1444 } 1445 offset = ARM::R4 - CCInfo.getFirstByValReg(); 1446 CCInfo.clearFirstByValReg(); 1447 } 1448 1449 if (Flags.getByValSize() - 4*offset > 0) { 1450 unsigned LocMemOffset = VA.getLocMemOffset(); 1451 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); 1452 SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, 1453 StkPtrOff); 1454 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); 1455 SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); 1456 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, 1457 MVT::i32); 1458 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32); 1459 1460 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 1461 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 1462 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 1463 Ops, 
array_lengthof(Ops))); 1464 } 1465 } else if (!IsSibCall) { 1466 assert(VA.isMemLoc()); 1467 1468 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1469 dl, DAG, VA, Flags)); 1470 } 1471 } 1472 1473 if (!MemOpChains.empty()) 1474 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1475 &MemOpChains[0], MemOpChains.size()); 1476 1477 // Build a sequence of copy-to-reg nodes chained together with token chain 1478 // and flag operands which copy the outgoing args into the appropriate regs. 1479 SDValue InFlag; 1480 // Tail call byval lowering might overwrite argument registers so in case of 1481 // tail call optimization the copies to registers are lowered later. 1482 if (!isTailCall) 1483 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1484 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1485 RegsToPass[i].second, InFlag); 1486 InFlag = Chain.getValue(1); 1487 } 1488 1489 // For tail calls lower the arguments to the 'real' stack slot. 1490 if (isTailCall) { 1491 // Force all the incoming stack arguments to be loaded from the stack 1492 // before any new outgoing arguments are stored to the stack, because the 1493 // outgoing stack slots may alias the incoming argument stack slots, and 1494 // the alias isn't otherwise explicit. This is slightly more conservative 1495 // than necessary, because it means that each store effectively depends 1496 // on every argument instead of just those arguments it would clobber. 1497 1498 // Do not flag preceding copytoreg stuff together with the following stuff. 
1499 InFlag = SDValue(); 1500 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1501 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1502 RegsToPass[i].second, InFlag); 1503 InFlag = Chain.getValue(1); 1504 } 1505 InFlag =SDValue(); 1506 } 1507 1508 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 1509 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 1510 // node so that legalize doesn't hack it. 1511 bool isDirect = false; 1512 bool isARMFunc = false; 1513 bool isLocalARMFunc = false; 1514 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 1515 1516 if (EnableARMLongCalls) { 1517 assert (getTargetMachine().getRelocationModel() == Reloc::Static 1518 && "long-calls with non-static relocation model!"); 1519 // Handle a global address or an external symbol. If it's not one of 1520 // those, the target's already in a register, so we don't need to do 1521 // anything extra. 1522 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1523 const GlobalValue *GV = G->getGlobal(); 1524 // Create a constant pool entry for the callee address 1525 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1526 ARMConstantPoolValue *CPV = 1527 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 1528 1529 // Get the address of the callee into a register 1530 SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); 1531 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1532 Callee = DAG.getLoad(getPointerTy(), dl, 1533 DAG.getEntryNode(), CPAddr, 1534 MachinePointerInfo::getConstantPool(), 1535 false, false, false, 0); 1536 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 1537 const char *Sym = S->getSymbol(); 1538 1539 // Create a constant pool entry for the callee address 1540 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1541 ARMConstantPoolValue *CPV = 1542 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1543 
ARMPCLabelIndex, 0); 1544 // Get the address of the callee into a register 1545 SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); 1546 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1547 Callee = DAG.getLoad(getPointerTy(), dl, 1548 DAG.getEntryNode(), CPAddr, 1549 MachinePointerInfo::getConstantPool(), 1550 false, false, false, 0); 1551 } 1552 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1553 const GlobalValue *GV = G->getGlobal(); 1554 isDirect = true; 1555 bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); 1556 bool isStub = (isExt && Subtarget->isTargetDarwin()) && 1557 getTargetMachine().getRelocationModel() != Reloc::Static; 1558 isARMFunc = !Subtarget->isThumb() || isStub; 1559 // ARM call to a local ARM function is predicable. 1560 isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); 1561 // tBX takes a register source operand. 1562 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1563 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1564 ARMConstantPoolValue *CPV = 1565 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4); 1566 SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); 1567 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1568 Callee = DAG.getLoad(getPointerTy(), dl, 1569 DAG.getEntryNode(), CPAddr, 1570 MachinePointerInfo::getConstantPool(), 1571 false, false, false, 0); 1572 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); 1573 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, 1574 getPointerTy(), Callee, PICLabel); 1575 } else { 1576 // On ELF targets for PIC code, direct calls should go through the PLT 1577 unsigned OpFlags = 0; 1578 if (Subtarget->isTargetELF() && 1579 getTargetMachine().getRelocationModel() == Reloc::PIC_) 1580 OpFlags = ARMII::MO_PLT; 1581 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 1582 } 1583 } else if (ExternalSymbolSDNode 
*S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1584 isDirect = true; 1585 bool isStub = Subtarget->isTargetDarwin() && 1586 getTargetMachine().getRelocationModel() != Reloc::Static; 1587 isARMFunc = !Subtarget->isThumb() || isStub; 1588 // tBX takes a register source operand. 1589 const char *Sym = S->getSymbol(); 1590 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 1591 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 1592 ARMConstantPoolValue *CPV = 1593 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 1594 ARMPCLabelIndex, 4); 1595 SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); 1596 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 1597 Callee = DAG.getLoad(getPointerTy(), dl, 1598 DAG.getEntryNode(), CPAddr, 1599 MachinePointerInfo::getConstantPool(), 1600 false, false, false, 0); 1601 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); 1602 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, 1603 getPointerTy(), Callee, PICLabel); 1604 } else { 1605 unsigned OpFlags = 0; 1606 // On ELF targets for PIC code, direct calls should go through the PLT 1607 if (Subtarget->isTargetELF() && 1608 getTargetMachine().getRelocationModel() == Reloc::PIC_) 1609 OpFlags = ARMII::MO_PLT; 1610 Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); 1611 } 1612 } 1613 1614 // FIXME: handle tail calls differently. 1615 unsigned CallOpc; 1616 bool HasMinSizeAttr = MF.getFunction()->getFnAttributes(). 1617 hasAttribute(Attributes::MinSize); 1618 if (Subtarget->isThumb()) { 1619 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 1620 CallOpc = ARMISD::CALL_NOLINK; 1621 else 1622 CallOpc = isARMFunc ? 
ARMISD::CALL : ARMISD::tCALL; 1623 } else { 1624 if (!isDirect && !Subtarget->hasV5TOps()) 1625 CallOpc = ARMISD::CALL_NOLINK; 1626 else if (doesNotRet && isDirect && Subtarget->hasRAS() && 1627 // Emit regular call when code size is the priority 1628 !HasMinSizeAttr) 1629 // "mov lr, pc; b _foo" to avoid confusing the RSP 1630 CallOpc = ARMISD::CALL_NOLINK; 1631 else 1632 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 1633 } 1634 1635 std::vector<SDValue> Ops; 1636 Ops.push_back(Chain); 1637 Ops.push_back(Callee); 1638 1639 // Add argument registers to the end of the list so that they are known live 1640 // into the call. 1641 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1642 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1643 RegsToPass[i].second.getValueType())); 1644 1645 // Add a register mask operand representing the call-preserved registers. 1646 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1647 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 1648 assert(Mask && "Missing call preserved mask for calling convention"); 1649 Ops.push_back(DAG.getRegisterMask(Mask)); 1650 1651 if (InFlag.getNode()) 1652 Ops.push_back(InFlag); 1653 1654 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 1655 if (isTailCall) 1656 return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 1657 1658 // Returns a chain and a flag for retval copy to use. 1659 Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); 1660 InFlag = Chain.getValue(1); 1661 1662 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1663 DAG.getIntPtrConstant(0, true), InFlag); 1664 if (!Ins.empty()) 1665 InFlag = Chain.getValue(1); 1666 1667 // Handle result values, copying them out of physregs into vregs that we 1668 // return. 
  // Tail of LowerCall: copy the call's results out of their physical
  // registers into the vregs the caller expects.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
                         dl, DAG, InVals);
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack.  Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to insure
/// this.
///
/// \param State calling-convention state; records the first byval register
///        and absorbs the remaining GPR argument registers.
/// \param size  [in/out] byval size in bytes; at a call site it is reduced
///        by the portion that travels in registers.
/// \param Align byval alignment in bytes (drives AAPCS register skipping).
void
ARMTargetLowering::HandleByVal(
    CCState *State, unsigned &size, unsigned Align) const {
  // Grab the next free GPR argument register, if any.
  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
  assert((State->getCallOrPrologue() == Prologue ||
          State->getCallOrPrologue() == Call) &&
         "unhandled ParmContext");
  if ((!State->isFirstByValRegValid()) &&
      (ARM::R0 <= reg) && (reg <= ARM::R3)) {
    if (Subtarget->isAAPCS_ABI() && Align > 4) {
      // Over-aligned byval under AAPCS: skip ("waste") registers until the
      // start register satisfies the alignment, measured in 4-byte regs.
      unsigned AlignInRegs = Align / 4;
      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
      for (unsigned i = 0; i < Waste; ++i)
        reg = State->AllocateReg(GPRArgRegs, 4);
    }
    if (reg != 0) {
      State->setFirstByValReg(reg);
      // At a call site, a byval parameter that is split between
      // registers and memory needs its size truncated here.  In a
      // function prologue, such byval parameters are reassembled in
      // memory, and are not truncated.
      if (State->getCallOrPrologue() == Call) {
        // Bytes carried in reg..r3 no longer need stack space.
        unsigned excess = 4 * (ARM::R4 - reg);
        assert(size >= excess && "expected larger existing stack allocation");
        size -= excess;
      }
    }
  }
  // Confiscate any remaining parameter registers to preclude their
  // assignment to subsequent parameters.
  while (State->AllocateReg(GPRArgRegs, 4))
    ;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  // FI stays INT_MAX unless one of the recognized patterns below sets it;
  // the assert afterwards guarantees we never fall through unset.
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // Pattern 1: the outgoing value was copied from a vreg that was filled
    // by a load from a stack slot.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    // Pattern 2: the outgoing value is a direct load from a frame index.
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != INT_MAX);
  // Only fixed objects (incoming arguments) can match; then the slot must
  // agree in both offset and size.
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
  // support in the assembler and linker to be used. This would need to be
  // fixed to fully support tail calls in Thumb1.
  //
  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
  // LR. This means if we need to reload LR, it takes an extra instructions,
  // which outweighs the value of the tail call; but here we don't know yet
  // whether LR is going to be used. Probably the right approach is to
  // generate the tail call here and turn it back into CALL/RET in
  // emitEpilogue if LR is used.

  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
  // but we need to make sure there are enough registers; the only valid
  // registers are the 4 used for parameters.  We don't currently do this
  // case.
  if (Subtarget->isThumb1Only())
    return false;

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Run the call-result assignment under both conventions and require every
    // location to agree: reg vs. mem, LocInfo, and the exact reg/offset.
    SmallVector<CCValAssign, 16> RVLocs1;
    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));

    SmallVector<CCValAssign, 16> RVLocs2;
    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
                                      getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getVarArgsRegSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
    CCInfo.AnalyzeCallOperands(Outs,
                               CCAssignFnForNode(CalleeCC, false, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      MachineFunction &MF = DAG.getMachineFunction();

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          // Consume the extra locations the split produced (2 for f64,
          // 4 for v2f64); each must also be a register.
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }
  }

  return true;
}

bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  // True iff the return calling convention can assign a location to every
  // outgoing return value.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
                                                    isVarArg));
}

SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
                                               isVarArg));

  // If this is the first return lowered for this function, add
  // the regs to the liveout set for the function.
1928 if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { 1929 for (unsigned i = 0; i != RVLocs.size(); ++i) 1930 if (RVLocs[i].isRegLoc()) 1931 DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 1932 } 1933 1934 SDValue Flag; 1935 1936 // Copy the result values into the output registers. 1937 for (unsigned i = 0, realRVLocIdx = 0; 1938 i != RVLocs.size(); 1939 ++i, ++realRVLocIdx) { 1940 CCValAssign &VA = RVLocs[i]; 1941 assert(VA.isRegLoc() && "Can only return in registers!"); 1942 1943 SDValue Arg = OutVals[realRVLocIdx]; 1944 1945 switch (VA.getLocInfo()) { 1946 default: llvm_unreachable("Unknown loc info!"); 1947 case CCValAssign::Full: break; 1948 case CCValAssign::BCvt: 1949 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1950 break; 1951 } 1952 1953 if (VA.needsCustom()) { 1954 if (VA.getLocVT() == MVT::v2f64) { 1955 // Extract the first half and return it in two registers. 1956 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1957 DAG.getConstant(0, MVT::i32)); 1958 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 1959 DAG.getVTList(MVT::i32, MVT::i32), Half); 1960 1961 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag); 1962 Flag = Chain.getValue(1); 1963 VA = RVLocs[++i]; // skip ahead to next loc 1964 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 1965 HalfGPRs.getValue(1), Flag); 1966 Flag = Chain.getValue(1); 1967 VA = RVLocs[++i]; // skip ahead to next loc 1968 1969 // Extract the 2nd half and fall through to handle it as an f64 value. 1970 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1971 DAG.getConstant(1, MVT::i32)); 1972 } 1973 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 1974 // available. 
1975 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1976 DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); 1977 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag); 1978 Flag = Chain.getValue(1); 1979 VA = RVLocs[++i]; // skip ahead to next loc 1980 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1), 1981 Flag); 1982 } else 1983 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 1984 1985 // Guarantee that all emitted copies are 1986 // stuck together, avoiding something bad. 1987 Flag = Chain.getValue(1); 1988 } 1989 1990 SDValue result; 1991 if (Flag.getNode()) 1992 result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag); 1993 else // Return Void 1994 result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain); 1995 1996 return result; 1997 } 1998 1999 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2000 if (N->getNumValues() != 1) 2001 return false; 2002 if (!N->hasNUsesOfValue(1, 0)) 2003 return false; 2004 2005 SDValue TCChain = Chain; 2006 SDNode *Copy = *N->use_begin(); 2007 if (Copy->getOpcode() == ISD::CopyToReg) { 2008 // If the copy has a glue operand, we conservatively assume it isn't safe to 2009 // perform a tail call. 2010 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2011 return false; 2012 TCChain = Copy->getOperand(0); 2013 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2014 SDNode *VMov = Copy; 2015 // f64 returned in a pair of GPRs. 
2016 SmallPtrSet<SDNode*, 2> Copies; 2017 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2018 UI != UE; ++UI) { 2019 if (UI->getOpcode() != ISD::CopyToReg) 2020 return false; 2021 Copies.insert(*UI); 2022 } 2023 if (Copies.size() > 2) 2024 return false; 2025 2026 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2027 UI != UE; ++UI) { 2028 SDValue UseChain = UI->getOperand(0); 2029 if (Copies.count(UseChain.getNode())) 2030 // Second CopyToReg 2031 Copy = *UI; 2032 else 2033 // First CopyToReg 2034 TCChain = UseChain; 2035 } 2036 } else if (Copy->getOpcode() == ISD::BITCAST) { 2037 // f32 returned in a single GPR. 2038 if (!Copy->hasOneUse()) 2039 return false; 2040 Copy = *Copy->use_begin(); 2041 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2042 return false; 2043 Chain = Copy->getOperand(0); 2044 } else { 2045 return false; 2046 } 2047 2048 bool HasRet = false; 2049 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2050 UI != UE; ++UI) { 2051 if (UI->getOpcode() != ARMISD::RET_FLAG) 2052 return false; 2053 HasRet = true; 2054 } 2055 2056 if (!HasRet) 2057 return false; 2058 2059 Chain = TCChain; 2060 return true; 2061 } 2062 2063 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 2064 if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) 2065 return false; 2066 2067 if (!CI->isTailCall()) 2068 return false; 2069 2070 return !Subtarget->isThumb1Only(); 2071 } 2072 2073 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2074 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2075 // one of the above mentioned nodes. It has to be wrapped because otherwise 2076 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2077 // be used to form addressing mode. These wrapped nodes will be selected 2078 // into MOVi. 
// Lower a ConstantPool node to a target constant pool wrapped in
// ARMISD::Wrapper so the raw target node is never selected directly.
static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  DebugLoc dl = Op.getDebugLoc();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;
  // Machine CP entries and plain IR constants use different accessors but
  // the same wrapping scheme.
  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

unsigned ARMTargetLowering::getJumpTableEncoding() const {
  // Jump tables are emitted inline in the function body.
  return MachineJumpTableInfo::EK_Inline;
}

SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  DebugLoc DL = Op.getDebugLoc();
  EVT PtrVT = getPointerTy();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  SDValue CPAddr;
  if (RelocM == Reloc::Static) {
    // Static: the address can live directly in the constant pool.
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  } else {
    // PC-relative entry; the adjustment matches the PC read-ahead
    // (4 bytes in Thumb mode, 8 in ARM mode).
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                      ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 0);
  if (RelocM == Reloc::Static)
    return Result;
  // PIC: add the PC label value to form the final address.
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  DebugLoc dl = GA->getDebugLoc();
  EVT PtrVT = getPointerTy();
  // PC read-ahead adjustment: 4 in Thumb mode, 8 in ARM mode.
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  // TLSGD constant-pool entry for the variable's GOT slot.
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
                         MachinePointerInfo::getConstantPool(),
                         false, false, false, 0);
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  // Build the single i32 argument for __tls_get_addr and emit the libcall;
  // its return value is the address of the TLS variable.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);
  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(Chain,
                (Type *) Type::getInt32Ty(*DAG.getContext()),
                false, false, false, false,
                0, CallingConv::C, /*isTailCall=*/false,
                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  DebugLoc dl = GA->getDebugLoc();
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy();
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    // GOTTPOFF entry: resolves (via the GOT) to the variable's offset from
    // the thread pointer.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    // First load: fetch the constant-pool entry itself.
    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, false, 0);
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: dereference the PC-adjusted address to get the actual
    // thread-pointer offset.
    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, false, 0);
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    // TPOFF entry: the offset is known at (static) link time; one load
    // suffices.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());

  switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic:
      // Local dynamic is not implemented; fall back to general dynamic.
      return LowerToTLSGeneralDynamicModel(GA, DAG);
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}

SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  if (RelocM == Reloc::PIC_) {
    // Locally-binding symbols can use a GOT-relative offset (GOTOFF);
    // others go through a GOT entry.
    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV,
                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                                 CPAddr,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, false, 0);
    SDValue Chain = Result.getValue(1);
    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
    // A GOT entry holds the symbol's address; one extra load resolves it.
    if (!UseGOTOFF)
      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
                           MachinePointerInfo::getGOT(),
                           false, false, false, 0);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    // No movw/movt: load the address out of the constant pool.
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                       MachinePointerInfo::getConstantPool(),
                       false, false, false, 0);
  }
}

SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // FIXME: Enable this for static codegen when tool issues are fixed.  Also
  // update ARMFastISel::ARMMaterializeGV.
2297 if (Subtarget->useMovt() && RelocM != Reloc::Static) { 2298 ++NumMovwMovt; 2299 // FIXME: Once remat is capable of dealing with instructions with register 2300 // operands, expand this into two nodes. 2301 if (RelocM == Reloc::Static) 2302 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 2303 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 2304 2305 unsigned Wrapper = (RelocM == Reloc::PIC_) 2306 ? ARMISD::WrapperPIC : ARMISD::WrapperDYN; 2307 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, 2308 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 2309 if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) 2310 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 2311 MachinePointerInfo::getGOT(), 2312 false, false, false, 0); 2313 return Result; 2314 } 2315 2316 unsigned ARMPCLabelIndex = 0; 2317 SDValue CPAddr; 2318 if (RelocM == Reloc::Static) { 2319 CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 2320 } else { 2321 ARMPCLabelIndex = AFI->createPICLabelUId(); 2322 unsigned PCAdj = (RelocM != Reloc::PIC_) ? 
                                            0 : (Subtarget->isThumb()?4:8);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
                                      PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);

  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 0);
  SDValue Chain = Result.getValue(1);

  // PIC: add the pc-label value to form the final address.
  if (RelocM == Reloc::PIC_) {
    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  }

  // Indirect symbols need a final load through the non-lazy pointer.
  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
                         false, false, false, 0);

  return Result;
}

/// LowerGLOBAL_OFFSET_TABLE - Materialize the address of _GLOBAL_OFFSET_TABLE_
/// via a pc-relative constant-pool entry (ELF only).
SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() &&
         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  // pc-relative reads see pc+4 in Thumb mode and pc+8 in ARM mode.
  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
                                  ARMPCLabelIndex, PCAdj);
  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 0);
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}

/// LowerEH_SJLJ_SETJMP - Lower the eh.sjlj.setjmp intrinsic to the
/// target-specific EH_SJLJ_SETJMP node (returns i32 plus a chain).
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  SDValue Val = DAG.getConstant(0, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                     Op.getOperand(1), Val);
}

/// LowerEH_SJLJ_LONGJMP - Lower the eh.sjlj.longjmp intrinsic to the
/// target-specific EH_SJLJ_LONGJMP node.
SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
}

/// LowerINTRINSIC_WO_CHAIN - Custom-lower the few chainless intrinsics ARM
/// handles specially; everything else falls through to generic lowering.
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::arm_thread_pointer: {
    // Read the thread pointer via the dedicated target node.
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::eh_sjlj_lsda: {
    // Materialize the address of this function's LSDA via a constant-pool
    // entry, pc-adjusted under PIC.
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy();
    DebugLoc dl = Op.getDebugLoc();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
    SDValue CPAddr;
    unsigned PCAdj = (RelocM != Reloc::PIC_)
      ? 0 : (Subtarget->isThumb() ? 4 : 8);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                  MachinePointerInfo::getConstantPool(),
                  false, false, false, 0);

    if (RelocM == Reloc::PIC_) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    // Map the NEON long-multiply intrinsics onto the VMULL target nodes.
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  }
}

/// LowerMEMBARRIER - Lower ISD::MEMBARRIER into either a DMB with the
/// appropriate option, or (pre-v7) the CP15 mcr barrier form.
static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *Subtarget) {
  DebugLoc dl = Op.getDebugLoc();
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
2439 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 2440 "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); 2441 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 2442 DAG.getConstant(0, MVT::i32)); 2443 } 2444 2445 SDValue Op5 = Op.getOperand(5); 2446 bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0; 2447 unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2448 unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 2449 bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0); 2450 2451 ARM_MB::MemBOpt DMBOpt; 2452 if (isDeviceBarrier) 2453 DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY; 2454 else 2455 DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH; 2456 return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), 2457 DAG.getConstant(DMBOpt, MVT::i32)); 2458 } 2459 2460 2461 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 2462 const ARMSubtarget *Subtarget) { 2463 // FIXME: handle "fence singlethread" more efficiently. 2464 DebugLoc dl = Op.getDebugLoc(); 2465 if (!Subtarget->hasDataBarrier()) { 2466 // Some ARMv6 cpus can support data barriers with an mcr instruction. 2467 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 2468 // here. 2469 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 2470 "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); 2471 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 2472 DAG.getConstant(0, MVT::i32)); 2473 } 2474 2475 return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), 2476 DAG.getConstant(ARM_MB::ISH, MVT::i32)); 2477 } 2478 2479 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 2480 const ARMSubtarget *Subtarget) { 2481 // ARM pre v5TE and Thumb1 does not have preload instructions. 
2482 if (!(Subtarget->isThumb2() || 2483 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 2484 // Just preserve the chain. 2485 return Op.getOperand(0); 2486 2487 DebugLoc dl = Op.getDebugLoc(); 2488 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 2489 if (!isRead && 2490 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 2491 // ARMv7 with MP extension has PLDW. 2492 return Op.getOperand(0); 2493 2494 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 2495 if (Subtarget->isThumb()) { 2496 // Invert the bits. 2497 isRead = ~isRead & 1; 2498 isData = ~isData & 1; 2499 } 2500 2501 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 2502 Op.getOperand(1), DAG.getConstant(isRead, MVT::i32), 2503 DAG.getConstant(isData, MVT::i32)); 2504 } 2505 2506 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 2507 MachineFunction &MF = DAG.getMachineFunction(); 2508 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 2509 2510 // vastart just stores the address of the VarArgsFrameIndex slot into the 2511 // memory location argument. 
  DebugLoc dl = Op.getDebugLoc();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV), false, false, 0);
}

/// GetF64FormalArgument - Reassemble an f64 formal argument whose two halves
/// arrive in a GPR plus either a second GPR or a stack slot, combining them
/// with VMOVDRR.
SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                        SDValue &Root, SelectionDAG &DAG,
                                        DebugLoc dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    // Second half was passed on the stack.
    MachineFrameInfo *MFI = MF.getFrameInfo();
    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
                            MachinePointerInfo::getFixedStack(FI),
                            false, false, false, 0);
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }

  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

/// computeRegArea - Compute the size of the remaining argument-register save
/// area (raw size and stack-alignment-rounded size).
void
ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
                                  unsigned &VARegSize, unsigned &VARegSaveSize)
  const {
  unsigned NumGPRs;
  if (CCInfo.isFirstByValRegValid())
    NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
  else {
    unsigned int firstUnalloced;
    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
                                                sizeof(GPRArgRegs) /
                                                sizeof(GPRArgRegs[0]));
    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
  }

  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
  VARegSize = NumGPRs * 4;
  // Round the register-save area up to the stack alignment.
  VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
}

// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval). Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
2582 void 2583 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 2584 DebugLoc dl, SDValue &Chain, 2585 const Value *OrigArg, 2586 unsigned OffsetFromOrigArg, 2587 unsigned ArgOffset, 2588 bool ForceMutable) const { 2589 MachineFunction &MF = DAG.getMachineFunction(); 2590 MachineFrameInfo *MFI = MF.getFrameInfo(); 2591 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2592 unsigned firstRegToSaveIndex; 2593 if (CCInfo.isFirstByValRegValid()) 2594 firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; 2595 else { 2596 firstRegToSaveIndex = CCInfo.getFirstUnallocated 2597 (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); 2598 } 2599 2600 unsigned VARegSize, VARegSaveSize; 2601 computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); 2602 if (VARegSaveSize) { 2603 // If this function is vararg, store any remaining integer argument regs 2604 // to their spots on the stack so that they may be loaded by deferencing 2605 // the result of va_next. 2606 AFI->setVarArgsRegSaveSize(VARegSaveSize); 2607 AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, 2608 ArgOffset + VARegSaveSize 2609 - VARegSize, 2610 false)); 2611 SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), 2612 getPointerTy()); 2613 2614 SmallVector<SDValue, 4> MemOps; 2615 for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) { 2616 const TargetRegisterClass *RC; 2617 if (AFI->isThumb1OnlyFunction()) 2618 RC = &ARM::tGPRRegClass; 2619 else 2620 RC = &ARM::GPRRegClass; 2621 2622 unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); 2623 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 2624 SDValue Store = 2625 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2626 MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i), 2627 false, false, 0); 2628 MemOps.push_back(Store); 2629 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 2630 DAG.getConstant(4, getPointerTy())); 2631 } 2632 if (!MemOps.empty()) 2633 Chain = 
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2634 &MemOps[0], MemOps.size()); 2635 } else 2636 // This will point to the next argument passed via stack. 2637 AFI->setVarArgsFrameIndex( 2638 MFI->CreateFixedObject(4, ArgOffset, !ForceMutable)); 2639 } 2640 2641 SDValue 2642 ARMTargetLowering::LowerFormalArguments(SDValue Chain, 2643 CallingConv::ID CallConv, bool isVarArg, 2644 const SmallVectorImpl<ISD::InputArg> 2645 &Ins, 2646 DebugLoc dl, SelectionDAG &DAG, 2647 SmallVectorImpl<SDValue> &InVals) 2648 const { 2649 MachineFunction &MF = DAG.getMachineFunction(); 2650 MachineFrameInfo *MFI = MF.getFrameInfo(); 2651 2652 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2653 2654 // Assign locations to all of the incoming arguments. 2655 SmallVector<CCValAssign, 16> ArgLocs; 2656 ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 2657 getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue); 2658 CCInfo.AnalyzeFormalArguments(Ins, 2659 CCAssignFnForNode(CallConv, /* Return*/ false, 2660 isVarArg)); 2661 2662 SmallVector<SDValue, 16> ArgValues; 2663 int lastInsIndex = -1; 2664 SDValue ArgValue; 2665 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2666 unsigned CurArgIdx = 0; 2667 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2668 CCValAssign &VA = ArgLocs[i]; 2669 std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx); 2670 CurArgIdx = Ins[VA.getValNo()].OrigArgIndex; 2671 // Arguments stored in registers. 2672 if (VA.isRegLoc()) { 2673 EVT RegVT = VA.getLocVT(); 2674 2675 if (VA.needsCustom()) { 2676 // f64 and vector types are split up into multiple registers or 2677 // combinations of registers and stack slots. 
        if (VA.getLocVT() == MVT::v2f64) {
          // A v2f64 is rebuilt from two f64 halves, each of which may itself
          // be split between registers and stack (note: ++i consumes the
          // extra ArgLocs entries for each half).
          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                   Chain, DAG, dl);
          VA = ArgLocs[++i]; // skip ahead to next loc
          SDValue ArgValue2;
          if (VA.isMemLoc()) {
            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(FI),
                                    false, false, false, 0);
          } else {
            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                             Chain, DAG, dl);
          }
          // Insert both f64 halves into an undef v2f64.
          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
        } else
          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);

      } else {
        // Plain register argument: pick the register class for the value type.
        const TargetRegisterClass *RC;

        if (RegVT == MVT::f32)
          RC = &ARM::SPRRegClass;
        else if (RegVT == MVT::f64)
          RC = &ARM::DPRRegClass;
        else if (RegVT == MVT::v2f64)
          RC = &ARM::QPRRegClass;
        else if (RegVT == MVT::i32)
          RC = AFI->isThumb1OnlyFunction() ?
            (const TargetRegisterClass*)&ARM::tGPRRegClass :
            (const TargetRegisterClass*)&ARM::GPRRegClass;
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);

    } else { // VA.isRegLoc()

      // sanity check
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = ArgLocs[i].getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex)
        {
          ISD::ArgFlagsTy Flags = Ins[index].Flags;
          // FIXME: For now, all byval parameter objects are marked mutable.
          // This can be changed with more analysis.
          // In case of tail call optimization mark all arguments mutable.
          // Since they could be overwritten by lowering of arguments in case of
          // a tail call.
        if (Flags.isByVal()) {
          // byval aggregate: hand back the address of its stack copy. The
          // first byval triggers the register-spill path (reusing the varargs
          // machinery) so that any register-passed portion lands adjacent to
          // the stack-passed portion.
          ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
          if (!AFI->getVarArgsFrameIndex()) {
            VarArgStyleRegisters(CCInfo, DAG,
                                 dl, Chain, CurOrigArg,
                                 Ins[VA.getValNo()].PartOffset,
                                 VA.getLocMemOffset(),
                                 true /*force mutable frames*/);
            int VAFrameIndex = AFI->getVarArgsFrameIndex();
            InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy()));
          } else {
            int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                            VA.getLocMemOffset(), false);
            InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
          }
        } else {
          int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                          VA.getLocMemOffset(), true);

          // Create load nodes to retrieve arguments from the stack.
          SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                       MachinePointerInfo::getFixedStack(FI),
                                       false, false, false, 0));
        }
        lastInsIndex = index;
      }
    }
  }

  // varargs
  if (isVarArg)
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
                         CCInfo.getNextStackOffset());

  return Chain;
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    // Check for a wrapped constant-pool load of +0.0.
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  }
  return false;
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
SDValue
ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &ARMcc, SelectionDAG &DAG,
                             DebugLoc dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate(C)) {
      // Constant does not fit, try adjusting it by one? Each case nudges the
      // immediate by 1 and relaxes/tightens the condition to compensate,
      // guarding against wraparound at the signed/unsigned extremes.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
        break;
      }
    }
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue
ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                             DebugLoc dl) const {
  SDValue Cmp;
  // Compare against +0.0 uses the cheaper compare-with-zero form.
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  DebugLoc DL = Cmp.getDebugLoc();
  // Integer compares can be cloned directly.
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));

  // VFP compares arrive as FMSTAT(CMPFP/CMPFPw0); clone both levels.
  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

/// LowerSELECT - Lower ISD::SELECT, folding a CMOV-of-0/1 condition directly
/// into a CMOV of the select operands when possible.
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
      dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        // The glue result of the original compare may already have a user;
        // clone the compare for this new CMOV.
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}

/// LowerSELECT_CC - Lower ISD::SELECT_CC to one (integer) or up to two (VFP,
/// when the FP condition needs two ARM condition codes) CMOV nodes.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  DebugLoc dl = Op.getDebugLoc();

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                               ARMcc, CCR, Cmp);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
                         Result, TrueVal, ARMcc2, CCR, Cmp2);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  // Loads can be re-issued as integer loads (see bitcastf32Toi32 below).
  return ISD::isNormalLoad(N);
}

/// bitcastf32Toi32 - Reinterpret an f32 compare operand as i32: +0.0 becomes
/// the constant 0, and a load is re-issued as an i32 load of the same address.
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, MVT::i32);

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
                       Ld->isVolatile(), Ld->isNonTemporal(),
                       Ld->isInvariant(), Ld->getAlignment());

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// expandf64Toi32 - Split an f64 compare operand into two i32 values: either
/// a pair of zero constants (for +0.0) or two i32 loads at offsets 0 and 4.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, MVT::i32);
    RetVal2 = DAG.getConstant(0, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
                          Ld->getChain(), Ptr,
                          Ld->getPointerInfo(),
                          Ld->isVolatile(), Ld->isNonTemporal(),
                          Ld->isInvariant(), Ld->getAlignment());

    // Second word: same load, 4 bytes further, with alignment reduced
    // accordingly.
    EVT PtrType = Ptr.getValueType();
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
                          Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4),
                          Ld->isVolatile(), Ld->isNonTemporal(),
                          Ld->isInvariant(), NewAlign);
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  DebugLoc dl = Op.getDebugLoc();

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    // Ordering no longer matters once we compare integer bit patterns.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask off the sign bit so that -0.0 compares equal to +0.0.
    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // f64: compare both 32-bit halves with a single BCC_i64 node (only the
    // high word carries the sign bit to mask).
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
  }

  return SDValue();
}

/// LowerBR_CC - Lower ISD::BR_CC into one (integer / single-condition FP) or
/// two (double-condition FP) conditional branches.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  DebugLoc dl = Op.getDebugLoc();

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Under unsafe-fp-math, equality branches may be turned into integer
  // compares (see OptimizeVFPBrcond).
  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    SDValue Result = OptimizeVFPBrcond(Op, DAG);
    if (Result.getNode())
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
  if (CondCode2 != ARMCC::AL) {
    // Chain a second conditional branch off the first for FP conditions that
    // need two ARM condition codes.
    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
  }
  return Res;
}

/// LowerBR_JT - Lower ISD::BR_JT (jump table branch) into BR_JT / BR2_JT
/// target nodes, loading the destination out of the table.
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();

  EVT PTy = getPointerTy();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
  // Each table entry is 4 bytes wide.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
  if (Subtarget->isThumb2()) {
    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later.
    // FIXME: This might not work if the function is extremely large.
3178 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 3179 Addr, Op.getOperand(2), JTI, UId); 3180 } 3181 if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { 3182 Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 3183 MachinePointerInfo::getJumpTable(), 3184 false, false, false, 0); 3185 Chain = Addr.getValue(1); 3186 Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); 3187 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); 3188 } else { 3189 Addr = DAG.getLoad(PTy, dl, Chain, Addr, 3190 MachinePointerInfo::getJumpTable(), 3191 false, false, false, 0); 3192 Chain = Addr.getValue(1); 3193 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); 3194 } 3195 } 3196 3197 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 3198 EVT VT = Op.getValueType(); 3199 DebugLoc dl = Op.getDebugLoc(); 3200 3201 if (Op.getValueType().getVectorElementType() == MVT::i32) { 3202 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 3203 return Op; 3204 return DAG.UnrollVectorOp(Op.getNode()); 3205 } 3206 3207 assert(Op.getOperand(0).getValueType() == MVT::v4f32 && 3208 "Invalid type for custom lowering!"); 3209 if (VT != MVT::v4i16) 3210 return DAG.UnrollVectorOp(Op.getNode()); 3211 3212 Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); 3213 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 3214 } 3215 3216 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 3217 EVT VT = Op.getValueType(); 3218 if (VT.isVector()) 3219 return LowerVectorFP_TO_INT(Op, DAG); 3220 3221 DebugLoc dl = Op.getDebugLoc(); 3222 unsigned Opc; 3223 3224 switch (Op.getOpcode()) { 3225 default: llvm_unreachable("Invalid opcode!"); 3226 case ISD::FP_TO_SINT: 3227 Opc = ARMISD::FTOSI; 3228 break; 3229 case ISD::FP_TO_UINT: 3230 Opc = ARMISD::FTOUI; 3231 break; 3232 } 3233 Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); 3234 return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); 3235 } 3236 
/// LowerVectorINT_TO_FP - Custom-lower a vector int->FP conversion. i32
/// sources with f32 results are legal as-is; v4i16->v4f32 extends to v4i32
/// first (sign- or zero-extend matching the conversion); everything else is
/// scalarized.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
         "Invalid type for custom lowering!");
  if (VT != MVT::v4f32)
    return DAG.UnrollVectorOp(Op.getNode());

  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    Opc = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    Opc = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}

/// LowerINT_TO_FP - Lower scalar int->FP: bitcast the i32 input into an f32
/// register, then convert with SITOF/UITOF. Vectors are handled by
/// LowerVectorINT_TO_FP.
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  DebugLoc dl = Op.getDebugLoc();
  unsigned Opc;

  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    Opc = ARMISD::SITOF;
    break;
  case ISD::UINT_TO_FP:
    Opc = ARMISD::UITOF;
    break;
  }

  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}

/// LowerFCOPYSIGN - Lower ISD::FCOPYSIGN. When the magnitude operand is not
/// already in GPRs and NEON is available, a VBSL-style mask/select copies the
/// sign bit in the vector pipeline; otherwise the sign is spliced in with
/// plain integer AND/OR on the bit patterns.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);  // magnitude
  SDValue Tmp1 = Op.getOperand(1);  // sign source
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // Mask = modified-immediate 0x80000000 splat (the f32 sign bit).
    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // Shift the mask up to bit 63, the f64 sign-bit position.
      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        // Move the f32 sign bit into the f64 sign-bit position.
        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, MVT::i32));
    } else if (VT == MVT::f32)
      // Move the f64 sign bit down into the f32 sign-bit position.
      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
                                            MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Res = (Tmp1 & Mask) | (Tmp0 & ~Mask): sign from Tmp1, rest from Tmp0.
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the high half (value #1 of VMOVRRD) carries the sign bit.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       &Tmp1, 1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     &Tmp0, 1);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}

/// LowerRETURNADDR - Lower ISD::RETURNADDR. Depth 0 returns LR (marked as an
/// implicit live-in); deeper frames load the saved return address at
/// frame-address + 4.
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(4, MVT::i32);
    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                       MachinePointerInfo(), false, false, false, 0);
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}

/// LowerFRAMEADDR - Lower ISD::FRAMEADDR. Starts from the frame pointer
/// register (R7 on Thumb/Darwin, R11 otherwise) and follows the saved
/// frame-pointer chain Depth times.
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
    ? ARM::R7 : ARM::R11;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo(),
                            false, false, false, 0);
  return FrameAddr;
}

/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  DebugLoc dl = N->getDebugLoc();
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i64 types, either as the
  // source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);
  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
         "ExpandBITCAST called for non-i64 type");

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(0, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(1, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
/// Zero vectors are used to represent vector negation and in those cases
/// will be implemented with the NEON VNEG instruction. However, VNEG does
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed. Regardless, use a canonical VMOV to create the
/// zero vector.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");
  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}

/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // For ShAmt < VTBits:  Lo = (Lo >> ShAmt) | (Hi << (VTBits - ShAmt)).
  // For ShAmt >= VTBits: Lo = Hi >>(s/u) (ShAmt - VTBits).
  // A CMOV on (ShAmt - VTBits) >= 0 picks between the two.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
                          ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
                           CCR, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // For ShAmt < VTBits:  Hi = (Hi << ShAmt) | (Lo >> (VTBits - ShAmt)).
  // For ShAmt >= VTBits: Hi = Lo << (ShAmt - VTBits).
  // A CMOV on (ShAmt - VTBits) >= 0 picks between the two.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
                          ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
                           CCR, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  DebugLoc dl = Op.getDebugLoc();
  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
                              DAG.getConstant(Intrinsic::arm_get_fpscr,
                                              MVT::i32));
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, MVT::i32));
  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                     DAG.getConstant(3, MVT::i32));
}

/// LowerCTTZ - Lower count-trailing-zeros as RBIT (bit reverse) followed by
/// CLZ. Requires the v6T2 RBIT instruction.
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  DebugLoc dl = N->getDebugLoc();

  if (!ST->hasV6T2Ops())
    return SDValue();

  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}

/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
/// for each 16-bit element from operand, repeated. The basic idea is to
/// leverage vcnt to get the 8-bit counts, gather and add the results.
///
/// Trace for v4i16:
/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
///            [b0 b1 b2 b3 b4 b5 b6 b7]
///           +[b1 b0 b3 b2 b5 b4 b7 b6]
/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  DebugLoc DL = N->getDebugLoc();

  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
}

/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
/// bit-count for each 16-bit element from the operand. We need slightly
/// different sequencing for v4i16 and v8i16 to stay within NEON's available
/// 64/128-bit registers.
///
/// Trace for v4i16:
/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
/// v4i16:Extracted = [k0    k1    k2    k3    ]
static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  DebugLoc DL = N->getDebugLoc();

  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
  if (VT.is64BitVector()) {
    // 64-bit result: widen the duplicated byte counts, then keep the low half.
    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
                       DAG.getIntPtrConstant(0));
  } else {
    // 128-bit result: keep the low byte half first, then widen it.
    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
                                    BitCounts, DAG.getIntPtrConstant(0));
    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
  }
}

/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
/// bit-count for each 32-bit element from the operand. The idea here is
/// to split the vector into 16-bit elements, leverage the 16-bit count
/// routine, and then combine the results.
///
/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
/// input    = [v0    v1    ] (vi: 32-bit elements)
/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
/// vrev: N0 = [k1 k0 k3 k2 ]
///            [k0 k1 k2 k3 ]
///       N1 =+[k1 k0 k3 k2 ]
///            [k0 k2 k1 k3 ]
///       N2 =+[k1 k3 k0 k2 ]
///            [k0 k2 k1 k3 ]
/// Extended =+[k1 k3 k0 k2 ]
///            [k0 k2       ]
/// Extracted=+[k1 k3       ]
///
static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  DebugLoc DL = N->getDebugLoc();

  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;

  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);

  if (VT.is64BitVector()) {
    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
                       DAG.getIntPtrConstant(0));
  } else {
    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
                                    DAG.getIntPtrConstant(0));
    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
  }
}

/// LowerCTPOP - Custom-lower vector CTPOP for the supported i16/i32 element
/// types, dispatching to the element-width-specific helpers above.
static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
          VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  if (VT.getVectorElementType() == MVT::i32)
    return lowerCTPOP32BitElements(N, DAG);
// (16-bit element path of LowerCTPOP, begun above.)
  else
    return lowerCTPOP16BitElements(N, DAG);
}

/// LowerShift - Lower vector SHL/SRA/SRL via the NEON vshift intrinsics.
/// Scalar shifts are left to generic lowering.
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  DebugLoc dl = N->getDebugLoc();

  if (!VT.isVector())
    return SDValue();

  // Lower vector shifts on NEON to use VSHL.
  assert(ST->hasNEON() && "unexpected vector shift");

  // Left shifts translate directly to the vshiftu intrinsic.
  if (N->getOpcode() == ISD::SHL)
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
                       N->getOperand(0), N->getOperand(1));

  assert((N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");

  // NEON uses the same intrinsics for both left and right shifts. For
  // right shifts, the shift amounts are negative, so negate the vector of
  // shift amounts.
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
                                     getZeroVector(ShiftVT, DAG, dl),
                                     N->getOperand(1));
  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
                             Intrinsic::arm_neon_vshifts :
                             Intrinsic::arm_neon_vshiftu);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(vshiftInt, MVT::i32),
                     N->getOperand(0), NegatedCount);
}

/// Expand64BitShift - Expand an i64 SRA/SRL by exactly 1 into a shift of the
/// high half that captures the carry, plus an RRX on the low half.
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  DebugLoc dl = N->getDebugLoc();

  // We can get here for a node like i32 = ISD::SHL i32, i64
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
         "Unknown shift to lower!");

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
    return SDValue();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb1Only()) return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(0, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
                           DAG.getConstant(1, MVT::i32));

  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  // captures the result into a carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}

/// LowerVSETCC - Lower a vector SETCC to the NEON compare nodes (VCEQ/VCGT/
/// VCGE and their unsigned/zero-operand variants), using operand swaps and
/// result inversion to cover the condition codes NEON lacks directly.
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  SDValue TmpOp0, TmpOp1;
  bool Invert = false;
  bool Swap = false;
  unsigned Opc = 0;

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  DebugLoc dl = Op.getDebugLoc();

  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal FP comparison");
    case ISD::SETUNE:
    case ISD::SETNE:  Invert = true; // Fallthrough
    case ISD::SETOEQ:
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETOLT:
    case ISD::SETLT: Swap = true; // Fallthrough
    case ISD::SETOGT:
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETOLE:
    case ISD::SETLE:  Swap = true; // Fallthrough
    case ISD::SETOGE:
    case ISD::SETGE: Opc = ARMISD::VCGE; break;
    case ISD::SETUGE: Swap = true; // Fallthrough
    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
    case ISD::SETUGT: Swap = true; // Fallthrough
    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
    case ISD::SETUEQ: Invert = true; // Fallthrough
    case ISD::SETONE:
      // Expand this to (OLT | OGT).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
      break;
    case ISD::SETUO: Invert = true; // Fallthrough
    case ISD::SETO:
      // Expand this to (OLT | OGE).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
      break;
    }
  } else {
    // Integer comparisons.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal integer comparison");
    case ISD::SETNE:  Invert = true; // Fallthrough
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETLT:  Swap = true; // Fallthrough
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETLE:  Swap = true; // Fallthrough
    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
    case ISD::SETULT: Swap = true; // Fallthrough
    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
    case ISD::SETULE: Swap = true; // Fallthrough
    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
    }

    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
    if (Opc == ARMISD::VCEQ) {

      SDValue AndOp;
      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
        AndOp = Op0;
      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
        AndOp = Op1;

      // Ignore bitconvert.
      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
        AndOp = AndOp.getOperand(0);

      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
        Opc = ARMISD::VTST;
        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
        // VTST computes "(a & b) != 0", so the VCEQ's Invert flips meaning.
        Invert = !Invert;
      }
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    // Zero on the left: flip GE/GT to the less-than-zero forms.
    if (Opc == ARMISD::VCGE)
      Opc = ARMISD::VCLEZ;
    else if (Opc == ARMISD::VCGT)
      Opc = ARMISD::VCLTZ;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    switch (Opc) {
    case ARMISD::VCEQ:
      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
    case ARMISD::VCGE:
      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
    case ARMISD::VCLEZ:
      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
    case ARMISD::VCGT:
      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
    case ARMISD::VCLTZ:
      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
    default:
      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
    }
  } else {
    Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
  }

  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON instruction with a "modified immediate"
/// operand (e.g., VMOV).  If so, return the encoded value.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 EVT &VT, bool is128Bits, NEONModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8.  However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK.  Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      SplatBits |= 0xff;
      break;
    }

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      SplatBits |= 0xffff;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32.  A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat.  But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    uint64_t BitMask = 0xff;
    uint64_t Val = 0;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Val |= BitMask;
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        // A byte that is neither all-ones nor all-zeros is not encodable.
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }
    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    SplatBits = Val;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isNEONModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, MVT::i32);
}

SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
    return SDValue();

  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  assert(Op.getValueType() == MVT::f32 &&
         "ConstantFP custom lowering should only occur for f32.");

  // Try splatting with a VMOV.f32...
  APFloat FPVal = CFP->getValueAPF();
  int ImmVal = ARM_AM::getFP32Imm(FPVal);
  if (ImmVal != -1) {
    DebugLoc DL = Op.getDebugLoc();
    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, MVT::i32));
  }

  // If that fails, try a VMOV.i32
  EVT VMovVT;
  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
                                     VMOVModImm);
  if (NewVal != SDValue()) {
    DebugLoc DL = Op.getDebugLoc();
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, MVT::i32));
  }

  // Finally, try a VMVN.i32
  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
                             VMVNModImm);
  if (NewVal != SDValue()) {
    DebugLoc DL = Op.getDebugLoc();
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
    SDValue
VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 4056 VecConstant); 4057 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 4058 DAG.getConstant(0, MVT::i32)); 4059 } 4060 4061 return SDValue(); 4062 } 4063 4064 // check if an VEXT instruction can handle the shuffle mask when the 4065 // vector sources of the shuffle are the same. 4066 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 4067 unsigned NumElts = VT.getVectorNumElements(); 4068 4069 // Assume that the first shuffle index is not UNDEF. Fail if it is. 4070 if (M[0] < 0) 4071 return false; 4072 4073 Imm = M[0]; 4074 4075 // If this is a VEXT shuffle, the immediate value is the index of the first 4076 // element. The other shuffle indices must be the successive elements after 4077 // the first one. 4078 unsigned ExpectedElt = Imm; 4079 for (unsigned i = 1; i < NumElts; ++i) { 4080 // Increment the expected index. If it wraps around, just follow it 4081 // back to index zero and keep going. 4082 ++ExpectedElt; 4083 if (ExpectedElt == NumElts) 4084 ExpectedElt = 0; 4085 4086 if (M[i] < 0) continue; // ignore UNDEF indices 4087 if (ExpectedElt != static_cast<unsigned>(M[i])) 4088 return false; 4089 } 4090 4091 return true; 4092 } 4093 4094 4095 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 4096 bool &ReverseVEXT, unsigned &Imm) { 4097 unsigned NumElts = VT.getVectorNumElements(); 4098 ReverseVEXT = false; 4099 4100 // Assume that the first shuffle index is not UNDEF. Fail if it is. 4101 if (M[0] < 0) 4102 return false; 4103 4104 Imm = M[0]; 4105 4106 // If this is a VEXT shuffle, the immediate value is the index of the first 4107 // element. The other shuffle indices must be the successive elements after 4108 // the first one. 4109 unsigned ExpectedElt = Imm; 4110 for (unsigned i = 1; i < NumElts; ++i) { 4111 // Increment the expected index. If it wraps around, it may still be 4112 // a VEXT but the source vectors must be swapped. 
4113 ExpectedElt += 1; 4114 if (ExpectedElt == NumElts * 2) { 4115 ExpectedElt = 0; 4116 ReverseVEXT = true; 4117 } 4118 4119 if (M[i] < 0) continue; // ignore UNDEF indices 4120 if (ExpectedElt != static_cast<unsigned>(M[i])) 4121 return false; 4122 } 4123 4124 // Adjust the index value if the source operands will be swapped. 4125 if (ReverseVEXT) 4126 Imm -= NumElts; 4127 4128 return true; 4129 } 4130 4131 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 4132 /// instruction with the specified blocksize. (The order of the elements 4133 /// within each block of the vector is reversed.) 4134 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 4135 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 4136 "Only possible block sizes for VREV are: 16, 32, 64"); 4137 4138 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4139 if (EltSz == 64) 4140 return false; 4141 4142 unsigned NumElts = VT.getVectorNumElements(); 4143 unsigned BlockElts = M[0] + 1; 4144 // If the first shuffle index is UNDEF, be optimistic. 4145 if (M[0] < 0) 4146 BlockElts = BlockSize / EltSz; 4147 4148 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 4149 return false; 4150 4151 for (unsigned i = 0; i < NumElts; ++i) { 4152 if (M[i] < 0) continue; // ignore UNDEF indices 4153 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 4154 return false; 4155 } 4156 4157 return true; 4158 } 4159 4160 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 4161 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 4162 // range, then 0 is placed into the resulting vector. So pretty much any mask 4163 // of 8 elements can work here. 
4164 return VT == MVT::v8i8 && M.size() == 8; 4165 } 4166 4167 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4168 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4169 if (EltSz == 64) 4170 return false; 4171 4172 unsigned NumElts = VT.getVectorNumElements(); 4173 WhichResult = (M[0] == 0 ? 0 : 1); 4174 for (unsigned i = 0; i < NumElts; i += 2) { 4175 if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || 4176 (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) 4177 return false; 4178 } 4179 return true; 4180 } 4181 4182 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 4183 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4184 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 4185 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 4186 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4187 if (EltSz == 64) 4188 return false; 4189 4190 unsigned NumElts = VT.getVectorNumElements(); 4191 WhichResult = (M[0] == 0 ? 0 : 1); 4192 for (unsigned i = 0; i < NumElts; i += 2) { 4193 if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || 4194 (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) 4195 return false; 4196 } 4197 return true; 4198 } 4199 4200 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4201 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4202 if (EltSz == 64) 4203 return false; 4204 4205 unsigned NumElts = VT.getVectorNumElements(); 4206 WhichResult = (M[0] == 0 ? 0 : 1); 4207 for (unsigned i = 0; i != NumElts; ++i) { 4208 if (M[i] < 0) continue; // ignore UNDEF indices 4209 if ((unsigned) M[i] != 2 * i + WhichResult) 4210 return false; 4211 } 4212 4213 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
4214 if (VT.is64BitVector() && EltSz == 32) 4215 return false; 4216 4217 return true; 4218 } 4219 4220 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 4221 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4222 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 4223 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 4224 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4225 if (EltSz == 64) 4226 return false; 4227 4228 unsigned Half = VT.getVectorNumElements() / 2; 4229 WhichResult = (M[0] == 0 ? 0 : 1); 4230 for (unsigned j = 0; j != 2; ++j) { 4231 unsigned Idx = WhichResult; 4232 for (unsigned i = 0; i != Half; ++i) { 4233 int MIdx = M[i + j * Half]; 4234 if (MIdx >= 0 && (unsigned) MIdx != Idx) 4235 return false; 4236 Idx += 2; 4237 } 4238 } 4239 4240 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 4241 if (VT.is64BitVector() && EltSz == 32) 4242 return false; 4243 4244 return true; 4245 } 4246 4247 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4248 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4249 if (EltSz == 64) 4250 return false; 4251 4252 unsigned NumElts = VT.getVectorNumElements(); 4253 WhichResult = (M[0] == 0 ? 0 : 1); 4254 unsigned Idx = WhichResult * NumElts / 2; 4255 for (unsigned i = 0; i != NumElts; i += 2) { 4256 if ((M[i] >= 0 && (unsigned) M[i] != Idx) || 4257 (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) 4258 return false; 4259 Idx += 1; 4260 } 4261 4262 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 4263 if (VT.is64BitVector() && EltSz == 32) 4264 return false; 4265 4266 return true; 4267 } 4268 4269 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 4270 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4271 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 
4272 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 4273 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4274 if (EltSz == 64) 4275 return false; 4276 4277 unsigned NumElts = VT.getVectorNumElements(); 4278 WhichResult = (M[0] == 0 ? 0 : 1); 4279 unsigned Idx = WhichResult * NumElts / 2; 4280 for (unsigned i = 0; i != NumElts; i += 2) { 4281 if ((M[i] >= 0 && (unsigned) M[i] != Idx) || 4282 (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) 4283 return false; 4284 Idx += 1; 4285 } 4286 4287 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 4288 if (VT.is64BitVector() && EltSz == 32) 4289 return false; 4290 4291 return true; 4292 } 4293 4294 // If N is an integer constant that can be moved into a register in one 4295 // instruction, return an SDValue of such a constant (will become a MOV 4296 // instruction). Otherwise return null. 4297 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 4298 const ARMSubtarget *ST, DebugLoc dl) { 4299 uint64_t Val; 4300 if (!isa<ConstantSDNode>(N)) 4301 return SDValue(); 4302 Val = cast<ConstantSDNode>(N)->getZExtValue(); 4303 4304 if (ST->isThumb1Only()) { 4305 if (Val <= 255 || ~Val <= 255) 4306 return DAG.getConstant(Val, MVT::i32); 4307 } else { 4308 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 4309 return DAG.getConstant(Val, MVT::i32); 4310 } 4311 return SDValue(); 4312 } 4313 4314 // If this is a case we can't handle, return null and let the default 4315 // expansion code take care of it. 
// Lower a BUILD_VECTOR node: try immediate VMOV/VMVN splats, VMOV.f32
// splats, VDUP/VDUPLANE of a dominant value (patching the remaining lanes
// with INSERT_VECTOR_ELT), shuffle reconstruction, or an
// ARMISD::BUILD_VECTOR of subregister assignments.  Returns a null SDValue
// to fall back to the default expansion.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      // Check if an immediate VMOV works.
      EVT VmovVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, VmovVT, VT.is128BitVector(),
                                      VMOVModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isNEONModifiedImm(NegatedImm,
                              SplatUndef.getZExtValue(), SplatBitSize,
                              DAG, VmovVT, VT.is128BitVector(),
                              VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool hasDominantValue = false;
  bool isConstant = true;

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() == ISD::UNDEF)
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  // No dominant value found; fall back to any defined value.
  if (!Value.getNode() && ValueCounts.size() > 0)
    Value = ValueCounts.begin()->first;

  // All lanes undef.
  if (ValueCounts.size() == 0)
    return DAG.getUNDEF(VT);

  if (isOnlyLowElement)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getVectorElementType().getSizeInBits();

  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE.
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create.  We will insert the
        // element such that the register coalescer will remove unnecessary
        // copies.
        if (VT != Value->getOperand(0).getValueType()) {
          ConstantSDNode *constIndex;
          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
          assert(constIndex && "The index is not a constant!");
          // Wrap the lane index into the range of the destination type.
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                           VT.getVectorNumElements();
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                        Value, DAG.getConstant(index, MVT::i32)),
                           DAG.getConstant(index, MVT::i32));
        } else {
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                        Value->getOperand(0), Value->getOperand(1));
        }
      }
      else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
        }
      }
      return N;
    }
    // Constant splat of FP elements: retry on the i32 bit pattern.
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    if (usesOnlyOneValue) {
      // A constant that fits in one MOV can be VDUPed from a GPR.
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  return SDValue();
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Per distinct source vector: the vector itself and the min/max lane
  // extracted from it (used to decide which half, or VEXT offset, to take).
  SmallVector<SDValue, 2> SourceVecs;
  SmallVector<unsigned, 2> MinElts;
  SmallVector<unsigned, 2> MaxElts;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() == ISD::UNDEF)
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
               VT.getVectorElementType()) {
      // This code doesn't know how to handle shuffles where the vector
      // element types do not match (this happens because type legalization
      // promotes the return type of EXTRACT_VECTOR_ELT).
      // FIXME: It might be appropriate to extend this code to handle
      // mismatched types.
      return SDValue();
    }

    // Record this extraction against the appropriate vector if possible...
    SDValue SourceVec = V.getOperand(0);
    // If the element number isn't a constant, we can't effectively
    // analyze what's going on.
    if (!isa<ConstantSDNode>(V.getOperand(1)))
      return SDValue();
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    bool FoundSource = false;
    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
      if (SourceVecs[j] == SourceVec) {
        if (MinElts[j] > EltNo)
          MinElts[j] = EltNo;
        if (MaxElts[j] < EltNo)
          MaxElts[j] = EltNo;
        FoundSource = true;
        break;
      }
    }

    // Or record a new source if not...
    if (!FoundSource) {
      SourceVecs.push_back(SourceVec);
      MinElts.push_back(EltNo);
      MaxElts.push_back(EltNo);
    }
  }

  // Currently only do something sane when at most two source vectors
  // involved.
  if (SourceVecs.size() > 2)
    return SDValue();

  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
  int VEXTOffsets[2] = {0, 0};

  // This loop extracts the usage patterns of the source vectors
  // and prepares appropriate SDValues for a shuffle if possible.
  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
    if (SourceVecs[i].getValueType() == VT) {
      // No VEXT necessary
      ShuffleSrcs[i] = SourceVecs[i];
      VEXTOffsets[i] = 0;
      continue;
    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
      // It probably isn't worth padding out a smaller vector just to
      // break it down again in a shuffle.
      return SDValue();
    }

    // Since only 64-bit and 128-bit vectors are legal on ARM and
    // we've eliminated the other cases...
    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
           "unexpected vector sizes in ReconstructShuffle");

    if (MaxElts[i] - MinElts[i] >= NumElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (MinElts[i] >= NumElts) {
      // The extraction can just take the second half
      VEXTOffsets[i] = NumElts;
      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                   SourceVecs[i],
                                   DAG.getIntPtrConstant(NumElts));
    } else if (MaxElts[i] < NumElts) {
      // The extraction can just take the first half
      VEXTOffsets[i] = 0;
      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                   SourceVecs[i],
                                   DAG.getIntPtrConstant(0));
    } else {
      // An actual VEXT is needed
      VEXTOffsets[i] = MinElts[i];
      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                     SourceVecs[i],
                                     DAG.getIntPtrConstant(0));
      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                     SourceVecs[i],
                                     DAG.getIntPtrConstant(NumElts));
      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
    }
  }

  // Build the shuffle mask in terms of the two prepared sources, rebasing
  // each lane index by the VEXT offset of its source.
  SmallVector<int, 8> Mask;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.getOpcode() == ISD::UNDEF) {
      Mask.push_back(-1);
      continue;
    }

    SDValue ExtractVec = Entry.getOperand(0);
    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
                                          .getOperand(1))->getSExtValue();
    if (ExtractVec == SourceVecs[0]) {
      Mask.push_back(ExtractElt - VEXTOffsets[0]);
    } else {
      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
    }
  }

  // Final check before we try to produce nonsense...
  if (isShuffleMaskLegal(Mask, VT))
    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
                                &Mask[0]);

  return SDValue();
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return true;
  }

  bool ReverseVEXT;
  unsigned Imm, WhichResult;

  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  return (EltSize >= 32 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isVREVMask(M, VT, 64) ||
          isVREVMask(M, VT, 32) ||
          isVREVMask(M, VT, 16) ||
          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
          isVTBLMask(M, VT) ||
          isVTRNMask(M, VT, WhichResult) ||
          isVUZPMask(M, VT, WhichResult) ||
          isVZIPMask(M, VT, WhichResult) ||
          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
          isVZIP_v_undef_Mask(M, VT, WhichResult));
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      DebugLoc dl) {
  // PFEntry layout: [31:30] cost, [29:26] opcode, [25:13] LHS id,
  // [12:0] RHS id (ids recursively index PerfectShuffleTable).
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL, // VUZP, left result
    OP_VUZPR, // VUZP, right result
    OP_VZIPL, // VZIP, left result
    OP_VZIPR, // VZIP, right result
    OP_VTRNL, // VTRN, left result
    OP_VTRNR  // VTRN, right result
  };

  if (OpNum == OP_COPY) {
    // The identity ids <0,1,2,3> and <4,5,6,7> select LHS and RHS directly.
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize both sub-shuffles, then combine them.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}

// Lower an arbitrary <8 x i8> shuffle to VTBL1 (one source) or VTBL2 (two
// sources), passing the shuffle mask as a byte-index vector.
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc DL = Op.getDebugLoc();

  SmallVector<SDValue, 8> VTBLMask;
  for (ArrayRef<int>::iterator
         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));

  if (V2.getNode()->getOpcode() == ISD::UNDEF)
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
                                   &VTBLMask[0], 8));

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
                                 &VTBLMask[0], 8));
}

static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection.  This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  if (EltSize <= 32) {
    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, MVT::i32));
    }

    bool ReverseVEXT;
    unsigned Imm;
    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // Single-source VEXT: rotate V1 against itself.
    if (V2->getOpcode() == ISD::UNDEF &&
        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult;
    if (isVTRNMask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                         V1, V2).getValue(WhichResult);
    if (isVUZPMask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                         V1, V2).getValue(WhichResult);
    if (isVZIPMask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                         V1, V2).getValue(WhichResult);

    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                         V1, V1).getValue(WhichResult);
    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                         V1, V1).getValue(WhichResult);
    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                         V1, V1).getValue(WhichResult);
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (VT == MVT::v8i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  return SDValue();
}

static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  return Op;
}

static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  if (Op.getValueType() == MVT::i32 &&
      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
    // Sub-32-bit elements extracted to i32: use the lane-extract node that
    // zero-extends (VGETLANEu).
    DebugLoc dl = Op.getDebugLoc();
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  return Op;
}

static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
4967 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 4968 "unexpected CONCAT_VECTORS"); 4969 DebugLoc dl = Op.getDebugLoc(); 4970 SDValue Val = DAG.getUNDEF(MVT::v2f64); 4971 SDValue Op0 = Op.getOperand(0); 4972 SDValue Op1 = Op.getOperand(1); 4973 if (Op0.getOpcode() != ISD::UNDEF) 4974 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 4975 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 4976 DAG.getIntPtrConstant(0)); 4977 if (Op1.getOpcode() != ISD::UNDEF) 4978 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 4979 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 4980 DAG.getIntPtrConstant(1)); 4981 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 4982 } 4983 4984 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 4985 /// element has been zero/sign-extended, depending on the isSigned parameter, 4986 /// from an integer type half its size. 4987 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 4988 bool isSigned) { 4989 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 4990 EVT VT = N->getValueType(0); 4991 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 4992 SDNode *BVN = N->getOperand(0).getNode(); 4993 if (BVN->getValueType(0) != MVT::v4i32 || 4994 BVN->getOpcode() != ISD::BUILD_VECTOR) 4995 return false; 4996 unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 
1 : 0; 4997 unsigned HiElt = 1 - LoElt; 4998 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 4999 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 5000 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 5001 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 5002 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 5003 return false; 5004 if (isSigned) { 5005 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 5006 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 5007 return true; 5008 } else { 5009 if (Hi0->isNullValue() && Hi1->isNullValue()) 5010 return true; 5011 } 5012 return false; 5013 } 5014 5015 if (N->getOpcode() != ISD::BUILD_VECTOR) 5016 return false; 5017 5018 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 5019 SDNode *Elt = N->getOperand(i).getNode(); 5020 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 5021 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 5022 unsigned HalfSize = EltSize / 2; 5023 if (isSigned) { 5024 if (!isIntN(HalfSize, C->getSExtValue())) 5025 return false; 5026 } else { 5027 if (!isUIntN(HalfSize, C->getZExtValue())) 5028 return false; 5029 } 5030 continue; 5031 } 5032 return false; 5033 } 5034 5035 return true; 5036 } 5037 5038 /// isSignExtended - Check if a node is a vector value that is sign-extended 5039 /// or a constant BUILD_VECTOR with sign-extended elements. 5040 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 5041 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 5042 return true; 5043 if (isExtendedBUILD_VECTOR(N, DAG, true)) 5044 return true; 5045 return false; 5046 } 5047 5048 /// isZeroExtended - Check if a node is a vector value that is zero-extended 5049 /// or a constant BUILD_VECTOR with zero-extended elements. 
5050 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 5051 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 5052 return true; 5053 if (isExtendedBUILD_VECTOR(N, DAG, false)) 5054 return true; 5055 return false; 5056 } 5057 5058 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 5059 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 5060 /// We insert the required extension here to get the vector to fill a D register. 5061 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 5062 const EVT &OrigTy, 5063 const EVT &ExtTy, 5064 unsigned ExtOpcode) { 5065 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 5066 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 5067 // 64-bits we need to insert a new extension so that it will be 64-bits. 5068 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 5069 if (OrigTy.getSizeInBits() >= 64) 5070 return N; 5071 5072 // Must extend size to at least 64 bits to be used as an operand for VMULL. 5073 MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy; 5074 EVT NewVT; 5075 switch (OrigSimpleTy) { 5076 default: llvm_unreachable("Unexpected Orig Vector Type"); 5077 case MVT::v2i8: 5078 case MVT::v2i16: 5079 NewVT = MVT::v2i32; 5080 break; 5081 case MVT::v4i8: 5082 NewVT = MVT::v4i16; 5083 break; 5084 } 5085 return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); 5086 } 5087 5088 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 5089 /// does not do any sign/zero extension. If the original vector is less 5090 /// than 64 bits, an appropriate extension will be added after the load to 5091 /// reach a total size of 64 bits. We have to add the extension separately 5092 /// because ARM does not have a sign/zero extending load for vectors. 
5093 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 5094 SDValue NonExtendingLoad = 5095 DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), 5096 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), 5097 LD->isNonTemporal(), LD->isInvariant(), 5098 LD->getAlignment()); 5099 unsigned ExtOp = 0; 5100 switch (LD->getExtensionType()) { 5101 default: llvm_unreachable("Unexpected LoadExtType"); 5102 case ISD::EXTLOAD: 5103 case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break; 5104 case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break; 5105 } 5106 MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy; 5107 MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy; 5108 return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG, 5109 MemType, ExtType, ExtOp); 5110 } 5111 5112 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 5113 /// extending load, or BUILD_VECTOR with extended elements, return the 5114 /// unextended value. The unextended vector should be 64 bits so that it can 5115 /// be used as an operand to a VMULL instruction. If the original vector size 5116 /// before extension is less than 64 bits we add a an extension to resize 5117 /// the vector to 64 bits. 5118 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 5119 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 5120 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 5121 N->getOperand(0)->getValueType(0), 5122 N->getValueType(0), 5123 N->getOpcode()); 5124 5125 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) 5126 return SkipLoadExtensionForVMULL(LD, DAG); 5127 5128 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 5129 // have been legalized as a BITCAST from v4i32. 
5130 if (N->getOpcode() == ISD::BITCAST) { 5131 SDNode *BVN = N->getOperand(0).getNode(); 5132 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 5133 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 5134 unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; 5135 return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, 5136 BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); 5137 } 5138 // Construct a new BUILD_VECTOR with elements truncated to half the size. 5139 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 5140 EVT VT = N->getValueType(0); 5141 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 5142 unsigned NumElts = VT.getVectorNumElements(); 5143 MVT TruncVT = MVT::getIntegerVT(EltSize); 5144 SmallVector<SDValue, 8> Ops; 5145 for (unsigned i = 0; i != NumElts; ++i) { 5146 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 5147 const APInt &CInt = C->getAPIntValue(); 5148 // Element types smaller than 32 bits are not legal, so use i32 elements. 5149 // The values are implicitly truncated so sext vs. zext doesn't matter. 
5150 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); 5151 } 5152 return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), 5153 MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); 5154 } 5155 5156 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 5157 unsigned Opcode = N->getOpcode(); 5158 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 5159 SDNode *N0 = N->getOperand(0).getNode(); 5160 SDNode *N1 = N->getOperand(1).getNode(); 5161 return N0->hasOneUse() && N1->hasOneUse() && 5162 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 5163 } 5164 return false; 5165 } 5166 5167 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 5168 unsigned Opcode = N->getOpcode(); 5169 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 5170 SDNode *N0 = N->getOperand(0).getNode(); 5171 SDNode *N1 = N->getOperand(1).getNode(); 5172 return N0->hasOneUse() && N1->hasOneUse() && 5173 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 5174 } 5175 return false; 5176 } 5177 5178 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 5179 // Multiplications are only custom-lowered for 128-bit vectors so that 5180 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 5181 EVT VT = Op.getValueType(); 5182 assert(VT.is128BitVector() && VT.isInteger() && 5183 "unexpected type for custom-lowering ISD::MUL"); 5184 SDNode *N0 = Op.getOperand(0).getNode(); 5185 SDNode *N1 = Op.getOperand(1).getNode(); 5186 unsigned NewOpc = 0; 5187 bool isMLA = false; 5188 bool isN0SExt = isSignExtended(N0, DAG); 5189 bool isN1SExt = isSignExtended(N1, DAG); 5190 if (isN0SExt && isN1SExt) 5191 NewOpc = ARMISD::VMULLs; 5192 else { 5193 bool isN0ZExt = isZeroExtended(N0, DAG); 5194 bool isN1ZExt = isZeroExtended(N1, DAG); 5195 if (isN0ZExt && isN1ZExt) 5196 NewOpc = ARMISD::VMULLu; 5197 else if (isN1SExt || isN1ZExt) { 5198 // Look for (s/zext A + s/zext B) * (s/zext C). 
We want to turn these 5199 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 5200 if (isN1SExt && isAddSubSExt(N0, DAG)) { 5201 NewOpc = ARMISD::VMULLs; 5202 isMLA = true; 5203 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 5204 NewOpc = ARMISD::VMULLu; 5205 isMLA = true; 5206 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 5207 std::swap(N0, N1); 5208 NewOpc = ARMISD::VMULLu; 5209 isMLA = true; 5210 } 5211 } 5212 5213 if (!NewOpc) { 5214 if (VT == MVT::v2i64) 5215 // Fall through to expand this. It is not legal. 5216 return SDValue(); 5217 else 5218 // Other vector multiplications are legal. 5219 return Op; 5220 } 5221 } 5222 5223 // Legalize to a VMULL instruction. 5224 DebugLoc DL = Op.getDebugLoc(); 5225 SDValue Op0; 5226 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 5227 if (!isMLA) { 5228 Op0 = SkipExtensionForVMULL(N0, DAG); 5229 assert(Op0.getValueType().is64BitVector() && 5230 Op1.getValueType().is64BitVector() && 5231 "unexpected types for extended operands to VMULL"); 5232 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 5233 } 5234 5235 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 5236 // isel lowering to take advantage of no-stall back to back vmul + vmla. 
5237 // vmull q0, d4, d6 5238 // vmlal q0, d5, d6 5239 // is faster than 5240 // vaddl q0, d4, d5 5241 // vmovl q1, d6 5242 // vmul q0, q0, q1 5243 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 5244 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 5245 EVT Op1VT = Op1.getValueType(); 5246 return DAG.getNode(N0->getOpcode(), DL, VT, 5247 DAG.getNode(NewOpc, DL, VT, 5248 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 5249 DAG.getNode(NewOpc, DL, VT, 5250 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 5251 } 5252 5253 static SDValue 5254 LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { 5255 // Convert to float 5256 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 5257 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 5258 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 5259 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 5260 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 5261 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 5262 // Get reciprocal estimate. 5263 // float4 recip = vrecpeq_f32(yf); 5264 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5265 DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y); 5266 // Because char has a smaller range than uchar, we can actually get away 5267 // without any newton steps. This requires that we use a weird bias 5268 // of 0xb000, however (again, this has been exhaustively tested). 5269 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 5270 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 5271 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 5272 Y = DAG.getConstant(0xb000, MVT::i32); 5273 Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); 5274 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 5275 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 5276 // Convert back to short. 
5277 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 5278 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 5279 return X; 5280 } 5281 5282 static SDValue 5283 LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) { 5284 SDValue N2; 5285 // Convert to float. 5286 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 5287 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 5288 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 5289 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 5290 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 5291 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 5292 5293 // Use reciprocal estimate and one refinement step. 5294 // float4 recip = vrecpeq_f32(yf); 5295 // recip *= vrecpsq_f32(yf, recip); 5296 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5297 DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); 5298 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5299 DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), 5300 N1, N2); 5301 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 5302 // Because short has a smaller range than ushort, we can actually get away 5303 // with only a single newton step. This requires that we use a weird bias 5304 // of 89, however (again, this has been exhaustively tested). 5305 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 5306 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 5307 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 5308 N1 = DAG.getConstant(0x89, MVT::i32); 5309 N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); 5310 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 5311 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 5312 // Convert back to integer and return. 
5313 // return vmovn_s32(vcvt_s32_f32(result)); 5314 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 5315 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 5316 return N0; 5317 } 5318 5319 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 5320 EVT VT = Op.getValueType(); 5321 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 5322 "unexpected type for custom-lowering ISD::SDIV"); 5323 5324 DebugLoc dl = Op.getDebugLoc(); 5325 SDValue N0 = Op.getOperand(0); 5326 SDValue N1 = Op.getOperand(1); 5327 SDValue N2, N3; 5328 5329 if (VT == MVT::v8i8) { 5330 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 5331 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 5332 5333 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 5334 DAG.getIntPtrConstant(4)); 5335 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 5336 DAG.getIntPtrConstant(4)); 5337 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 5338 DAG.getIntPtrConstant(0)); 5339 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 5340 DAG.getIntPtrConstant(0)); 5341 5342 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 5343 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 5344 5345 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 5346 N0 = LowerCONCAT_VECTORS(N0, DAG); 5347 5348 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 5349 return N0; 5350 } 5351 return LowerSDIV_v4i16(N0, N1, dl, DAG); 5352 } 5353 5354 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 5355 EVT VT = Op.getValueType(); 5356 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 5357 "unexpected type for custom-lowering ISD::UDIV"); 5358 5359 DebugLoc dl = Op.getDebugLoc(); 5360 SDValue N0 = Op.getOperand(0); 5361 SDValue N1 = Op.getOperand(1); 5362 SDValue N2, N3; 5363 5364 if (VT == MVT::v8i8) { 5365 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 5366 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 5367 5368 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, 
MVT::v4i16, N0, 5369 DAG.getIntPtrConstant(4)); 5370 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 5371 DAG.getIntPtrConstant(4)); 5372 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 5373 DAG.getIntPtrConstant(0)); 5374 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 5375 DAG.getIntPtrConstant(0)); 5376 5377 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 5378 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 5379 5380 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 5381 N0 = LowerCONCAT_VECTORS(N0, DAG); 5382 5383 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 5384 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), 5385 N0); 5386 return N0; 5387 } 5388 5389 // v4i16 sdiv ... Convert to float. 5390 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 5391 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 5392 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 5393 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 5394 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 5395 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 5396 5397 // Use reciprocal estimate and two refinement steps. 
5398 // float4 recip = vrecpeq_f32(yf); 5399 // recip *= vrecpsq_f32(yf, recip); 5400 // recip *= vrecpsq_f32(yf, recip); 5401 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5402 DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1); 5403 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5404 DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), 5405 BN1, N2); 5406 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 5407 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 5408 DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), 5409 BN1, N2); 5410 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 5411 // Simply multiplying by the reciprocal estimate can leave us a few ulps 5412 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 5413 // and that it will never cause us to return an answer too large). 5414 // float4 result = as_float4(as_int4(xf*recip) + 2); 5415 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 5416 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 5417 N1 = DAG.getConstant(2, MVT::i32); 5418 N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); 5419 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 5420 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 5421 // Convert back to integer and return. 
5422 // return vmovn_u32(vcvt_s32_f32(result)); 5423 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 5424 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 5425 return N0; 5426 } 5427 5428 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 5429 EVT VT = Op.getNode()->getValueType(0); 5430 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 5431 5432 unsigned Opc; 5433 bool ExtraOp = false; 5434 switch (Op.getOpcode()) { 5435 default: llvm_unreachable("Invalid code"); 5436 case ISD::ADDC: Opc = ARMISD::ADDC; break; 5437 case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; 5438 case ISD::SUBC: Opc = ARMISD::SUBC; break; 5439 case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; 5440 } 5441 5442 if (!ExtraOp) 5443 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 5444 Op.getOperand(1)); 5445 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 5446 Op.getOperand(1), Op.getOperand(2)); 5447 } 5448 5449 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 5450 // Monotonic load/store is legal for all targets 5451 if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic) 5452 return Op; 5453 5454 // Aquire/Release load/store is not legal for targets without a 5455 // dmb or equivalent available. 
5456 return SDValue(); 5457 } 5458 5459 5460 static void 5461 ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results, 5462 SelectionDAG &DAG, unsigned NewOp) { 5463 DebugLoc dl = Node->getDebugLoc(); 5464 assert (Node->getValueType(0) == MVT::i64 && 5465 "Only know how to expand i64 atomics"); 5466 5467 SmallVector<SDValue, 6> Ops; 5468 Ops.push_back(Node->getOperand(0)); // Chain 5469 Ops.push_back(Node->getOperand(1)); // Ptr 5470 // Low part of Val1 5471 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5472 Node->getOperand(2), DAG.getIntPtrConstant(0))); 5473 // High part of Val1 5474 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5475 Node->getOperand(2), DAG.getIntPtrConstant(1))); 5476 if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) { 5477 // High part of Val1 5478 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5479 Node->getOperand(3), DAG.getIntPtrConstant(0))); 5480 // High part of Val2 5481 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5482 Node->getOperand(3), DAG.getIntPtrConstant(1))); 5483 } 5484 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 5485 SDValue Result = 5486 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64, 5487 cast<MemSDNode>(Node)->getMemOperand()); 5488 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) }; 5489 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 5490 Results.push_back(Result.getValue(2)); 5491 } 5492 5493 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 5494 switch (Op.getOpcode()) { 5495 default: llvm_unreachable("Don't know how to custom lower this!"); 5496 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 5497 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 5498 case ISD::GlobalAddress: 5499 return Subtarget->isTargetDarwin() ? 
LowerGlobalAddressDarwin(Op, DAG) : 5500 LowerGlobalAddressELF(Op, DAG); 5501 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 5502 case ISD::SELECT: return LowerSELECT(Op, DAG); 5503 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 5504 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 5505 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 5506 case ISD::VASTART: return LowerVASTART(Op, DAG); 5507 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); 5508 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 5509 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 5510 case ISD::SINT_TO_FP: 5511 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 5512 case ISD::FP_TO_SINT: 5513 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 5514 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 5515 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 5516 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 5517 case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); 5518 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 5519 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 5520 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 5521 Subtarget); 5522 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); 5523 case ISD::SHL: 5524 case ISD::SRL: 5525 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 5526 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 5527 case ISD::SRL_PARTS: 5528 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 5529 case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 5530 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 5531 case ISD::SETCC: return LowerVSETCC(Op, DAG); 5532 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 5533 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 5534 case ISD::VECTOR_SHUFFLE: return 
LowerVECTOR_SHUFFLE(Op, DAG); 5535 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 5536 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 5537 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 5538 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 5539 case ISD::MUL: return LowerMUL(Op, DAG); 5540 case ISD::SDIV: return LowerSDIV(Op, DAG); 5541 case ISD::UDIV: return LowerUDIV(Op, DAG); 5542 case ISD::ADDC: 5543 case ISD::ADDE: 5544 case ISD::SUBC: 5545 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 5546 case ISD::ATOMIC_LOAD: 5547 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 5548 } 5549 } 5550 5551 /// ReplaceNodeResults - Replace the results of node with an illegal result 5552 /// type with new values built out of custom code. 5553 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 5554 SmallVectorImpl<SDValue>&Results, 5555 SelectionDAG &DAG) const { 5556 SDValue Res; 5557 switch (N->getOpcode()) { 5558 default: 5559 llvm_unreachable("Don't know how to custom expand this!"); 5560 case ISD::BITCAST: 5561 Res = ExpandBITCAST(N, DAG); 5562 break; 5563 case ISD::SRL: 5564 case ISD::SRA: 5565 Res = Expand64BitShift(N, DAG, Subtarget); 5566 break; 5567 case ISD::ATOMIC_LOAD_ADD: 5568 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG); 5569 return; 5570 case ISD::ATOMIC_LOAD_AND: 5571 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG); 5572 return; 5573 case ISD::ATOMIC_LOAD_NAND: 5574 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG); 5575 return; 5576 case ISD::ATOMIC_LOAD_OR: 5577 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG); 5578 return; 5579 case ISD::ATOMIC_LOAD_SUB: 5580 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG); 5581 return; 5582 case ISD::ATOMIC_LOAD_XOR: 5583 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG); 5584 return; 5585 case ISD::ATOMIC_SWAP: 5586 ReplaceATOMIC_OP_64(N, Results, DAG, 
ARMISD::ATOMSWAP64_DAG); 5587 return; 5588 case ISD::ATOMIC_CMP_SWAP: 5589 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG); 5590 return; 5591 case ISD::ATOMIC_LOAD_MIN: 5592 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG); 5593 return; 5594 case ISD::ATOMIC_LOAD_UMIN: 5595 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG); 5596 return; 5597 case ISD::ATOMIC_LOAD_MAX: 5598 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG); 5599 return; 5600 case ISD::ATOMIC_LOAD_UMAX: 5601 ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG); 5602 return; 5603 } 5604 if (Res.getNode()) 5605 Results.push_back(Res); 5606 } 5607 5608 //===----------------------------------------------------------------------===// 5609 // ARM Scheduler Hooks 5610 //===----------------------------------------------------------------------===// 5611 5612 MachineBasicBlock * 5613 ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, 5614 MachineBasicBlock *BB, 5615 unsigned Size) const { 5616 unsigned dest = MI->getOperand(0).getReg(); 5617 unsigned ptr = MI->getOperand(1).getReg(); 5618 unsigned oldval = MI->getOperand(2).getReg(); 5619 unsigned newval = MI->getOperand(3).getReg(); 5620 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 5621 DebugLoc dl = MI->getDebugLoc(); 5622 bool isThumb2 = Subtarget->isThumb2(); 5623 5624 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5625 unsigned scratch = MRI.createVirtualRegister(isThumb2 ? 5626 (const TargetRegisterClass*)&ARM::rGPRRegClass : 5627 (const TargetRegisterClass*)&ARM::GPRRegClass); 5628 5629 if (isThumb2) { 5630 MRI.constrainRegClass(dest, &ARM::rGPRRegClass); 5631 MRI.constrainRegClass(oldval, &ARM::rGPRRegClass); 5632 MRI.constrainRegClass(newval, &ARM::rGPRRegClass); 5633 } 5634 5635 unsigned ldrOpc, strOpc; 5636 switch (Size) { 5637 default: llvm_unreachable("unsupported size for AtomicCmpSwap!"); 5638 case 1: 5639 ldrOpc = isThumb2 ? 
// (Continuation: second arm of the "Size == 1" opcode selection begun in the
// previous chunk.)
             ARM::t2LDREXB : ARM::LDREXB;
    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
    break;
  case 2:
    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
    break;
  case 4:
    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
    break;
  }

  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It; // insert the new blocks after the current block

  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loop1MBB);
  MF->insert(It, loop2MBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // thisMBB:
  //   ...
  //   fallthrough --> loop1MBB
  BB->addSuccessor(loop1MBB);

  // loop1MBB:
  //   ldrex dest, [ptr]
  //   cmp dest, oldval
  //   bne exitMBB
  BB = loop1MBB;
  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
  // The word-sized Thumb2 exclusive load carries an extra immediate offset
  // operand; the byte/halfword forms do not.
  if (ldrOpc == ARM::t2LDREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
                 .addReg(dest).addReg(oldval));
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
  BB->addSuccessor(loop2MBB);
  BB->addSuccessor(exitMBB);

  // loop2MBB:
  //   strex scratch, newval, [ptr]
  //   cmp scratch, #0
  //   bne loop1MBB
  BB = loop2MBB;
  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
  // Likewise, the word-sized Thumb2 exclusive store takes an immediate offset.
  if (strOpc == ARM::t2STREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);
  // scratch == 0 means the exclusive store succeeded; otherwise retry.
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                 .addReg(scratch).addImm(0));
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
  BB->addSuccessor(loop1MBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

// EmitAtomicBinary - Expand an atomic read-modify-write pseudo into a single
// ldrex / <binop> / strex retry loop. BinOpcode selects the ALU opcode
// (BIC is used for NAND, with swapped operands below); BinOpcode == 0 means
// ATOMIC_SWAP, in which case the incoming value is stored directly.
MachineBasicBlock *
ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                    unsigned Size, unsigned BinOpcode) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  DebugLoc dl = MI->getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  if (isThumb2) {
    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
  }

  unsigned ldrOpc, strOpc;
  switch (Size) {
  // NOTE(review): message appears copy-pasted from EmitAtomicCmpSwap; this
  // function expands atomic binary ops, not cmpxchg.
  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
  case 1:
    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
    strOpc = isThumb2 ?
// (Continuation: second arm of the "Size == 1" strOpc selection begun in the
// previous chunk.)
             ARM::t2STREXB : ARM::STREXB;
    break;
  case 2:
    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
    break;
  case 4:
    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
    break;
  }

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *TRC = isThumb2 ?
    (const TargetRegisterClass*)&ARM::rGPRRegClass :
    (const TargetRegisterClass*)&ARM::GPRRegClass;
  unsigned scratch = MRI.createVirtualRegister(TRC);
  // For ATOMIC_SWAP (BinOpcode == 0) the incoming value is stored as-is,
  // so no intermediate result register is needed.
  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldrex dest, ptr
  //   <binop> scratch2, dest, incr
  //   strex scratch, scratch2, ptr
  //   cmp scratch, #0
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
  // The word-sized Thumb2 exclusive load carries an immediate offset operand.
  if (ldrOpc == ARM::t2LDREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);
  if (BinOpcode) {
    // operand order needs to go the other way for NAND
    // (NAND is expanded as BIC, so the loaded value must be the second
    // operand). The trailing addReg(0) fills the optional cc_out operand
    // with reg0, i.e. no flag update.
    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
                     addReg(incr).addReg(dest)).addReg(0);
    else
      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
                     addReg(dest).addReg(incr)).addReg(0);
  }

  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
  // Likewise, the word-sized Thumb2 exclusive store takes an immediate offset.
  if (strOpc == ARM::t2STREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);
  // scratch == 0 means the exclusive store succeeded; otherwise retry.
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                 .addReg(scratch).addImm(0));
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

// EmitAtomicBinaryMinMax - Expand an atomic min/max/umin/umax pseudo into a
// ldrex / cmp / conditional-move / strex retry loop. signExtend requests a
// sign extension of the loaded sub-word value before the compare; Cond is
// the condition under which the *incoming* value replaces the old one.
MachineBasicBlock *
ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
                                          MachineBasicBlock *BB,
                                          unsigned Size,
                                          bool signExtend,
                                          ARMCC::CondCodes Cond) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  // oldval aliases dest unless a sign-extended copy is made below.
  unsigned oldval = dest;
  DebugLoc dl = MI->getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  if (isThumb2) {
    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
  }

  unsigned ldrOpc, strOpc, extendOpc;
  switch (Size) {
  // NOTE(review): message appears copy-pasted from EmitAtomicCmpSwap; this
  // function expands atomic min/max ops, not cmpxchg.
  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
  case 1:
    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
    extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
    break;
  case 2:
    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
    strOpc = isThumb2 ?
// (Continuation: second arm of the "Size == 2" strOpc selection begun in the
// previous chunk.)
             ARM::t2STREXH : ARM::STREXH;
    extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
    break;
  case 4:
    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
    // Word-sized values need no extension before the compare.
    extendOpc = 0;
    break;
  }

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *TRC = isThumb2 ?
    (const TargetRegisterClass*)&ARM::rGPRRegClass :
    (const TargetRegisterClass*)&ARM::GPRRegClass;
  unsigned scratch = MRI.createVirtualRegister(TRC);
  unsigned scratch2 = MRI.createVirtualRegister(TRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldrex dest, ptr
  //   (sign extend dest, if required)
  //   cmp dest, incr
  //   cmov.cond scratch2, incr, dest
  //   strex scratch, scratch2, ptr
  //   cmp scratch, #0
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
  // The word-sized Thumb2 exclusive load carries an immediate offset operand.
  if (ldrOpc == ARM::t2LDREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);

  // Sign extend the value, if necessary.
  if (signExtend && extendOpc) {
    // NOTE(review): oldval is created in the plain GPR class even in Thumb2
    // mode, while the other Thumb2 vregs here use rGPR — confirm intended.
    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
                     .addReg(dest)
                     .addImm(0));
  }

  // Build compare and cmov instructions.
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
                 .addReg(oldval).addReg(incr));
  // scratch2 = Cond ? incr : oldval (the value to store back).
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
    .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);

  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
  // Likewise, the word-sized Thumb2 exclusive store takes an immediate offset.
  if (strOpc == ARM::t2STREX)
    MIB.addImm(0);
  AddDefaultPred(MIB);
  // scratch == 0 means the exclusive store succeeded; otherwise retry.
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                 .addReg(scratch).addImm(0));
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.

  return BB;
}

// EmitAtomicBinary64 - Expand a 64-bit atomic pseudo (binary op, swap,
// cmpxchg, or min/max) into a ldrexd/strexd retry loop operating on a
// GPRPair register. Op1/Op2 are the low/high-half ALU opcodes (Op1 == 0
// means ATOMIC_SWAP); NeedsCarry chains the low-half carry into the high
// half; IsCmpxchg/IsMinMax select the extra compare-and-branch structure.
MachineBasicBlock *
ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                                      unsigned Op1, unsigned Op2,
                                      bool NeedsCarry, bool IsCmpxchg,
                                      bool IsMinMax, ARMCC::CondCodes CC) const {
  // This also handles ATOMIC_SWAP, indicated by Op1==0.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned destlo = MI->getOperand(0).getReg();
  unsigned desthi = MI->getOperand(1).getReg();
  unsigned ptr = MI->getOperand(2).getReg();
  unsigned vallo = MI->getOperand(3).getReg();
  unsigned valhi = MI->getOperand(4).getReg();
  DebugLoc dl = MI->getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  if (isThumb2) {
    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
  }

  unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
  unsigned strOpc = isThumb2 ?
// (Continuation: second arm of the strOpc selection begun in the previous
// chunk.)
                       ARM::t2STREXD : ARM::STREXD;

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  // contBB/cont2BB are only needed for the cmpxchg early-exit chain and the
  // min/max compare-and-branch.
  MachineBasicBlock *contBB = 0, *cont2BB = 0;
  if (IsCmpxchg || IsMinMax)
    contBB = MF->CreateMachineBasicBlock(LLVM_BB);
  if (IsCmpxchg)
    cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MF->insert(It, loopMBB);
  if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
  if (IsCmpxchg) MF->insert(It, cont2BB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // NOTE(review): the Thumb2 side uses tGPR here, unlike the rGPR class used
  // by the 32-bit expansions above — verify this is intended.
  const TargetRegisterClass *TRC = isThumb2 ?
    (const TargetRegisterClass*)&ARM::tGPRRegClass :
    (const TargetRegisterClass*)&ARM::GPRRegClass;
  unsigned storesuccess = MRI.createVirtualRegister(TRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldrexd r2, r3, ptr
  //   <binopa> r0, r2, incr
  //   <binopb> r1, r3, incr
  //   strexd storesuccess, r0, r1, ptr
  //   cmp storesuccess, #0
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //
  // Note that the registers are explicitly specified because there is not any
  // way to force the register allocator to allocate a register pair.
  //
  // FIXME: The hardcoded registers are not necessary for Thumb2, but we
  // need to properly enforce the restriction that the two output registers
  // for ldrexd must be different.
  BB = loopMBB;
  // Load
  unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
  unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
  unsigned GPRPair2;
  if (IsMinMax) {
    // We need an extra double register for doing min/max: build the
    // vallo/valhi pair up front via IMPLICIT_DEF + INSERT_SUBREGs.
    unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
      .addReg(undef)
      .addReg(vallo)
      .addImm(ARM::gsub_0);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2)
      .addReg(r1)
      .addReg(valhi)
      .addImm(ARM::gsub_1);
  }

  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
                 .addReg(GPRPair0, RegState::Define).addReg(ptr));
  // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
    .addReg(GPRPair0, 0, ARM::gsub_0);
  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
    .addReg(GPRPair0, 0, ARM::gsub_1);

  if (IsCmpxchg) {
    // Add early exit: compare the low then high halves against the expected
    // value, branching to exitMBB on the first mismatch.
    for (unsigned i = 0; i < 2; i++) {
      AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
                                                 ARM::CMPrr))
                     .addReg(i == 0 ? destlo : desthi)
                     .addReg(i == 0 ? vallo : valhi));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
        .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
      BB->addSuccessor(exitMBB);
      BB->addSuccessor(i == 0 ? contBB : cont2BB);
      BB = (i == 0 ? contBB : cont2BB);
    }

    // Copy to physregs for strexd
    unsigned setlo = MI->getOperand(5).getReg();
    unsigned sethi = MI->getOperand(6).getReg();
    unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
      .addReg(undef)
      .addReg(setlo)
      .addImm(ARM::gsub_0);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
      .addReg(r1)
      .addReg(sethi)
      .addImm(ARM::gsub_1);
  } else if (Op1) {
    // Perform binary operation. When NeedsCarry, the low-half op defines
    // CPSR so the high-half op can use the carry; for min/max the high-half
    // op defines CPSR for the conditional branch below.
    unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
                   .addReg(destlo).addReg(vallo))
        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
    unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
                   .addReg(desthi).addReg(valhi))
        .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));

    // Reassemble the result pair for strexd.
    unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
      .addReg(UndefPair)
      .addReg(tmpRegLo)
      .addImm(ARM::gsub_0);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
      .addReg(r1)
      .addReg(tmpRegHi)
      .addImm(ARM::gsub_1);
  } else {
    // Copy to physregs for strexd (ATOMIC_SWAP: store the incoming value).
    unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
      .addReg(UndefPair)
      .addReg(vallo)
      .addImm(ARM::gsub_0);
    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
      .addReg(r1)
      .addReg(valhi)
      .addImm(ARM::gsub_1);
  }
  unsigned GPRPairStore = GPRPair1;
  if (IsMinMax) {
    // Compare and branch to exit block: if the loaded value already
    // satisfies CC, keep it; otherwise fall through and store the new pair.
    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
    BB->addSuccessor(exitMBB);
    BB->addSuccessor(contBB);
    BB = contBB;
    GPRPairStore = GPRPair2;
  }

  // Store
  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
                 .addReg(GPRPairStore).addReg(ptr));
  // Cmp+jump: storesuccess == 0 means the exclusive store succeeded.
  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                 .addReg(storesuccess).addImm(0));
  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent();   // The instruction is gone now.
// (Tail of EmitAtomicBinary64: the expansion is complete; hand back exitMBB.)
  return BB;
}

/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::
SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                       MachineBasicBlock *DispatchBB, int FI) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function *F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  // Reference DispatchBB's address through the constant pool so it can be
  // loaded pc-relatively. PCAdj is the implicit pc offset of the pc-relative
  // add: 4 in Thumb modes, 8 in ARM mode.
  unsigned PCLabelId = AFI->createPICLabelUId();
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

  const TargetRegisterClass *TRC = isThumb ?
    (const TargetRegisterClass*)&ARM::tGPRRegClass :
    (const TargetRegisterClass*)&ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
                             MachineMemOperand::MOLoad, 4, 4);

  MachineMemOperand *FIMMOSt =
    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
                             MachineMemOperand::MOStore, 4, 4);

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addMemOperand(CPMMO));
    // Set the low bit because of thumb mode.
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    AddDefaultCC(
      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
                     .addReg(NewVReg1, RegState::Kill)
                     .addImm(0x01)));
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
      .addReg(NewVReg2, RegState::Kill)
      .addImm(PCLabelId);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
                   .addReg(NewVReg3, RegState::Kill)
                   .addFrameIndex(FI)
                   .addImm(36)  // &jbuf[1] :: pc
                   .addMemOperand(FIMMOSt));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addMemOperand(CPMMO));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
      .addReg(NewVReg1, RegState::Kill)
      .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addImm(1));
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addReg(NewVReg2, RegState::Kill)
                   .addReg(NewVReg3, RegState::Kill));
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
                   .addFrameIndex(FI)
                   .addImm(36)); // &jbuf[1] :: pc
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
                   .addReg(NewVReg4, RegState::Kill)
                   .addReg(NewVReg5, RegState::Kill)
                   .addImm(0)
                   .addMemOperand(FIMMOSt));
  } else {
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addImm(0)
                   .addMemOperand(CPMMO));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
                   .addReg(NewVReg1, RegState::Kill)
                   .addImm(PCLabelId));
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
                   .addReg(NewVReg2, RegState::Kill)
                   .addFrameIndex(FI)
                   .addImm(36)  // &jbuf[1] :: pc
                   .addMemOperand(FIMMOSt));
  }
}

// (Opening of EmitSjLjDispatchBlock; the body continues beyond this chunk.)
MachineBasicBlock *ARMTargetLowering::
EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  MachineFrameInfo *MFI =
MF->getFrameInfo(); 6265 int FI = MFI->getFunctionContextIndex(); 6266 6267 const TargetRegisterClass *TRC = Subtarget->isThumb() ? 6268 (const TargetRegisterClass*)&ARM::tGPRRegClass : 6269 (const TargetRegisterClass*)&ARM::GPRnopcRegClass; 6270 6271 // Get a mapping of the call site numbers to all of the landing pads they're 6272 // associated with. 6273 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; 6274 unsigned MaxCSNum = 0; 6275 MachineModuleInfo &MMI = MF->getMMI(); 6276 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 6277 ++BB) { 6278 if (!BB->isLandingPad()) continue; 6279 6280 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 6281 // pad. 6282 for (MachineBasicBlock::iterator 6283 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 6284 if (!II->isEHLabel()) continue; 6285 6286 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 6287 if (!MMI.hasCallSiteLandingPad(Sym)) continue; 6288 6289 SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym); 6290 for (SmallVectorImpl<unsigned>::iterator 6291 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 6292 CSI != CSE; ++CSI) { 6293 CallSiteNumToLPad[*CSI].push_back(BB); 6294 MaxCSNum = std::max(MaxCSNum, *CSI); 6295 } 6296 break; 6297 } 6298 } 6299 6300 // Get an ordered list of the machine basic blocks for the jump table. 
6301 std::vector<MachineBasicBlock*> LPadList; 6302 SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs; 6303 LPadList.reserve(CallSiteNumToLPad.size()); 6304 for (unsigned I = 1; I <= MaxCSNum; ++I) { 6305 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 6306 for (SmallVectorImpl<MachineBasicBlock*>::iterator 6307 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 6308 LPadList.push_back(*II); 6309 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 6310 } 6311 } 6312 6313 assert(!LPadList.empty() && 6314 "No landing pad destinations for the dispatch jump table!"); 6315 6316 // Create the jump table and associated information. 6317 MachineJumpTableInfo *JTI = 6318 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 6319 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 6320 unsigned UId = AFI->createJumpTableUId(); 6321 6322 // Create the MBBs for the dispatch code. 6323 6324 // Shove the dispatch's address into the return slot in the function context. 6325 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 6326 DispatchBB->setIsLandingPad(); 6327 6328 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6329 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP)); 6330 DispatchBB->addSuccessor(TrapBB); 6331 6332 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 6333 DispatchBB->addSuccessor(DispContBB); 6334 6335 // Insert and MBBs. 6336 MF->insert(MF->end(), DispatchBB); 6337 MF->insert(MF->end(), DispContBB); 6338 MF->insert(MF->end(), TrapBB); 6339 6340 // Insert code into the entry block that creates and registers the function 6341 // context. 
6342 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 6343 6344 MachineMemOperand *FIMMOLd = 6345 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), 6346 MachineMemOperand::MOLoad | 6347 MachineMemOperand::MOVolatile, 4, 4); 6348 6349 MachineInstrBuilder MIB; 6350 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 6351 6352 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 6353 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 6354 6355 // Add a register mask with no preserved registers. This results in all 6356 // registers being marked as clobbered. 6357 MIB.addRegMask(RI.getNoPreservedMask()); 6358 6359 unsigned NumLPads = LPadList.size(); 6360 if (Subtarget->isThumb2()) { 6361 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6362 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 6363 .addFrameIndex(FI) 6364 .addImm(4) 6365 .addMemOperand(FIMMOLd)); 6366 6367 if (NumLPads < 256) { 6368 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 6369 .addReg(NewVReg1) 6370 .addImm(LPadList.size())); 6371 } else { 6372 unsigned VReg1 = MRI->createVirtualRegister(TRC); 6373 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 6374 .addImm(NumLPads & 0xFFFF)); 6375 6376 unsigned VReg2 = VReg1; 6377 if ((NumLPads & 0xFFFF0000) != 0) { 6378 VReg2 = MRI->createVirtualRegister(TRC); 6379 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 6380 .addReg(VReg1) 6381 .addImm(NumLPads >> 16)); 6382 } 6383 6384 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 6385 .addReg(NewVReg1) 6386 .addReg(VReg2)); 6387 } 6388 6389 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 6390 .addMBB(TrapBB) 6391 .addImm(ARMCC::HI) 6392 .addReg(ARM::CPSR); 6393 6394 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 6395 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) 6396 .addJumpTableIndex(MJTI) 6397 .addImm(UId)); 6398 
6399 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 6400 AddDefaultCC( 6401 AddDefaultPred( 6402 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 6403 .addReg(NewVReg3, RegState::Kill) 6404 .addReg(NewVReg1) 6405 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 6406 6407 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 6408 .addReg(NewVReg4, RegState::Kill) 6409 .addReg(NewVReg1) 6410 .addJumpTableIndex(MJTI) 6411 .addImm(UId); 6412 } else if (Subtarget->isThumb()) { 6413 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6414 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 6415 .addFrameIndex(FI) 6416 .addImm(1) 6417 .addMemOperand(FIMMOLd)); 6418 6419 if (NumLPads < 256) { 6420 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 6421 .addReg(NewVReg1) 6422 .addImm(NumLPads)); 6423 } else { 6424 MachineConstantPool *ConstantPool = MF->getConstantPool(); 6425 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 6426 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 6427 6428 // MachineConstantPool wants an explicit alignment. 
6429 unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); 6430 if (Align == 0) 6431 Align = getDataLayout()->getTypeAllocSize(C->getType()); 6432 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 6433 6434 unsigned VReg1 = MRI->createVirtualRegister(TRC); 6435 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 6436 .addReg(VReg1, RegState::Define) 6437 .addConstantPoolIndex(Idx)); 6438 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 6439 .addReg(NewVReg1) 6440 .addReg(VReg1)); 6441 } 6442 6443 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 6444 .addMBB(TrapBB) 6445 .addImm(ARMCC::HI) 6446 .addReg(ARM::CPSR); 6447 6448 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 6449 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 6450 .addReg(ARM::CPSR, RegState::Define) 6451 .addReg(NewVReg1) 6452 .addImm(2)); 6453 6454 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 6455 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 6456 .addJumpTableIndex(MJTI) 6457 .addImm(UId)); 6458 6459 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 6460 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 6461 .addReg(ARM::CPSR, RegState::Define) 6462 .addReg(NewVReg2, RegState::Kill) 6463 .addReg(NewVReg3)); 6464 6465 MachineMemOperand *JTMMOLd = 6466 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), 6467 MachineMemOperand::MOLoad, 4, 4); 6468 6469 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 6470 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 6471 .addReg(NewVReg4, RegState::Kill) 6472 .addImm(0) 6473 .addMemOperand(JTMMOLd)); 6474 6475 unsigned NewVReg6 = MRI->createVirtualRegister(TRC); 6476 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 6477 .addReg(ARM::CPSR, RegState::Define) 6478 .addReg(NewVReg5, RegState::Kill) 6479 .addReg(NewVReg3)); 6480 6481 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 
6482 .addReg(NewVReg6, RegState::Kill) 6483 .addJumpTableIndex(MJTI) 6484 .addImm(UId); 6485 } else { 6486 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 6487 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 6488 .addFrameIndex(FI) 6489 .addImm(4) 6490 .addMemOperand(FIMMOLd)); 6491 6492 if (NumLPads < 256) { 6493 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 6494 .addReg(NewVReg1) 6495 .addImm(NumLPads)); 6496 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 6497 unsigned VReg1 = MRI->createVirtualRegister(TRC); 6498 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 6499 .addImm(NumLPads & 0xFFFF)); 6500 6501 unsigned VReg2 = VReg1; 6502 if ((NumLPads & 0xFFFF0000) != 0) { 6503 VReg2 = MRI->createVirtualRegister(TRC); 6504 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 6505 .addReg(VReg1) 6506 .addImm(NumLPads >> 16)); 6507 } 6508 6509 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 6510 .addReg(NewVReg1) 6511 .addReg(VReg2)); 6512 } else { 6513 MachineConstantPool *ConstantPool = MF->getConstantPool(); 6514 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 6515 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 6516 6517 // MachineConstantPool wants an explicit alignment. 
6518 unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); 6519 if (Align == 0) 6520 Align = getDataLayout()->getTypeAllocSize(C->getType()); 6521 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 6522 6523 unsigned VReg1 = MRI->createVirtualRegister(TRC); 6524 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 6525 .addReg(VReg1, RegState::Define) 6526 .addConstantPoolIndex(Idx) 6527 .addImm(0)); 6528 AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 6529 .addReg(NewVReg1) 6530 .addReg(VReg1, RegState::Kill)); 6531 } 6532 6533 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 6534 .addMBB(TrapBB) 6535 .addImm(ARMCC::HI) 6536 .addReg(ARM::CPSR); 6537 6538 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 6539 AddDefaultCC( 6540 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 6541 .addReg(NewVReg1) 6542 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); 6543 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 6544 AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 6545 .addJumpTableIndex(MJTI) 6546 .addImm(UId)); 6547 6548 MachineMemOperand *JTMMOLd = 6549 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), 6550 MachineMemOperand::MOLoad, 4, 4); 6551 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 6552 AddDefaultPred( 6553 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 6554 .addReg(NewVReg3, RegState::Kill) 6555 .addReg(NewVReg4) 6556 .addImm(0) 6557 .addMemOperand(JTMMOLd)); 6558 6559 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 6560 .addReg(NewVReg5, RegState::Kill) 6561 .addReg(NewVReg4) 6562 .addJumpTableIndex(MJTI) 6563 .addImm(UId); 6564 } 6565 6566 // Add the jump table entries as successors to the MBB. 
6567 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 6568 for (std::vector<MachineBasicBlock*>::iterator 6569 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 6570 MachineBasicBlock *CurMBB = *I; 6571 if (SeenMBBs.insert(CurMBB)) 6572 DispContBB->addSuccessor(CurMBB); 6573 } 6574 6575 // N.B. the order the invoke BBs are processed in doesn't matter here. 6576 const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF); 6577 SmallVector<MachineBasicBlock*, 64> MBBLPads; 6578 for (SmallPtrSet<MachineBasicBlock*, 64>::iterator 6579 I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) { 6580 MachineBasicBlock *BB = *I; 6581 6582 // Remove the landing pad successor from the invoke block and replace it 6583 // with the new dispatch block. 6584 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 6585 BB->succ_end()); 6586 while (!Successors.empty()) { 6587 MachineBasicBlock *SMBB = Successors.pop_back_val(); 6588 if (SMBB->isLandingPad()) { 6589 BB->removeSuccessor(SMBB); 6590 MBBLPads.push_back(SMBB); 6591 } 6592 } 6593 6594 BB->addSuccessor(DispatchBB); 6595 6596 // Find the invoke call and mark all of the callee-saved registers as 6597 // 'implicit defined' so that they're spilled. This prevents code from 6598 // moving instructions to before the EH block, where they will never be 6599 // executed. 
6600 for (MachineBasicBlock::reverse_iterator 6601 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 6602 if (!II->isCall()) continue; 6603 6604 DenseMap<unsigned, bool> DefRegs; 6605 for (MachineInstr::mop_iterator 6606 OI = II->operands_begin(), OE = II->operands_end(); 6607 OI != OE; ++OI) { 6608 if (!OI->isReg()) continue; 6609 DefRegs[OI->getReg()] = true; 6610 } 6611 6612 MachineInstrBuilder MIB(&*II); 6613 6614 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 6615 unsigned Reg = SavedRegs[i]; 6616 if (Subtarget->isThumb2() && 6617 !ARM::tGPRRegClass.contains(Reg) && 6618 !ARM::hGPRRegClass.contains(Reg)) 6619 continue; 6620 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 6621 continue; 6622 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 6623 continue; 6624 if (!DefRegs[Reg]) 6625 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 6626 } 6627 6628 break; 6629 } 6630 } 6631 6632 // Mark all former landing pads as non-landing pads. The dispatch is the only 6633 // landing pad now. 6634 for (SmallVectorImpl<MachineBasicBlock*>::iterator 6635 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 6636 (*I)->setIsLandingPad(false); 6637 6638 // The instruction is gone now. 6639 MI->eraseFromParent(); 6640 6641 return MBB; 6642 } 6643 6644 static 6645 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 6646 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 6647 E = MBB->succ_end(); I != E; ++I) 6648 if (*I != Succ) 6649 return *I; 6650 llvm_unreachable("Expecting a BB with two successors!"); 6651 } 6652 6653 MachineBasicBlock *ARMTargetLowering:: 6654 EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { 6655 // This pseudo instruction has 3 operands: dst, src, size 6656 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 6657 // Otherwise, we will generate unrolled scalar copies. 
6658 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 6659 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 6660 MachineFunction::iterator It = BB; 6661 ++It; 6662 6663 unsigned dest = MI->getOperand(0).getReg(); 6664 unsigned src = MI->getOperand(1).getReg(); 6665 unsigned SizeVal = MI->getOperand(2).getImm(); 6666 unsigned Align = MI->getOperand(3).getImm(); 6667 DebugLoc dl = MI->getDebugLoc(); 6668 6669 bool isThumb2 = Subtarget->isThumb2(); 6670 MachineFunction *MF = BB->getParent(); 6671 MachineRegisterInfo &MRI = MF->getRegInfo(); 6672 unsigned ldrOpc, strOpc, UnitSize = 0; 6673 6674 const TargetRegisterClass *TRC = isThumb2 ? 6675 (const TargetRegisterClass*)&ARM::tGPRRegClass : 6676 (const TargetRegisterClass*)&ARM::GPRRegClass; 6677 const TargetRegisterClass *TRC_Vec = 0; 6678 6679 if (Align & 1) { 6680 ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; 6681 strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; 6682 UnitSize = 1; 6683 } else if (Align & 2) { 6684 ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST; 6685 strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST; 6686 UnitSize = 2; 6687 } else { 6688 // Check whether we can use NEON instructions. 6689 if (!MF->getFunction()->getFnAttributes(). 6690 hasAttribute(Attributes::NoImplicitFloat) && 6691 Subtarget->hasNEON()) { 6692 if ((Align % 16 == 0) && SizeVal >= 16) { 6693 ldrOpc = ARM::VLD1q32wb_fixed; 6694 strOpc = ARM::VST1q32wb_fixed; 6695 UnitSize = 16; 6696 TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass; 6697 } 6698 else if ((Align % 8 == 0) && SizeVal >= 8) { 6699 ldrOpc = ARM::VLD1d32wb_fixed; 6700 strOpc = ARM::VST1d32wb_fixed; 6701 UnitSize = 8; 6702 TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass; 6703 } 6704 } 6705 // Can't use NEON instructions. 6706 if (UnitSize == 0) { 6707 ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; 6708 strOpc = isThumb2 ? 
ARM::t2STR_POST : ARM::STR_POST_IMM; 6709 UnitSize = 4; 6710 } 6711 } 6712 6713 unsigned BytesLeft = SizeVal % UnitSize; 6714 unsigned LoopSize = SizeVal - BytesLeft; 6715 6716 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 6717 // Use LDR and STR to copy. 6718 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 6719 // [destOut] = STR_POST(scratch, destIn, UnitSize) 6720 unsigned srcIn = src; 6721 unsigned destIn = dest; 6722 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 6723 unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); 6724 unsigned srcOut = MRI.createVirtualRegister(TRC); 6725 unsigned destOut = MRI.createVirtualRegister(TRC); 6726 if (UnitSize >= 8) { 6727 AddDefaultPred(BuildMI(*BB, MI, dl, 6728 TII->get(ldrOpc), scratch) 6729 .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0)); 6730 6731 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) 6732 .addReg(destIn).addImm(0).addReg(scratch)); 6733 } else if (isThumb2) { 6734 AddDefaultPred(BuildMI(*BB, MI, dl, 6735 TII->get(ldrOpc), scratch) 6736 .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize)); 6737 6738 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) 6739 .addReg(scratch).addReg(destIn) 6740 .addImm(UnitSize)); 6741 } else { 6742 AddDefaultPred(BuildMI(*BB, MI, dl, 6743 TII->get(ldrOpc), scratch) 6744 .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0) 6745 .addImm(UnitSize)); 6746 6747 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) 6748 .addReg(scratch).addReg(destIn) 6749 .addReg(0).addImm(UnitSize)); 6750 } 6751 srcIn = srcOut; 6752 destIn = destOut; 6753 } 6754 6755 // Handle the leftover bytes with LDRB and STRB. 6756 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 6757 // [destOut] = STRB_POST(scratch, destIn, 1) 6758 ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; 6759 strOpc = isThumb2 ? 
ARM::t2STRB_POST : ARM::STRB_POST_IMM; 6760 for (unsigned i = 0; i < BytesLeft; i++) { 6761 unsigned scratch = MRI.createVirtualRegister(TRC); 6762 unsigned srcOut = MRI.createVirtualRegister(TRC); 6763 unsigned destOut = MRI.createVirtualRegister(TRC); 6764 if (isThumb2) { 6765 AddDefaultPred(BuildMI(*BB, MI, dl, 6766 TII->get(ldrOpc),scratch) 6767 .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); 6768 6769 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) 6770 .addReg(scratch).addReg(destIn) 6771 .addReg(0).addImm(1)); 6772 } else { 6773 AddDefaultPred(BuildMI(*BB, MI, dl, 6774 TII->get(ldrOpc),scratch) 6775 .addReg(srcOut, RegState::Define).addReg(srcIn) 6776 .addReg(0).addImm(1)); 6777 6778 AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut) 6779 .addReg(scratch).addReg(destIn) 6780 .addReg(0).addImm(1)); 6781 } 6782 srcIn = srcOut; 6783 destIn = destOut; 6784 } 6785 MI->eraseFromParent(); // The instruction is gone now. 6786 return BB; 6787 } 6788 6789 // Expand the pseudo op to a loop. 6790 // thisMBB: 6791 // ... 6792 // movw varEnd, # --> with thumb2 6793 // movt varEnd, # 6794 // ldrcp varEnd, idx --> without thumb2 6795 // fallthrough --> loopMBB 6796 // loopMBB: 6797 // PHI varPhi, varEnd, varLoop 6798 // PHI srcPhi, src, srcLoop 6799 // PHI destPhi, dst, destLoop 6800 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 6801 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 6802 // subs varLoop, varPhi, #UnitSize 6803 // bne loopMBB 6804 // fallthrough --> exitMBB 6805 // exitMBB: 6806 // epilogue to handle left-over bytes 6807 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 6808 // [destOut] = STRB_POST(scratch, destLoop, 1) 6809 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 6810 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 6811 MF->insert(It, loopMBB); 6812 MF->insert(It, exitMBB); 6813 6814 // Transfer the remainder of BB and its successor edges to exitMBB. 
6815 exitMBB->splice(exitMBB->begin(), BB, 6816 llvm::next(MachineBasicBlock::iterator(MI)), 6817 BB->end()); 6818 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 6819 6820 // Load an immediate to varEnd. 6821 unsigned varEnd = MRI.createVirtualRegister(TRC); 6822 if (isThumb2) { 6823 unsigned VReg1 = varEnd; 6824 if ((LoopSize & 0xFFFF0000) != 0) 6825 VReg1 = MRI.createVirtualRegister(TRC); 6826 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1) 6827 .addImm(LoopSize & 0xFFFF)); 6828 6829 if ((LoopSize & 0xFFFF0000) != 0) 6830 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) 6831 .addReg(VReg1) 6832 .addImm(LoopSize >> 16)); 6833 } else { 6834 MachineConstantPool *ConstantPool = MF->getConstantPool(); 6835 Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); 6836 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 6837 6838 // MachineConstantPool wants an explicit alignment. 6839 unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); 6840 if (Align == 0) 6841 Align = getDataLayout()->getTypeAllocSize(C->getType()); 6842 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 6843 6844 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp)) 6845 .addReg(varEnd, RegState::Define) 6846 .addConstantPoolIndex(Idx) 6847 .addImm(0)); 6848 } 6849 BB->addSuccessor(loopMBB); 6850 6851 // Generate the loop body: 6852 // varPhi = PHI(varLoop, varEnd) 6853 // srcPhi = PHI(srcLoop, src) 6854 // destPhi = PHI(destLoop, dst) 6855 MachineBasicBlock *entryBB = BB; 6856 BB = loopMBB; 6857 unsigned varLoop = MRI.createVirtualRegister(TRC); 6858 unsigned varPhi = MRI.createVirtualRegister(TRC); 6859 unsigned srcLoop = MRI.createVirtualRegister(TRC); 6860 unsigned srcPhi = MRI.createVirtualRegister(TRC); 6861 unsigned destLoop = MRI.createVirtualRegister(TRC); 6862 unsigned destPhi = MRI.createVirtualRegister(TRC); 6863 6864 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 6865 .addReg(varLoop).addMBB(loopMBB) 6866 
.addReg(varEnd).addMBB(entryBB); 6867 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 6868 .addReg(srcLoop).addMBB(loopMBB) 6869 .addReg(src).addMBB(entryBB); 6870 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 6871 .addReg(destLoop).addMBB(loopMBB) 6872 .addReg(dest).addMBB(entryBB); 6873 6874 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 6875 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 6876 unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC); 6877 if (UnitSize >= 8) { 6878 AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) 6879 .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0)); 6880 6881 AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) 6882 .addReg(destPhi).addImm(0).addReg(scratch)); 6883 } else if (isThumb2) { 6884 AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) 6885 .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize)); 6886 6887 AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) 6888 .addReg(scratch).addReg(destPhi) 6889 .addImm(UnitSize)); 6890 } else { 6891 AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch) 6892 .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0) 6893 .addImm(UnitSize)); 6894 6895 AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop) 6896 .addReg(scratch).addReg(destPhi) 6897 .addReg(0).addImm(UnitSize)); 6898 } 6899 6900 // Decrement loop variable by UnitSize. 6901 MachineInstrBuilder MIB = BuildMI(BB, dl, 6902 TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 6903 AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize))); 6904 MIB->getOperand(5).setReg(ARM::CPSR); 6905 MIB->getOperand(5).setIsDef(true); 6906 6907 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 6908 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 6909 6910 // loopMBB can loop back to loopMBB or fall through to exitMBB. 6911 BB->addSuccessor(loopMBB); 6912 BB->addSuccessor(exitMBB); 6913 6914 // Add epilogue to handle BytesLeft. 
6915 BB = exitMBB; 6916 MachineInstr *StartOfExit = exitMBB->begin(); 6917 ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM; 6918 strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM; 6919 6920 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 6921 // [destOut] = STRB_POST(scratch, destLoop, 1) 6922 unsigned srcIn = srcLoop; 6923 unsigned destIn = destLoop; 6924 for (unsigned i = 0; i < BytesLeft; i++) { 6925 unsigned scratch = MRI.createVirtualRegister(TRC); 6926 unsigned srcOut = MRI.createVirtualRegister(TRC); 6927 unsigned destOut = MRI.createVirtualRegister(TRC); 6928 if (isThumb2) { 6929 AddDefaultPred(BuildMI(*BB, StartOfExit, dl, 6930 TII->get(ldrOpc),scratch) 6931 .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1)); 6932 6933 AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) 6934 .addReg(scratch).addReg(destIn) 6935 .addImm(1)); 6936 } else { 6937 AddDefaultPred(BuildMI(*BB, StartOfExit, dl, 6938 TII->get(ldrOpc),scratch) 6939 .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1)); 6940 6941 AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut) 6942 .addReg(scratch).addReg(destIn) 6943 .addReg(0).addImm(1)); 6944 } 6945 srcIn = srcOut; 6946 destIn = destOut; 6947 } 6948 6949 MI->eraseFromParent(); // The instruction is gone now. 6950 return BB; 6951 } 6952 6953 MachineBasicBlock * 6954 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 6955 MachineBasicBlock *BB) const { 6956 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 6957 DebugLoc dl = MI->getDebugLoc(); 6958 bool isThumb2 = Subtarget->isThumb2(); 6959 switch (MI->getOpcode()) { 6960 default: { 6961 MI->dump(); 6962 llvm_unreachable("Unexpected instr type to insert"); 6963 } 6964 // The Thumb2 pre-indexed stores have the same MI operands, they just 6965 // define them differently in the .td files from the isel patterns, so 6966 // they need pseudos. 
6967 case ARM::t2STR_preidx: 6968 MI->setDesc(TII->get(ARM::t2STR_PRE)); 6969 return BB; 6970 case ARM::t2STRB_preidx: 6971 MI->setDesc(TII->get(ARM::t2STRB_PRE)); 6972 return BB; 6973 case ARM::t2STRH_preidx: 6974 MI->setDesc(TII->get(ARM::t2STRH_PRE)); 6975 return BB; 6976 6977 case ARM::STRi_preidx: 6978 case ARM::STRBi_preidx: { 6979 unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ? 6980 ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; 6981 // Decode the offset. 6982 unsigned Offset = MI->getOperand(4).getImm(); 6983 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 6984 Offset = ARM_AM::getAM2Offset(Offset); 6985 if (isSub) 6986 Offset = -Offset; 6987 6988 MachineMemOperand *MMO = *MI->memoperands_begin(); 6989 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 6990 .addOperand(MI->getOperand(0)) // Rn_wb 6991 .addOperand(MI->getOperand(1)) // Rt 6992 .addOperand(MI->getOperand(2)) // Rn 6993 .addImm(Offset) // offset (skip GPR==zero_reg) 6994 .addOperand(MI->getOperand(5)) // pred 6995 .addOperand(MI->getOperand(6)) 6996 .addMemOperand(MMO); 6997 MI->eraseFromParent(); 6998 return BB; 6999 } 7000 case ARM::STRr_preidx: 7001 case ARM::STRBr_preidx: 7002 case ARM::STRH_preidx: { 7003 unsigned NewOpc; 7004 switch (MI->getOpcode()) { 7005 default: llvm_unreachable("unexpected opcode!"); 7006 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 7007 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 7008 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 7009 } 7010 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 7011 for (unsigned i = 0; i < MI->getNumOperands(); ++i) 7012 MIB.addOperand(MI->getOperand(i)); 7013 MI->eraseFromParent(); 7014 return BB; 7015 } 7016 case ARM::ATOMIC_LOAD_ADD_I8: 7017 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); 7018 case ARM::ATOMIC_LOAD_ADD_I16: 7019 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? 
ARM::t2ADDrr : ARM::ADDrr); 7020 case ARM::ATOMIC_LOAD_ADD_I32: 7021 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); 7022 7023 case ARM::ATOMIC_LOAD_AND_I8: 7024 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); 7025 case ARM::ATOMIC_LOAD_AND_I16: 7026 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); 7027 case ARM::ATOMIC_LOAD_AND_I32: 7028 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); 7029 7030 case ARM::ATOMIC_LOAD_OR_I8: 7031 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); 7032 case ARM::ATOMIC_LOAD_OR_I16: 7033 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); 7034 case ARM::ATOMIC_LOAD_OR_I32: 7035 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); 7036 7037 case ARM::ATOMIC_LOAD_XOR_I8: 7038 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr); 7039 case ARM::ATOMIC_LOAD_XOR_I16: 7040 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr); 7041 case ARM::ATOMIC_LOAD_XOR_I32: 7042 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr); 7043 7044 case ARM::ATOMIC_LOAD_NAND_I8: 7045 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr); 7046 case ARM::ATOMIC_LOAD_NAND_I16: 7047 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr); 7048 case ARM::ATOMIC_LOAD_NAND_I32: 7049 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr); 7050 7051 case ARM::ATOMIC_LOAD_SUB_I8: 7052 return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); 7053 case ARM::ATOMIC_LOAD_SUB_I16: 7054 return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); 7055 case ARM::ATOMIC_LOAD_SUB_I32: 7056 return EmitAtomicBinary(MI, BB, 4, isThumb2 ? 
ARM::t2SUBrr : ARM::SUBrr); 7057 7058 case ARM::ATOMIC_LOAD_MIN_I8: 7059 return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT); 7060 case ARM::ATOMIC_LOAD_MIN_I16: 7061 return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT); 7062 case ARM::ATOMIC_LOAD_MIN_I32: 7063 return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT); 7064 7065 case ARM::ATOMIC_LOAD_MAX_I8: 7066 return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT); 7067 case ARM::ATOMIC_LOAD_MAX_I16: 7068 return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT); 7069 case ARM::ATOMIC_LOAD_MAX_I32: 7070 return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT); 7071 7072 case ARM::ATOMIC_LOAD_UMIN_I8: 7073 return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO); 7074 case ARM::ATOMIC_LOAD_UMIN_I16: 7075 return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO); 7076 case ARM::ATOMIC_LOAD_UMIN_I32: 7077 return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO); 7078 7079 case ARM::ATOMIC_LOAD_UMAX_I8: 7080 return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI); 7081 case ARM::ATOMIC_LOAD_UMAX_I16: 7082 return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI); 7083 case ARM::ATOMIC_LOAD_UMAX_I32: 7084 return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI); 7085 7086 case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); 7087 case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); 7088 case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); 7089 7090 case ARM::ATOMIC_CMP_SWAP_I8: return EmitAtomicCmpSwap(MI, BB, 1); 7091 case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2); 7092 case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4); 7093 7094 7095 case ARM::ATOMADD6432: 7096 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr, 7097 isThumb2 ? ARM::t2ADCrr : ARM::ADCrr, 7098 /*NeedsCarry*/ true); 7099 case ARM::ATOMSUB6432: 7100 return EmitAtomicBinary64(MI, BB, isThumb2 ? 
ARM::t2SUBrr : ARM::SUBrr, 7101 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7102 /*NeedsCarry*/ true); 7103 case ARM::ATOMOR6432: 7104 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr, 7105 isThumb2 ? ARM::t2ORRrr : ARM::ORRrr); 7106 case ARM::ATOMXOR6432: 7107 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr, 7108 isThumb2 ? ARM::t2EORrr : ARM::EORrr); 7109 case ARM::ATOMAND6432: 7110 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr, 7111 isThumb2 ? ARM::t2ANDrr : ARM::ANDrr); 7112 case ARM::ATOMSWAP6432: 7113 return EmitAtomicBinary64(MI, BB, 0, 0, false); 7114 case ARM::ATOMCMPXCHG6432: 7115 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, 7116 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7117 /*NeedsCarry*/ false, /*IsCmpxchg*/true); 7118 case ARM::ATOMMIN6432: 7119 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, 7120 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7121 /*NeedsCarry*/ true, /*IsCmpxchg*/false, 7122 /*IsMinMax*/ true, ARMCC::LE); 7123 case ARM::ATOMMAX6432: 7124 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, 7125 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7126 /*NeedsCarry*/ true, /*IsCmpxchg*/false, 7127 /*IsMinMax*/ true, ARMCC::GE); 7128 case ARM::ATOMUMIN6432: 7129 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, 7130 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7131 /*NeedsCarry*/ true, /*IsCmpxchg*/false, 7132 /*IsMinMax*/ true, ARMCC::LS); 7133 case ARM::ATOMUMAX6432: 7134 return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, 7135 isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, 7136 /*NeedsCarry*/ true, /*IsCmpxchg*/false, 7137 /*IsMinMax*/ true, ARMCC::HS); 7138 7139 case ARM::tMOVCCr_pseudo: { 7140 // To "insert" a SELECT_CC instruction, we actually have to insert the 7141 // diamond control-flow pattern. 
The incoming instruction knows the 7142 // destination vreg to set, the condition code register to branch on, the 7143 // true/false values to select between, and a branch opcode to use. 7144 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7145 MachineFunction::iterator It = BB; 7146 ++It; 7147 7148 // thisMBB: 7149 // ... 7150 // TrueVal = ... 7151 // cmpTY ccX, r1, r2 7152 // bCC copy1MBB 7153 // fallthrough --> copy0MBB 7154 MachineBasicBlock *thisMBB = BB; 7155 MachineFunction *F = BB->getParent(); 7156 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7157 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7158 F->insert(It, copy0MBB); 7159 F->insert(It, sinkMBB); 7160 7161 // Transfer the remainder of BB and its successor edges to sinkMBB. 7162 sinkMBB->splice(sinkMBB->begin(), BB, 7163 llvm::next(MachineBasicBlock::iterator(MI)), 7164 BB->end()); 7165 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 7166 7167 BB->addSuccessor(copy0MBB); 7168 BB->addSuccessor(sinkMBB); 7169 7170 BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) 7171 .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); 7172 7173 // copy0MBB: 7174 // %FalseValue = ... 7175 // # fallthrough to sinkMBB 7176 BB = copy0MBB; 7177 7178 // Update machine-CFG edges 7179 BB->addSuccessor(sinkMBB); 7180 7181 // sinkMBB: 7182 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7183 // ... 7184 BB = sinkMBB; 7185 BuildMI(*BB, BB->begin(), dl, 7186 TII->get(ARM::PHI), MI->getOperand(0).getReg()) 7187 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7188 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7189 7190 MI->eraseFromParent(); // The pseudo instruction is gone now. 7191 return BB; 7192 } 7193 7194 case ARM::BCCi64: 7195 case ARM::BCCZi64: { 7196 // If there is an unconditional branch to the other successor, remove it. 
7197 BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end()); 7198 7199 // Compare both parts that make up the double comparison separately for 7200 // equality. 7201 bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; 7202 7203 unsigned LHS1 = MI->getOperand(1).getReg(); 7204 unsigned LHS2 = MI->getOperand(2).getReg(); 7205 if (RHSisZero) { 7206 AddDefaultPred(BuildMI(BB, dl, 7207 TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 7208 .addReg(LHS1).addImm(0)); 7209 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 7210 .addReg(LHS2).addImm(0) 7211 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 7212 } else { 7213 unsigned RHS1 = MI->getOperand(3).getReg(); 7214 unsigned RHS2 = MI->getOperand(4).getReg(); 7215 AddDefaultPred(BuildMI(BB, dl, 7216 TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 7217 .addReg(LHS1).addReg(RHS1)); 7218 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 7219 .addReg(LHS2).addReg(RHS2) 7220 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 7221 } 7222 7223 MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); 7224 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 7225 if (MI->getOperand(0).getImm() == ARMCC::NE) 7226 std::swap(destMBB, exitMBB); 7227 7228 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 7229 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 7230 if (isThumb2) 7231 AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB)); 7232 else 7233 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 7234 7235 MI->eraseFromParent(); // The pseudo instruction is gone now. 
7236 return BB; 7237 } 7238 7239 case ARM::Int_eh_sjlj_setjmp: 7240 case ARM::Int_eh_sjlj_setjmp_nofp: 7241 case ARM::tInt_eh_sjlj_setjmp: 7242 case ARM::t2Int_eh_sjlj_setjmp: 7243 case ARM::t2Int_eh_sjlj_setjmp_nofp: 7244 EmitSjLjDispatchBlock(MI, BB); 7245 return BB; 7246 7247 case ARM::ABS: 7248 case ARM::t2ABS: { 7249 // To insert an ABS instruction, we have to insert the 7250 // diamond control-flow pattern. The incoming instruction knows the 7251 // source vreg to test against 0, the destination vreg to set, 7252 // the condition code register to branch on, the 7253 // true/false values to select between, and a branch opcode to use. 7254 // It transforms 7255 // V1 = ABS V0 7256 // into 7257 // V2 = MOVS V0 7258 // BCC (branch to SinkBB if V0 >= 0) 7259 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 7260 // SinkBB: V1 = PHI(V2, V3) 7261 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7262 MachineFunction::iterator BBI = BB; 7263 ++BBI; 7264 MachineFunction *Fn = BB->getParent(); 7265 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 7266 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 7267 Fn->insert(BBI, RSBBB); 7268 Fn->insert(BBI, SinkBB); 7269 7270 unsigned int ABSSrcReg = MI->getOperand(1).getReg(); 7271 unsigned int ABSDstReg = MI->getOperand(0).getReg(); 7272 bool isThumb2 = Subtarget->isThumb2(); 7273 MachineRegisterInfo &MRI = Fn->getRegInfo(); 7274 // In Thumb mode S must not be specified if source register is the SP or 7275 // PC and if destination register is the SP, so restrict register class 7276 unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? 7277 (const TargetRegisterClass*)&ARM::rGPRRegClass : 7278 (const TargetRegisterClass*)&ARM::GPRRegClass); 7279 7280 // Transfer the remainder of BB and its successor edges to sinkMBB. 
    SinkBB->splice(SinkBB->begin(), BB,
                   llvm::next(MachineBasicBlock::iterator(MI)),
                   BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // fall through to SinkMBB
    RSBBB->addSuccessor(SinkBB);

    // insert a cmp at the end of BB
    AddDefaultPred(BuildMI(BB, dl,
                   TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                   .addReg(ABSSrcReg).addImm(0));

    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
    BuildMI(BB, dl,
      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // insert rsbri in RSBBB
    // Note: BCC and rsbri will be converted into predicated rsbmi
    // by if-conversion pass
    BuildMI(*RSBBB, RSBBB->begin(), dl,
      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
      .addReg(ABSSrcReg, RegState::Kill)
      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);

    // insert PHI in SinkBB,
    // reuse ABSDstReg to not change uses of ABS instruction
    BuildMI(*SinkBB, SinkBB->begin(), dl,
      TII->get(ARM::PHI), ABSDstReg)
      .addReg(NewRsbDstReg).addMBB(RSBBB)
      .addReg(ABSSrcReg).addMBB(BB);

    // remove ABS instruction
    MI->eraseFromParent();

    // return last added BB
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    // Byval struct copies are expanded into a load/store loop; count them for
    // the NumLoopByVals statistic.
    ++NumLoopByVals;
    return EmitStructByval(MI, BB);
  }
}

/// AdjustInstrPostInstrSelection - After instruction selection, fix up
/// pseudo flag-setting opcodes (marked with 'hasPostISelHook') so that the
/// optional cc_out operand correctly reflects whether CPSR is live.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
  if (!MI->hasPostISelHook()) {
    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
    return;
  }

  const MCInstrDesc *MCID = &MI->getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
  if (NewOpc) {
    const ARMBaseInstrInfo *TII =
      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
    MCID = &TII->get(NewOpc);

    // The real opcode's descriptor has exactly one extra operand: cc_out.
    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
           "converted opcode should be the same except for cc_out");

    MI->setDesc(*MCID);

    // Add the optional cc_out operand (initially noreg; filled in below when
    // CPSR is live-out of this instruction).
    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
  }
  // cc_out, when present, is always the last operand in the descriptor.
  unsigned ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      // Removing while iterating is safe only because we break immediately.
      MI->RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Value 1 of the ISel node is the flag result; its liveness must agree with
  // the dead flag on the implicit CPSR def we just removed.
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    assert(!MI->getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI->getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}

//===----------------------------------------------------------------------===//
//                           ARM Optimization Hooks
//===----------------------------------------------------------------------===//

// Helper function that checks if N is a null or all ones constant.
static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
  if (!C)
    return false;
  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
}

// Return true if N is conditionally 0 or all ones.
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
// OtherOp is set to the alternative value of N.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  switch (N->getOpcode()) {
  default: return false;
  case ISD::SELECT: {
    CC = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    // The identity constant may be in either select arm; Invert records
    // which one, so the caller can swap the generated select operands.
    if (isZeroOrAllOnes(N1, AllOnes)) {
      Invert = false;
      OtherOp = N2;
      return true;
    }
    if (isZeroOrAllOnes(N2, AllOnes)) {
      Invert = true;
      OtherOp = N1;
      return true;
    }
    return false;
  }
  case ISD::ZERO_EXTEND:
    // (zext cc) can never be the all ones value.
    if (AllOnes)
      return false;
    // Fall through.
  case ISD::SIGN_EXTEND: {
    EVT VT = N->getValueType(0);
    CC = N->getOperand(0);
    // Only an i1 condition makes the zext/sext a 0-or-1 / 0-or-(-1) value.
    if (CC.getValueType() != MVT::i1)
      return false;
    Invert = !AllOnes;
    if (AllOnes)
      // When looking for an AllOnes constant, N is an sext, and the 'other'
      // value is 0.
      OtherOp = DAG.getConstant(0, VT);
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      // When looking for a 0 constant, N can be zext or sext.
      OtherOp = DAG.getConstant(1, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
    return true;
  }
  }
}

// Combine a constant select operand into its use:
//
// (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
// (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
// (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
// (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
// (add (zext cc), x) -> (select cc (add x, 1), x)
// (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
                     CCOp, TrueVal, FalseVal);
}

// Attempt combineSelectAndUse on each operand of a commutative operator N.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  // Only fold a select that has no other users, otherwise the select still
  // has to be computed anyway and the transform is not profitable.
  if (N0.getNode()->hasOneUse()) {
    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
    if (Result.getNode())
      return Result;
  }
  if (N1.getNode()->hasOneUse()) {
    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
    if (Result.getNode())
      return Result;
  }
  return SDValue();
}

// AddCombineToVPADDL - For pair-wise add on neon, use the vpaddl instruction
// (only after legalization).
static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {

  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  // Running even index expected from N0's extracts; N1 must supply the
  // matching odd index (nextIndex+1) for a pair-wise add.
  unsigned nextIndex = 0;

  // For each operands to the ADD which are BUILD_VECTORs,
  // check to see if each of their operands are an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  // NOTE(review): this always emits the signed vpaddls intrinsic; whether the
  // unsigned form matters here is not decidable from this function alone.
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
                                TLI.getPointerTy()));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  // vpaddl produces the wider element type; truncate back to the add's type.
  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
                            widenType, &Ops[0], Ops.size());
  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
}

// Return V itself if it is a [US]MUL_LOHI node, otherwise a null SDValue.
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {

  if (Subtarget->isThumb1Only()) return SDValue();

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //
  //   loAdd   UMUL_LOHI
  //    \    / :lo    \ :hi
  //     \  /          \         (note: no trailing backslash, to avoid a
  //      ADDC   |  hiAdd         line-continuation inside this comment)
  //       \ :glue /  /
  //        \    /   /
  //          ADDE
  //
  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
  SDValue AddcOp0 = AddcNode->getOperand(0);
  SDValue AddcOp1 = AddcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcOp0.getNode() == AddcOp1.getNode())
    return SDValue();

  assert(AddcNode->getNumValues() == 2 &&
         AddcNode->getValueType(0) == MVT::i32 &&
         AddcNode->getValueType(1) == MVT::Glue &&
         "Expect ADDC with two result values: i32, glue");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  // Look for the glued ADDE.
  SDNode* AddeNode = AddcNode->getGluedUser();
  if (AddeNode == NULL)
    return SDValue();

  // Make sure it is really an ADDE.
  if (AddeNode->getOpcode() != ISD::ADDE)
    return SDValue();

  assert(AddeNode->getNumOperands() == 3 &&
         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
         "ADDE node has the wrong inputs");

  // Check for the triangle shape.
  SDValue AddeOp0 = AddeNode->getOperand(0);
  SDValue AddeOp1 = AddeNode->getOperand(1);

  // Make sure that the ADDE operands are not coming from the same node.
  if (AddeOp0.getNode() == AddeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue* HiMul = &MULOp;
  SDValue* HiAdd = NULL;
  SDValue* LoMul = NULL;
  SDValue* LowAdd = NULL;

  // hiAdd is whichever ADDE operand is NOT the mul_lohi result.
  if (IsLeftOperandMUL)
    HiAdd = &AddeOp1;
  else
    HiAdd = &AddeOp0;


  // Likewise, loAdd is the ADDC operand that is not the mul_lohi result.
  if (AddcOp0->getOpcode() == Opc) {
    LoMul = &AddcOp0;
    LowAdd = &AddcOp1;
  }
  if (AddcOp1->getOpcode() == Opc) {
    LoMul = &AddcOp1;
    LowAdd = &AddcOp0;
  }

  if (LoMul == NULL)
    return SDValue();

  // Both adds must consume values of the SAME mul_lohi node.
  if (LoMul->getNode() != HiMul->getNode())
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));
  Ops.push_back(*LowAdd);
  Ops.push_back(*HiAdd);

  SDValue MLALNode =  DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
                                 DAG.getVTList(MVT::i32, MVT::i32),
                                 &Ops[0], Ops.size());

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}

/// PerformADDCCombine - Target-specific dag combine transform from
/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
static SDValue PerformADDCCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {

  return AddCombineTo64bitMLAL(N, DCI, Subtarget);

}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1.  This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget){

  // Attempt to create vpaddl for this add.
  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
  if (Result.getNode())
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  // Only fold a single-use select; otherwise it must be computed anyway.
  if (N0.getNode()->hasOneUse()) {
    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
    if (Result.getNode()) return Result;
  }
  return SDValue();
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
  if (Result.getNode())
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}

/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
static SDValue PerformSUBCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  // SUB is not commutative, so only the right-hand operand is tried.
  if (N1.getNode()->hasOneUse()) {
    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
    if (Result.getNode()) return Result;
  }

  return SDValue();
}

/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only profitable on cores with VMLx forwarding.
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  // Find which multiply operand is the add/sub; swap so it ends up in N0.
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  EVT VT = N->getValueType(0);
  DebugLoc DL = N->getDebugLoc();
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  // (A +/- B) * C  ->  (A * C) +/- (B * C)
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  // Replace (mul x, C) with shift/add/sub sequences for suitable constants.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  // Factor out trailing zeros: mul by C == (mul by C>>tz) << tz.
  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  DebugLoc DL = N->getDebugLoc();

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1),
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1),
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
                                                    MVT::i32)));
      // Negate by subtracting from zero.
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, MVT::i32),Res);

    } else
      return SDValue();
  }

  // Re-apply the trailing-zero shift factored out above.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {

  // Attempt to use immediate-form VBIC
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;
      // VBIC clears the bits set in its immediate, so the splat is inverted.
      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, VbicVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
    if (Result.getNode())
      return Result;
  }

  return SDValue();
}

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits,
        SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
    if (Result.getNode())
      return Result;
  }

  // The code below optimizes (or (and X, Y), Z).
  // The AND operand needs to have a single user to make these optimizations
  // profitable.
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    APInt SplatBits0;
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                  HasAnyUndefs) && !HasAnyUndefs) {
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
      APInt SplatBits1;
      // The two AND masks must be exact complements of each other.
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                    HasAnyUndefs) && !HasAnyUndefs &&
          SplatBits0 == ~SplatBits1) {
        // Canonicalize the vector type to make instruction selection simpler.
        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                     N0->getOperand(1), N0->getOperand(0),
                                     N1->getOperand(0));
        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
      }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.

  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  DebugLoc DL = N->getDebugLoc();
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The value must fit entirely in the bits cleared by the mask.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down to the field's bit position for the BFI operand.
      Val >>= CountTrailingZeros_32(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, MVT::i32),
                        DAG.getConstant(Mask, MVT::i32));

      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = CountTrailingZeros_32(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = CountTrailingZeros_32(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = CountTrailingZeros_32(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, MVT::i32));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, Res, false);
  }

  return SDValue();
}

static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (!Subtarget->isThumb1Only()) {
    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
    if (Result.getNode())
      return Result;
  }

  return SDValue();
}

/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
/// the bits being cleared by the AND are not demanded by the BFI.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    // InvMask is the BFI's mask of PRESERVED bits; ~InvMask marks the field
    // actually inserted.
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = CountTrailingZeros_32(~InvMask);
    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
    unsigned Mask = (1 << Width)-1;
    unsigned Mask2 = N11C->getZExtValue();
    // Drop the AND when it only clears bits the BFI never reads.
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    DebugLoc DL = LD->getDebugLoc();
    SDValue BasePtr = LD->getBasePtr();
    // Low word at the original address...
    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
                                 LD->getPointerInfo(), LD->isVolatile(),
                                 LD->isNonTemporal(), LD->isInvariant(),
                                 LD->getAlignment());

    // ...high word 4 bytes further, chained after the first load. The second
    // load's alignment is capped at 4 (the +4 offset may reduce it).
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, MVT::i32));
    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
                                 LD->getPointerInfo(), LD->isVolatile(),
                                 LD->isNonTemporal(), LD->isInvariant(),
                                 std::min(4U, LD->getAlignment() / 2));

    // Rewire chain users of the old load, then delete it explicitly since
    // CombineTo only replaces N's uses.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    DCI.RemoveFromWorklist(LD);
    DAG.DeleteNode(LD);
    return Result;
  }

  return SDValue();
}

/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Look through bitcasts on both operands; the fold only cares about where
  // the two i32 halves originally came from.
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::BITCAST)
    Op1 = Op1.getOperand(0);
  // Both halves must be the two results of the *same* VMOVRRD node, in
  // order (result 0 then result 1); re-assembling them is then just a
  // bitcast of that node's f64 input.
  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
      Op0.getNode() == Op1.getNode() &&
      Op0.getResNo() == 0 && Op1.getResNo() == 1)
    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
                       N->getValueType(0), Op0.getOperand(0));
  return SDValue();
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  // Volatile stores must not be touched by any of the transformations below.
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
  // pack all of the elements in one place.  Next, store to memory in fewer
  // chunks.
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

    unsigned SizeRatio = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    DebugLoc DL = St->getDebugLoc();
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    // Shuffle the truncated elements down to the low end of the wide vector;
    // unused lanes are left undefined (-1).
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                DAG.getUNDEF(WideVec.getValueType()),
                                ShuffleVec.data());
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
      MVT Tp = (MVT::SimpleValueType)tp;
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
                                        TLI.getPointerTy());
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I));
      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                St->getPointerInfo(), St->isVolatile(),
                                St->isNonTemporal(), St->getAlignment());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    // Glue all partial stores together with a TokenFactor.
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
                       Chains.size());
  }

  // The remaining transformations only apply to plain (non-truncating,
  // non-indexed) stores.
  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG  &DAG = DCI.DAG;
    DebugLoc DL = St->getDebugLoc();
    SDValue BasePtr = St->getBasePtr();
    // Store the first GPR half at the base address...
    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
                                  StVal.getNode()->getOperand(0), BasePtr,
                                  St->getPointerInfo(), St->isVolatile(),
                                  St->isNonTemporal(), St->getAlignment());

    // ...and the second half at base + 4, chained after the first store.
    // The alignment of the offset access is conservatively halved (capped
    // at 4).
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
                        St->isNonTemporal(),
                        std::min(4U, St->getAlignment() / 2));
  }

  if (StVal.getValueType() != MVT::i64 ||
      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // Bitcast an i64 store extracted from a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
  SelectionDAG &DAG = DCI.DAG;
  DebugLoc dl = StVal.getDebugLoc();
  SDValue IntVec = StVal.getOperand(0);
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 IntVec.getValueType().getVectorNumElements());
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                               Vec, StVal.getOperand(1));
  // Use the store's own debug location for the final bitcast and store.
  dl = N->getDebugLoc();
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(ExtElt.getNode());
  DCI.AddToWorklist(V.getNode());
  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                      St->getPointerInfo(), St->isVolatile(),
                      St->isNonTemporal(), St->getAlignment(),
                      St->getTBAAInfo());
}

/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads.
/// If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
/// directly into a VFP register.
static bool hasNormalLoadOperand(SDNode *N) {
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    // A single qualifying load is enough to make the f64 bitcast worthwhile.
    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
      return true;
  }
  return false;
}

/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI){
  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2) {
    // Reuse the VMOVDRR combine, which handles exactly this 2-operand shape.
    SDValue RV = PerformVMOVDRRCombine(N, DAG);
    if (RV.getNode())
      return RV;
  }

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  DebugLoc dl = N->getDebugLoc();
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  // Build the vector with f64 elements, then bitcast back to the original
  // i64-element type expected by the users of N.
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}

/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
  EVT VT = N->getValueType(0);
  SDNode *Elt = N->getOperand(1).getNode();
  if (VT.getVectorElementType() != MVT::i64 ||
      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  DebugLoc dl = N->getDebugLoc();
  // Perform the insert on an f64-element vector of the same width, then
  // bitcast the result back to the original type.
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 VT.getVectorNumElements());
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(V.getNode());
  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                               Vec, V, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}

/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors.
  // That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector.  Do that
  // transformation here:
  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
  //   shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Both operands must be two-operand CONCAT_VECTORS nodes.
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  // The high half of each concat must be undef for the fold to apply.
  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
      Concat1Op1.getOpcode() != ISD::UNDEF)
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  // Elements referring to v1 keep their index; elements referring to v2
  // (which lived at [NumElts, NumElts+HalfElts) in the old operand pair)
  // move to the high half of the new single concat.  Anything else (undef
  // or a reference into the old undef halves) becomes -1.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts/2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
                              DAG.getUNDEF(VT), NewMask.data());
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
/// NEON load/store intrinsics to merge base address updates.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  // Intrinsic nodes carry (chain, intrinsic-id, addr, ...); target nodes
  // carry (chain, addr, ...).
  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle.
    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoad = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoad = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoad = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoad = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoad = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoad = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoad = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoad = false; isLaneOp = true; break;
      }
    } else {
      // VLDxDUP nodes: like lane ops, each vector contributes one element's
      // worth of memory, so reuse the isLaneOp size adjustment below.
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoad)
      VecTy = N->getValueType(0);
    else
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint64_t IncVal = CInc->getZExtValue();
      if (IncVal != NumBytes)
        continue;
    } else if (NumBytes >= 3 * 16) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // Create the new updating load/store node.
    // Result types: NumResultVecs vectors, the post-incremented i32 base
    // pointer, then the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);
    // Copy the remaining operands (vectors to store, lane number, alignment).
    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
      Ops.push_back(N->getOperand(i));
    }
    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
                                           Ops.data(), Ops.size(),
                                           MemInt->getMemoryVT(),
                                           MemInt->getMemOperand());

    // Update the uses.
    std::vector<SDValue> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i) {
      NewResults.push_back(SDValue(UpdN.getNode(), i));
    }
    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    // Replace N's results with the new node's vector results + chain, and
    // the increment ADD with the new node's post-incremented base pointer.
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    // Only fold one base update per node.
    break;
  }
  return SDValue();
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  // Result types: NumVecs vectors followed by the chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
  // Operands: incoming chain and address (the lane operands are dropped).
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
                                           Ops, 2, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    // Each VDUPLANE user is replaced directly by the corresponding
    // vldN-dup result.
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op = N->getOperand(0);

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant.  Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
    EltSize = 8;
  EVT VT = N->getValueType(0);
  if (EltSize > VT.getVectorElementType().getSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
}

// isConstVecPow2 - Return true if each vector element is a power of 2, all
// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
// On success the common element value is returned in C.
static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
{
  integerPart cN;
  integerPart c0 = 0;
  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
       I != E; I++) {
    // NOTE(review): this loop-local C (a ConstantFPSDNode*) shadows the
    // output parameter C; the result is only written through c0 at the end.
    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
    if (!C)
      return false;

    // The FP constant must convert exactly to an integer.
    bool isExact;
    APFloat APF = C->getValueAPF();
    if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
        != APFloat::opOK || !isExact)
      return false;

    // Remember the first element's value, then require every element to
    // match it and be a power of two with Log2 in [1, 32].
    c0 = (I == 0) ? cN : c0;
    if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
      return false;
  }
  C = c0;
  return true;
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = N->getOperand(0);

  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  uint64_t C;
  SDValue N0 = Op->getOperand(0);
  SDValue ConstVec = Op->getOperand(1);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;

  // The multiplier must be a splat of a power-of-2 FP constant; its log2
  // becomes the fixed-point #fbits operand of the VCVT.
  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
      !isConstVecPow2(ConstVec, isSigned, C))
    return SDValue();

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
                     N->getValueType(0),
                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
                     DAG.getConstant(Log2_64(C), MVT::i32));
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
8863 /// 8864 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 8865 /// vcvt.f32.s32 d16, d16 8866 /// vdiv.f32 d16, d17, d16 8867 /// becomes: 8868 /// vcvt.f32.s32 d16, d16, #3 8869 static SDValue PerformVDIVCombine(SDNode *N, 8870 TargetLowering::DAGCombinerInfo &DCI, 8871 const ARMSubtarget *Subtarget) { 8872 SelectionDAG &DAG = DCI.DAG; 8873 SDValue Op = N->getOperand(0); 8874 unsigned OpOpcode = Op.getNode()->getOpcode(); 8875 8876 if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || 8877 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 8878 return SDValue(); 8879 8880 uint64_t C; 8881 SDValue ConstVec = N->getOperand(1); 8882 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 8883 8884 if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || 8885 !isConstVecPow2(ConstVec, isSigned, C)) 8886 return SDValue(); 8887 8888 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 8889 Intrinsic::arm_neon_vcvtfxu2fp; 8890 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(), 8891 Op.getValueType(), 8892 DAG.getConstant(IntrinsicOpcode, MVT::i32), 8893 Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32)); 8894 } 8895 8896 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 8897 /// operand of a vector shift operation, where all the elements of the 8898 /// build_vector must have the same constant integer value. 8899 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 8900 // Ignore bit_converts. 8901 while (Op.getOpcode() == ISD::BITCAST) 8902 Op = Op.getOperand(0); 8903 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 8904 APInt SplatBits, SplatUndef; 8905 unsigned SplatBitSize; 8906 bool HasAnyUndefs; 8907 if (! BVN || ! 
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, 8908 HasAnyUndefs, ElementBits) || 8909 SplatBitSize > ElementBits) 8910 return false; 8911 Cnt = SplatBits.getSExtValue(); 8912 return true; 8913 } 8914 8915 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 8916 /// operand of a vector shift left operation. That value must be in the range: 8917 /// 0 <= Value < ElementBits for a left shift; or 8918 /// 0 <= Value <= ElementBits for a long left shift. 8919 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 8920 assert(VT.isVector() && "vector shift count is not a vector type"); 8921 unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); 8922 if (! getVShiftImm(Op, ElementBits, Cnt)) 8923 return false; 8924 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); 8925 } 8926 8927 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 8928 /// operand of a vector shift right operation. For a shift opcode, the value 8929 /// is positive, but for an intrinsic the value count must be negative. The 8930 /// absolute value must be in the range: 8931 /// 1 <= |Value| <= ElementBits for a right shift; or 8932 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 8933 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 8934 int64_t &Cnt) { 8935 assert(VT.isVector() && "vector shift count is not a vector type"); 8936 unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); 8937 if (! getVShiftImm(Op, ElementBits, Cnt)) 8938 return false; 8939 if (isIntrinsic) 8940 Cnt = -Cnt; 8941 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); 8942 } 8943 8944 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vshiftls:
  case Intrinsic::arm_neon_vshiftlu:
  case Intrinsic::arm_neon_vshiftn:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First switch: validate the shift-count build_vector and decide whether
    // this is an immediate form at all.  A vshift with a non-immediate count
    // simply falls back to the generic lowering (return SDValue()); shifts
    // whose counts *must* be immediates assert via llvm_unreachable.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHL;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                     ARMISD::VSHRs : ARMISD::VSHRu);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vshiftls:
    case Intrinsic::arm_neon_vshiftlu:
      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for vshll intrinsic");

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vshiftn:
    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second switch: map the intrinsic to the corresponding target opcode.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vshiftls:
    case Intrinsic::arm_neon_vshiftlu:
      // A long shift by exactly the element width is the special VSHLLi form.
      if (Cnt == VT.getVectorElementType().getSizeInBits())
        VShiftOpc = ARMISD::VSHLLi;
      else
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
                     ARMISD::VSHLLs : ARMISD::VSHLLu);
      break;
    case Intrinsic::arm_neon_vshiftn:
      VShiftOpc = ARMISD::VSHRN; break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRs; break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRu; break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRN; break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLs; break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLu; break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsu; break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNs; break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNu; break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsu; break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNs; break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNu; break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsu; break;
    }

    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLI;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRI;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them.  As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue N1 = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      SDValue N0 = N->getOperand(0);
      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(N0.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();

  assert(ST->hasNEON() && "unexpected vector shift");
  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
                            ARMISD::VSHRs : ARMISD::VSHRu);
      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    }
  }
  return SDValue();
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8-
  // and 16-bit vector elements.  NEON supports these directly.  They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        // ANY_EXTEND can use the cheaper zero-extending lane move.
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
    }
  }

  return SDValue();
}

/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  // If the target supports NEON, try to use vmax/vmin instructions for f32
  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
  // a NaN; only do the transformation when it matches that behavior.

  // For now only do this when using NEON for FP operations; if using VFP, it
  // is not obvious that the benefit outweighs the cost of switching to the
  // NEON pipeline.
  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
      N->getValueType(0) != MVT::f32)
    return SDValue();

  SDValue CondLHS = N->getOperand(0);
  SDValue CondRHS = N->getOperand(1);
  SDValue LHS = N->getOperand(2);
  SDValue RHS = N->getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();

  unsigned Opcode = 0;
  bool IsReversed;
  // Only handle selects whose operands are exactly the compared values,
  // in either order.
  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
    IsReversed = false; // x CC y ? x : y
  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
    IsReversed = true ; // x CC y ? y : x
  } else {
    return SDValue();
  }

  bool IsUnordered;
  switch (CC) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETLT:
  case ISD::SETLE:
  case ISD::SETULT:
  case ISD::SETULE:
    // If LHS is NaN, an ordered comparison will be false and the result will
    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
      break;
    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
    // will return -0, so vmin can only be used for unsafe math or if one of
    // the operands is known to be nonzero.
    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
        !DAG.getTarget().Options.UnsafeFPMath &&
        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
      break;
    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
    break;

  case ISD::SETOGT:
  case ISD::SETOGE:
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // If LHS is NaN, an ordered comparison will be false and the result will
    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
      break;
    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
    // will return +0, so vmax can only be used for unsafe math or if one of
    // the operands is known to be nonzero.
    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
        !DAG.getTarget().Options.UnsafeFPMath &&
        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
      break;
    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
    break;
  }

  if (!Opcode)
    return SDValue();
  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
}

/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  DebugLoc dl = N->getDebugLoc();
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    // Re-emit the comparison with the condition inverted so the false value
    // can be selected on NE.
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  if (Res.getNode()) {
    APInt KnownZero, KnownOne;
    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
    // Capture demanded bits information that would be otherwise lost.
    if (KnownZero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (KnownZero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (KnownZero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}

/// PerformDAGCombine - Central dispatch for all target-specific DAG combines;
/// routes each node opcode to the matching Perform*Combine helper.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return CombineBaseUpdate(N, DCI);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // NEON load/store intrinsics may be combined with a base-register update.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return CombineBaseUpdate(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}

/// isDesirableToTransformToIntegerOp - f32 loads/stores are cheaper done as
/// integer operations (avoids VFP/GPR transfers).
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}

/// allowsUnalignedMemoryAccesses - Return true if the target supports
/// unaligned loads/stores of the given type.
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    return AllowsUnaligned;
  case MVT::f64:
  case MVT::v2f64:
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
  }
}

/// memOpAlign - Return true if both source and destination alignments are
/// either unknown (0) or a multiple of AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

/// getOptimalMemOpType - Pick the widest legal type for expanding a
/// memcpy/memset of the given size and alignments.
EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsZeroVal,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  const Function *F = MF.getFunction();

  // See if we can use NEON instructions for this...
  if (IsZeroVal &&
      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
      Subtarget->hasNEON()) {
    if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
      return MVT::v4i32;
    } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
      return MVT::v2i32;
    }
  }

  // Lowering to i32/i16 if the size permits.
  if (Size >= 4) {
    return MVT::i32;
  } else if (Size >= 2) {
    return MVT::i16;
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

/// isZExtFree - Return true if zero-extending the loaded value to VT2 is
/// free (narrow ARM loads already zero the upper bits).
bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}

/// isLegalT1AddressImmediate - Return true if V is a legal scaled 5-bit
/// unsigned load/store offset for Thumb1.
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  case MVT::i32:
    // Scale == 4;
    Scale = 4;
    break;
  }

  // The offset must be a multiple of the access size.
  if ((V & (Scale - 1)) != 0)
    return false;
  V /= Scale;
  return V == (V & ((1LL << 5) - 1));
}

/// isLegalT2AddressImmediate - Return true if V is a legal load/store
/// offset for Thumb2 (+imm12 / -imm8, or scaled imm8 for FP).
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  bool isNeg = false;
  if (V < 0) {
    isNeg = true;
    V = - V;
  }

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // + imm12 or - imm8
    if (isNeg)
      return V == (V & ((1LL << 8) - 1));
    return V == (V & ((1LL << 12) - 1));
  case MVT::f32:
  case MVT::f64:
    // Same as ARM mode. FIXME: NEON?
    if (!Subtarget->hasVFP2())
      return false;
    if ((V & 3) != 0)
      return false;
    V >>= 2;
    return V == (V & ((1LL << 8) - 1));
  }
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = - V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return V == (V & ((1LL << 12) - 1));
  case MVT::i16:
    // +- imm8
    return V == (V & ((1LL << 8) - 1));
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2()) // FIXME: NEON?
      return false;
    if ((V & 3) != 0)
      return false;
    V >>= 2;
    return V == (V & ((1LL << 8) - 1));
  }
}

/// isLegalT2ScaledAddressingMode - Return true if the scaled register
/// addressing mode AM is legal in Thumb2 for the given type.
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // r + r
    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations.  This should be made more precise and revisited later.

    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  EVT VT = getValueType(Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  case 1:
    if (Subtarget->isThumb1Only())
      return false;
    // FALL THROUGH.
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r + r
      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations.  This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}

/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which can
/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  int64_t AbsImm = llvm::abs64(Imm);
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal(AbsImm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediate.
  return AbsImm >= 0 && AbsImm <= 255;
}

/// getARMIndexedAddressParts - Split an ADD/SUB address node into base and
/// offset usable as an ARM-mode pre/post-indexed load/store address; returns
/// false if the node cannot be represented that way.
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      // Negative constants only fit the 8-bit range; represent them as a
      // decrement of the positive value.
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // Prefer putting a shifted operand in the offset slot so the shift can
      // be folded into the addressing mode.
      ARM_AM::ShiftOpc ShOpcVal=
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}

/// getT2IndexedAddressParts - Thumb2 variant of getARMIndexedAddressParts;
/// only constant 8-bit offsets are supported.
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    int RHSC = (int)RHS->getZExtValue();
    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
      return true;
    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
      isInc = Ptr->getOpcode() == ISD::ADD;
      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
      return true;
    }
  }

  return false;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
9793 bool 9794 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 9795 SDValue &Offset, 9796 ISD::MemIndexedMode &AM, 9797 SelectionDAG &DAG) const { 9798 if (Subtarget->isThumb1Only()) 9799 return false; 9800 9801 EVT VT; 9802 SDValue Ptr; 9803 bool isSEXTLoad = false; 9804 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 9805 Ptr = LD->getBasePtr(); 9806 VT = LD->getMemoryVT(); 9807 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 9808 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 9809 Ptr = ST->getBasePtr(); 9810 VT = ST->getMemoryVT(); 9811 } else 9812 return false; 9813 9814 bool isInc; 9815 bool isLegal = false; 9816 if (Subtarget->isThumb2()) 9817 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 9818 Offset, isInc, DAG); 9819 else 9820 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 9821 Offset, isInc, DAG); 9822 if (!isLegal) 9823 return false; 9824 9825 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 9826 return true; 9827 } 9828 9829 /// getPostIndexedAddressParts - returns true by value, base pointer and 9830 /// offset pointer and addressing mode by reference if this node can be 9831 /// combined with a load / store to form a post-indexed load / store. 
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Thumb1 has no post-indexed load / store forms.
  if (Subtarget->isThumb1Only())
    return false;

  EVT VT;
  SDValue Ptr;
  bool isSEXTLoad = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool isInc;
  bool isLegal = false;
  if (Subtarget->isThumb2())
    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                       isInc, DAG);
  else
    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

/// computeMaskedBitsForTargetNode - Report known-zero/known-one bits for
/// ARM-specific nodes; only ARMISD::CMOV is handled, where a bit is known
/// iff it is known on both selected operands.
void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
    // Nothing known about the first operand; no point querying the second.
    if (KnownZero == 0 && KnownOne == 0) return;

    APInt KnownZeroRHS, KnownOneRHS;
    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
    KnownZero &= KnownZeroRHS;
    KnownOne  &= KnownOneRHS;
    return;
  }
  }
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

/// ExpandInlineAsm - Replace an inline-asm "rev $0, $1" on a 32-bit integer
/// with the bswap intrinsic so it can be optimized like regular IR.
bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::string AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
    case 'j': return C_Other; // Constant for movw.
      // An address with a single base register. Due to the way we
      // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
  } else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default: break;
    // All 'U+' constraints are addresses.
    case 'U': return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'l':
    // 'l' means low registers in Thumb mode, so a Thumb match is more
    // specific than a plain register match.
    if (type->isIntegerTy()) {
      if (Subtarget->isThumb())
        weight = CW_SpecificReg;
      else
        weight = CW_Register;
    }
    break;
  case 'w':
    if (type->isFloatingPointTy())
      weight = CW_Register;
    break;
  }
  return weight;
}

// Pair of (specific register or 0, register class) used to answer inline-asm
// constraint queries.
typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
RCPair
ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  if (Constraint.size() == 1) {
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l': // Low regs or general regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::tGPRRegClass);
      return RCPair(0U, &ARM::GPRRegClass);
    case 'h': // High regs or no regs.
      if (Subtarget->isThumb())
        return RCPair(0U, &ARM::hGPRRegClass);
      break;
    case 'r':
      return RCPair(0U, &ARM::GPRRegClass);
    case 'w':
      // VFP/NEON register of the width matching the operand type.
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPRRegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPRRegClass);
      break;
    case 'x':
      // Restricted (low) VFP/NEON registers.
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPR_8RegClass);
      if (VT.getSizeInBits() == 64)
        return RCPair(0U, &ARM::DPR_8RegClass);
      if (VT.getSizeInBits() == 128)
        return RCPair(0U, &ARM::QPR_8RegClass);
      break;
    case 't':
      if (VT == MVT::f32)
        return RCPair(0U, &ARM::SPRRegClass);
      break;
    }
  }
  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

/// LowerAsmOperandForConstraint - Lower the specified
operand into the Ops 10044 /// vector. If it is invalid, don't add anything to Ops. 10045 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10046 std::string &Constraint, 10047 std::vector<SDValue>&Ops, 10048 SelectionDAG &DAG) const { 10049 SDValue Result(0, 0); 10050 10051 // Currently only support length 1 constraints. 10052 if (Constraint.length() != 1) return; 10053 10054 char ConstraintLetter = Constraint[0]; 10055 switch (ConstraintLetter) { 10056 default: break; 10057 case 'j': 10058 case 'I': case 'J': case 'K': case 'L': 10059 case 'M': case 'N': case 'O': 10060 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 10061 if (!C) 10062 return; 10063 10064 int64_t CVal64 = C->getSExtValue(); 10065 int CVal = (int) CVal64; 10066 // None of these constraints allow values larger than 32 bits. Check 10067 // that the value fits in an int. 10068 if (CVal != CVal64) 10069 return; 10070 10071 switch (ConstraintLetter) { 10072 case 'j': 10073 // Constant suitable for movw, must be between 0 and 10074 // 65535. 10075 if (Subtarget->hasV6T2Ops()) 10076 if (CVal >= 0 && CVal <= 65535) 10077 break; 10078 return; 10079 case 'I': 10080 if (Subtarget->isThumb1Only()) { 10081 // This must be a constant between 0 and 255, for ADD 10082 // immediates. 10083 if (CVal >= 0 && CVal <= 255) 10084 break; 10085 } else if (Subtarget->isThumb2()) { 10086 // A constant that can be used as an immediate value in a 10087 // data-processing instruction. 10088 if (ARM_AM::getT2SOImmVal(CVal) != -1) 10089 break; 10090 } else { 10091 // A constant that can be used as an immediate value in a 10092 // data-processing instruction. 10093 if (ARM_AM::getSOImmVal(CVal) != -1) 10094 break; 10095 } 10096 return; 10097 10098 case 'J': 10099 if (Subtarget->isThumb()) { // FIXME thumb2 10100 // This must be a constant between -255 and -1, for negated ADD 10101 // immediates. This can be used in GCC with an "n" modifier that 10102 // prints the negated value, for use with SUB instructions. 
It is 10103 // not useful otherwise but is implemented for compatibility. 10104 if (CVal >= -255 && CVal <= -1) 10105 break; 10106 } else { 10107 // This must be a constant between -4095 and 4095. It is not clear 10108 // what this constraint is intended for. Implemented for 10109 // compatibility with GCC. 10110 if (CVal >= -4095 && CVal <= 4095) 10111 break; 10112 } 10113 return; 10114 10115 case 'K': 10116 if (Subtarget->isThumb1Only()) { 10117 // A 32-bit value where only one byte has a nonzero value. Exclude 10118 // zero to match GCC. This constraint is used by GCC internally for 10119 // constants that can be loaded with a move/shift combination. 10120 // It is not useful otherwise but is implemented for compatibility. 10121 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 10122 break; 10123 } else if (Subtarget->isThumb2()) { 10124 // A constant whose bitwise inverse can be used as an immediate 10125 // value in a data-processing instruction. This can be used in GCC 10126 // with a "B" modifier that prints the inverted value, for use with 10127 // BIC and MVN instructions. It is not useful otherwise but is 10128 // implemented for compatibility. 10129 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 10130 break; 10131 } else { 10132 // A constant whose bitwise inverse can be used as an immediate 10133 // value in a data-processing instruction. This can be used in GCC 10134 // with a "B" modifier that prints the inverted value, for use with 10135 // BIC and MVN instructions. It is not useful otherwise but is 10136 // implemented for compatibility. 10137 if (ARM_AM::getSOImmVal(~CVal) != -1) 10138 break; 10139 } 10140 return; 10141 10142 case 'L': 10143 if (Subtarget->isThumb1Only()) { 10144 // This must be a constant between -7 and 7, 10145 // for 3-operand ADD/SUB immediate instructions. 
10146 if (CVal >= -7 && CVal < 7) 10147 break; 10148 } else if (Subtarget->isThumb2()) { 10149 // A constant whose negation can be used as an immediate value in a 10150 // data-processing instruction. This can be used in GCC with an "n" 10151 // modifier that prints the negated value, for use with SUB 10152 // instructions. It is not useful otherwise but is implemented for 10153 // compatibility. 10154 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 10155 break; 10156 } else { 10157 // A constant whose negation can be used as an immediate value in a 10158 // data-processing instruction. This can be used in GCC with an "n" 10159 // modifier that prints the negated value, for use with SUB 10160 // instructions. It is not useful otherwise but is implemented for 10161 // compatibility. 10162 if (ARM_AM::getSOImmVal(-CVal) != -1) 10163 break; 10164 } 10165 return; 10166 10167 case 'M': 10168 if (Subtarget->isThumb()) { // FIXME thumb2 10169 // This must be a multiple of 4 between 0 and 1020, for 10170 // ADD sp + immediate. 10171 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 10172 break; 10173 } else { 10174 // A power of two or a constant between 0 and 32. This is used in 10175 // GCC for the shift amount on shifted register operands, but it is 10176 // useful in general for any shift amounts. 10177 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 10178 break; 10179 } 10180 return; 10181 10182 case 'N': 10183 if (Subtarget->isThumb()) { // FIXME thumb2 10184 // This must be a constant between 0 and 31, for shift amounts. 10185 if (CVal >= 0 && CVal <= 31) 10186 break; 10187 } 10188 return; 10189 10190 case 'O': 10191 if (Subtarget->isThumb()) { // FIXME thumb2 10192 // This must be a multiple of 4 between -508 and 508, for 10193 // ADD/SUB sp = sp + immediate. 
10194 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 10195 break; 10196 } 10197 return; 10198 } 10199 Result = DAG.getTargetConstant(CVal, Op.getValueType()); 10200 break; 10201 } 10202 10203 if (Result.getNode()) { 10204 Ops.push_back(Result); 10205 return; 10206 } 10207 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10208 } 10209 10210 bool 10211 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 10212 // The ARM target isn't yet aware of offsets. 10213 return false; 10214 } 10215 10216 bool ARM::isBitFieldInvertedMask(unsigned v) { 10217 if (v == 0xffffffff) 10218 return 0; 10219 // there can be 1's on either or both "outsides", all the "inside" 10220 // bits must be 0's 10221 unsigned int lsb = 0, msb = 31; 10222 while (v & (1 << msb)) --msb; 10223 while (v & (1 << lsb)) ++lsb; 10224 for (unsigned int i = lsb; i <= msb; ++i) { 10225 if (v & (1 << i)) 10226 return 0; 10227 } 10228 return 1; 10229 } 10230 10231 /// isFPImmLegal - Returns true if the target can instruction select the 10232 /// specified FP immediate natively. If false, the legalizer will 10233 /// materialize the FP immediate as a load from a constant pool. 10234 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 10235 if (!Subtarget->hasVFP3()) 10236 return false; 10237 if (VT == MVT::f32) 10238 return ARM_AM::getFP32Imm(Imm) != -1; 10239 if (VT == MVT::f64) 10240 return ARM_AM::getFP64Imm(Imm) != -1; 10241 return false; 10242 } 10243 10244 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 10245 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 10246 /// specified in the intrinsic calls. 
10247 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 10248 const CallInst &I, 10249 unsigned Intrinsic) const { 10250 switch (Intrinsic) { 10251 case Intrinsic::arm_neon_vld1: 10252 case Intrinsic::arm_neon_vld2: 10253 case Intrinsic::arm_neon_vld3: 10254 case Intrinsic::arm_neon_vld4: 10255 case Intrinsic::arm_neon_vld2lane: 10256 case Intrinsic::arm_neon_vld3lane: 10257 case Intrinsic::arm_neon_vld4lane: { 10258 Info.opc = ISD::INTRINSIC_W_CHAIN; 10259 // Conservatively set memVT to the entire set of vectors loaded. 10260 uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; 10261 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 10262 Info.ptrVal = I.getArgOperand(0); 10263 Info.offset = 0; 10264 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 10265 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 10266 Info.vol = false; // volatile loads with NEON intrinsics not supported 10267 Info.readMem = true; 10268 Info.writeMem = false; 10269 return true; 10270 } 10271 case Intrinsic::arm_neon_vst1: 10272 case Intrinsic::arm_neon_vst2: 10273 case Intrinsic::arm_neon_vst3: 10274 case Intrinsic::arm_neon_vst4: 10275 case Intrinsic::arm_neon_vst2lane: 10276 case Intrinsic::arm_neon_vst3lane: 10277 case Intrinsic::arm_neon_vst4lane: { 10278 Info.opc = ISD::INTRINSIC_VOID; 10279 // Conservatively set memVT to the entire set of vectors stored. 
10280 unsigned NumElts = 0; 10281 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 10282 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 10283 if (!ArgTy->isVectorTy()) 10284 break; 10285 NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; 10286 } 10287 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 10288 Info.ptrVal = I.getArgOperand(0); 10289 Info.offset = 0; 10290 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 10291 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 10292 Info.vol = false; // volatile stores with NEON intrinsics not supported 10293 Info.readMem = false; 10294 Info.writeMem = true; 10295 return true; 10296 } 10297 case Intrinsic::arm_strexd: { 10298 Info.opc = ISD::INTRINSIC_W_CHAIN; 10299 Info.memVT = MVT::i64; 10300 Info.ptrVal = I.getArgOperand(2); 10301 Info.offset = 0; 10302 Info.align = 8; 10303 Info.vol = true; 10304 Info.readMem = false; 10305 Info.writeMem = true; 10306 return true; 10307 } 10308 case Intrinsic::arm_ldrexd: { 10309 Info.opc = ISD::INTRINSIC_W_CHAIN; 10310 Info.memVT = MVT::i64; 10311 Info.ptrVal = I.getArgOperand(0); 10312 Info.offset = 0; 10313 Info.align = 8; 10314 Info.vol = true; 10315 Info.readMem = true; 10316 Info.writeMem = false; 10317 return true; 10318 } 10319 default: 10320 break; 10321 } 10322 10323 return false; 10324 } 10325