//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
        { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
        { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }

    // Set the correct calling convention for ARMv7k WatchOS. It's just
    // AAPCS_VFP for functions as simple as libcalls.
    if (Subtarget->isTargetWatchABI()) {
      for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
        setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  }

  for (MVT VT : MVT::vector_valuetypes()) {
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

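  // Note: the hasNEON() block below first registers the NEON D- and Q-register
  // vector types via addDRTypeForNEON/addQRTypeForNEON, then overrides the
  // default actions for operations NEON cannot perform natively (vector FP
  // division, transcendental functions, most v2f64 arithmetic, wide-element
  // CTPOP/CTTZ, etc.), so legalization either expands them or routes them
  // through custom lowering.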
  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP support any arithmetic operations on it.
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);

    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does
    // it have a FP_TO_[SU]INT instruction with a narrower destination than
    // source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8 bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->isFPOnlySP()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
    for (auto &VT : {MVT::f32, MVT::f64})
      setOperationAction(ISD::FPOWI, VT, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If target has DMB in thumb, Fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  // Thumb-1 cannot currently select ARMISD::SUBE.
  if (!Subtarget->isThumb1Only())
    setOperationAction(ISD::SETCCE, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }
  }

  // Use __sincos_stret if available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    if (!Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we use
    // a NEON instruction with an undef lane instead.
    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
  }

  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);

  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(4);

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

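  // Assuming the log2-based setMinFunctionAlignment interface of this LLVM
  // vintage, the call below requests 2-byte alignment for Thumb functions and
  // 4-byte alignment for ARM functions.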
  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER: break;
  case ARMISD::Wrapper: return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL: return "ARMISD::CALL";
  case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND: return "ARMISD::BRCOND";
  case ARMISD::BR_JT: return "ARMISD::BR_JT";
  case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
  case ARMISD::CMP: return "ARMISD::CMP";
  case ARMISD::CMN: return "ARMISD::CMN";
  case ARMISD::CMPZ: return "ARMISD::CMPZ";
  case ARMISD::CMPFP: return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT: return "ARMISD::FMSTAT";

  case ARMISD::CMOV: return "ARMISD::CMOV";

  case ARMISD::SSAT: return "ARMISD::SSAT";
  case ARMISD::USAT: return "ARMISD::USAT";

  case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
  case ARMISD::RRX: return "ARMISD::RRX";

  case ARMISD::ADDC: return "ARMISD::ADDC";
  case ARMISD::ADDE: return "ARMISD::ADDE";
  case ARMISD::SUBC: return "ARMISD::SUBC";
  case ARMISD::SUBE: return "ARMISD::SUBE";

  case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR: return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD: return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";

  case ARMISD::VCEQ: return "ARMISD::VCEQ";
  case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
  case ARMISD::VCGE: return "ARMISD::VCGE";
  case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
  case ARMISD::VCGEU: return "ARMISD::VCGEU";
  case ARMISD::VCGT: return "ARMISD::VCGT";
  case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
return "ARMISD::VCLTZ"; 1318 case ARMISD::VCGTU: return "ARMISD::VCGTU"; 1319 case ARMISD::VTST: return "ARMISD::VTST"; 1320 1321 case ARMISD::VSHL: return "ARMISD::VSHL"; 1322 case ARMISD::VSHRs: return "ARMISD::VSHRs"; 1323 case ARMISD::VSHRu: return "ARMISD::VSHRu"; 1324 case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; 1325 case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; 1326 case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; 1327 case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; 1328 case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; 1329 case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; 1330 case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; 1331 case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; 1332 case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; 1333 case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; 1334 case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; 1335 case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; 1336 case ARMISD::VSLI: return "ARMISD::VSLI"; 1337 case ARMISD::VSRI: return "ARMISD::VSRI"; 1338 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1339 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1340 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1341 case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; 1342 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1343 case ARMISD::VDUP: return "ARMISD::VDUP"; 1344 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1345 case ARMISD::VEXT: return "ARMISD::VEXT"; 1346 case ARMISD::VREV64: return "ARMISD::VREV64"; 1347 case ARMISD::VREV32: return "ARMISD::VREV32"; 1348 case ARMISD::VREV16: return "ARMISD::VREV16"; 1349 case ARMISD::VZIP: return "ARMISD::VZIP"; 1350 case ARMISD::VUZP: return "ARMISD::VUZP"; 1351 case ARMISD::VTRN: return "ARMISD::VTRN"; 1352 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1353 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1354 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1355 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1356 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1357 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1358 case ARMISD::SMLAL: return "ARMISD::SMLAL"; 1359 case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; 1360 case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; 1361 case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; 1362 case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; 1363 case ARMISD::SMULWB: return "ARMISD::SMULWB"; 1364 case ARMISD::SMULWT: return "ARMISD::SMULWT"; 1365 case ARMISD::SMLALD: return "ARMISD::SMLALD"; 1366 case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; 1367 case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; 1368 case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; 1369 case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; 1370 case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; 1371 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1372 case ARMISD::BFI: return "ARMISD::BFI"; 1373 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1374 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1375 case ARMISD::VBSL: return "ARMISD::VBSL"; 1376 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1377 case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; 1378 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1379 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1380 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1381 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1382 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1383 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1384 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1385 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1386 case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 
  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
  }
  return nullptr;
}

EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (!VT.isVector())
    return getPointerTy(DL);
  return VT.changeVectorElementTypeToInteger();
}

/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive D registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy and other memory intrinsics typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}

// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return ARM::createFastISel(funcInfo, libInfo);
}

Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
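  // (Illustrative note, not derived from any specific itinerary: VFP/NEON
  // loads on in-order cores typically have multi-cycle result latency, so
  // preferring ILP around such defs tends to pay off, while cheap
  // single-cycle defs fall back to register-pressure scheduling below.)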
1464 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1465 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1466 1467 if (MCID.getNumDefs() == 0) 1468 return Sched::RegPressure; 1469 if (!Itins->isEmpty() && 1470 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1471 return Sched::ILP; 1472 1473 return Sched::RegPressure; 1474 } 1475 1476 //===----------------------------------------------------------------------===// 1477 // Lowering Code 1478 //===----------------------------------------------------------------------===// 1479 1480 static bool isSRL16(const SDValue &Op) { 1481 if (Op.getOpcode() != ISD::SRL) 1482 return false; 1483 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1484 return Const->getZExtValue() == 16; 1485 return false; 1486 } 1487 1488 static bool isSRA16(const SDValue &Op) { 1489 if (Op.getOpcode() != ISD::SRA) 1490 return false; 1491 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1492 return Const->getZExtValue() == 16; 1493 return false; 1494 } 1495 1496 static bool isSHL16(const SDValue &Op) { 1497 if (Op.getOpcode() != ISD::SHL) 1498 return false; 1499 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1500 return Const->getZExtValue() == 16; 1501 return false; 1502 } 1503 1504 // Check for a signed 16-bit value. We special case SRA because it makes it 1505 // more simple when also looking for SRAs that aren't sign extending a 1506 // smaller value. Without the check, we'd need to take extra care with 1507 // checking order for some operations. 1508 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1509 if (isSRA16(Op)) 1510 return isSHL16(Op.getOperand(0)); 1511 return DAG.ComputeNumSignBits(Op) == 17; 1512 } 1513 1514 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1515 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1516 switch (CC) { 1517 default: llvm_unreachable("Unknown condition code!"); 1518 case ISD::SETNE: return ARMCC::NE; 1519 case ISD::SETEQ: return ARMCC::EQ; 1520 case ISD::SETGT: return ARMCC::GT; 1521 case ISD::SETGE: return ARMCC::GE; 1522 case ISD::SETLT: return ARMCC::LT; 1523 case ISD::SETLE: return ARMCC::LE; 1524 case ISD::SETUGT: return ARMCC::HI; 1525 case ISD::SETUGE: return ARMCC::HS; 1526 case ISD::SETULT: return ARMCC::LO; 1527 case ISD::SETULE: return ARMCC::LS; 1528 } 1529 } 1530 1531 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
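///
/// Illustrative mapping (the switch below is the authoritative table):
/// SETOGT maps to ARMCC::GT; SETUO (unordered) maps to ARMCC::VS, keying off
/// the V flag that FMSTAT transfers from the FP compare; and SETONE needs two
/// checks (ARMCC::MI and ARMCC::GT), which is what CondCode2 is for.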
1532 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1533 ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { 1534 CondCode2 = ARMCC::AL; 1535 InvalidOnQNaN = true; 1536 switch (CC) { 1537 default: llvm_unreachable("Unknown FP condition!"); 1538 case ISD::SETEQ: 1539 case ISD::SETOEQ: 1540 CondCode = ARMCC::EQ; 1541 InvalidOnQNaN = false; 1542 break; 1543 case ISD::SETGT: 1544 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1545 case ISD::SETGE: 1546 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1547 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1548 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1549 case ISD::SETONE: 1550 CondCode = ARMCC::MI; 1551 CondCode2 = ARMCC::GT; 1552 InvalidOnQNaN = false; 1553 break; 1554 case ISD::SETO: CondCode = ARMCC::VC; break; 1555 case ISD::SETUO: CondCode = ARMCC::VS; break; 1556 case ISD::SETUEQ: 1557 CondCode = ARMCC::EQ; 1558 CondCode2 = ARMCC::VS; 1559 InvalidOnQNaN = false; 1560 break; 1561 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1562 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1563 case ISD::SETLT: 1564 case ISD::SETULT: CondCode = ARMCC::LT; break; 1565 case ISD::SETLE: 1566 case ISD::SETULE: CondCode = ARMCC::LE; break; 1567 case ISD::SETNE: 1568 case ISD::SETUNE: 1569 CondCode = ARMCC::NE; 1570 InvalidOnQNaN = false; 1571 break; 1572 } 1573 } 1574 1575 //===----------------------------------------------------------------------===// 1576 // Calling Convention Implementation 1577 //===----------------------------------------------------------------------===// 1578 1579 #include "ARMGenCallingConv.inc" 1580 1581 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1582 /// account presence of floating point hardware and calling convention 1583 /// limitations, such as support for variadic functions. 1584 CallingConv::ID 1585 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1586 bool isVarArg) const { 1587 switch (CC) { 1588 default: 1589 report_fatal_error("Unsupported calling convention"); 1590 case CallingConv::ARM_AAPCS: 1591 case CallingConv::ARM_APCS: 1592 case CallingConv::GHC: 1593 return CC; 1594 case CallingConv::PreserveMost: 1595 return CallingConv::PreserveMost; 1596 case CallingConv::ARM_AAPCS_VFP: 1597 case CallingConv::Swift: 1598 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1599 case CallingConv::C: 1600 if (!Subtarget->isAAPCS_ABI()) 1601 return CallingConv::ARM_APCS; 1602 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1603 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1604 !isVarArg) 1605 return CallingConv::ARM_AAPCS_VFP; 1606 else 1607 return CallingConv::ARM_AAPCS; 1608 case CallingConv::Fast: 1609 case CallingConv::CXX_FAST_TLS: 1610 if (!Subtarget->isAAPCS_ABI()) { 1611 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1612 return CallingConv::Fast; 1613 return CallingConv::ARM_APCS; 1614 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1615 return CallingConv::ARM_AAPCS_VFP; 1616 else 1617 return CallingConv::ARM_AAPCS; 1618 } 1619 } 1620 1621 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1622 bool isVarArg) const { 1623 return CCAssignFnForNode(CC, false, isVarArg); 1624 } 1625 1626 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1627 bool isVarArg) const { 1628 return CCAssignFnForNode(CC, true, isVarArg); 1629 } 1630 1631 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1632 /// CallingConvention. 1633 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1634 bool Return, 1635 bool isVarArg) const { 1636 switch (getEffectiveCallingConv(CC, isVarArg)) { 1637 default: 1638 report_fatal_error("Unsupported calling convention"); 1639 case CallingConv::ARM_APCS: 1640 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1641 case CallingConv::ARM_AAPCS: 1642 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1643 case CallingConv::ARM_AAPCS_VFP: 1644 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1645 case CallingConv::Fast: 1646 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1647 case CallingConv::GHC: 1648 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1649 case CallingConv::PreserveMost: 1650 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1651 } 1652 } 1653 1654 /// LowerCallResult - Lower the result values of a call into the 1655 /// appropriate copies out of appropriate physical registers. 1656 SDValue ARMTargetLowering::LowerCallResult( 1657 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1658 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1659 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1660 SDValue ThisVal) const { 1661 // Assign locations to each value returned by this call. 1662 SmallVector<CCValAssign, 16> RVLocs; 1663 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1664 *DAG.getContext()); 1665 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1666 1667 // Copy all of the result registers out of their specified physreg. 1668 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1669 CCValAssign VA = RVLocs[i]; 1670 1671 // Pass 'this' value directly from the argument to return value, to avoid 1672 // reg unit interference 1673 if (i == 0 && isThisReturn) { 1674 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1675 "unexpected return calling convention register assignment"); 1676 InVals.push_back(ThisVal); 1677 continue; 1678 } 1679 1680 SDValue Val; 1681 if (VA.needsCustom()) { 1682 // Handle f64 or half of a v2f64. 
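      // Sketch of the common case (AAPCS soft-float register assignment
      // assumed): an f64 result arrives as the register pair {R0, R1}; the
      // two i32 CopyFromReg values below are recombined with ARMISD::VMOVDRR,
      // and a v2f64 result is rebuilt one f64 half at a time via
      // INSERT_VECTOR_ELT.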
1683 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1684 InFlag); 1685 Chain = Lo.getValue(1); 1686 InFlag = Lo.getValue(2); 1687 VA = RVLocs[++i]; // skip ahead to next loc 1688 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1689 InFlag); 1690 Chain = Hi.getValue(1); 1691 InFlag = Hi.getValue(2); 1692 if (!Subtarget->isLittle()) 1693 std::swap (Lo, Hi); 1694 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1695 1696 if (VA.getLocVT() == MVT::v2f64) { 1697 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1698 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1699 DAG.getConstant(0, dl, MVT::i32)); 1700 1701 VA = RVLocs[++i]; // skip ahead to next loc 1702 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1703 Chain = Lo.getValue(1); 1704 InFlag = Lo.getValue(2); 1705 VA = RVLocs[++i]; // skip ahead to next loc 1706 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1707 Chain = Hi.getValue(1); 1708 InFlag = Hi.getValue(2); 1709 if (!Subtarget->isLittle()) 1710 std::swap (Lo, Hi); 1711 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1712 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1713 DAG.getConstant(1, dl, MVT::i32)); 1714 } 1715 } else { 1716 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1717 InFlag); 1718 Chain = Val.getValue(1); 1719 InFlag = Val.getValue(2); 1720 } 1721 1722 switch (VA.getLocInfo()) { 1723 default: llvm_unreachable("Unknown loc info!"); 1724 case CCValAssign::Full: break; 1725 case CCValAssign::BCvt: 1726 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1727 break; 1728 } 1729 1730 InVals.push_back(Val); 1731 } 1732 1733 return Chain; 1734 } 1735 1736 /// LowerMemOpCallTo - Store the argument to the stack. 1737 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1738 SDValue Arg, const SDLoc &dl, 1739 SelectionDAG &DAG, 1740 const CCValAssign &VA, 1741 ISD::ArgFlagsTy Flags) const { 1742 unsigned LocMemOffset = VA.getLocMemOffset(); 1743 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1744 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1745 StackPtr, PtrOff); 1746 return DAG.getStore( 1747 Chain, dl, Arg, PtrOff, 1748 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 1749 } 1750 1751 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 1752 SDValue Chain, SDValue &Arg, 1753 RegsToPassVector &RegsToPass, 1754 CCValAssign &VA, CCValAssign &NextVA, 1755 SDValue &StackPtr, 1756 SmallVectorImpl<SDValue> &MemOpChains, 1757 ISD::ArgFlagsTy Flags) const { 1758 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1759 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1760 unsigned id = Subtarget->isLittle() ? 0 : 1; 1761 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 1762 1763 if (NextVA.isRegLoc()) 1764 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 1765 else { 1766 assert(NextVA.isMemLoc()); 1767 if (!StackPtr.getNode()) 1768 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 1769 getPointerTy(DAG.getDataLayout())); 1770 1771 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 1772 dl, DAG, NextVA, 1773 Flags)); 1774 } 1775 } 1776 1777 /// LowerCall - Lowering a call into a callseq_start <- 1778 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1779 /// nodes. 
1780 SDValue 1781 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1782 SmallVectorImpl<SDValue> &InVals) const { 1783 SelectionDAG &DAG = CLI.DAG; 1784 SDLoc &dl = CLI.DL; 1785 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1786 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1787 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1788 SDValue Chain = CLI.Chain; 1789 SDValue Callee = CLI.Callee; 1790 bool &isTailCall = CLI.IsTailCall; 1791 CallingConv::ID CallConv = CLI.CallConv; 1792 bool doesNotRet = CLI.DoesNotReturn; 1793 bool isVarArg = CLI.IsVarArg; 1794 1795 MachineFunction &MF = DAG.getMachineFunction(); 1796 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1797 bool isThisReturn = false; 1798 bool isSibCall = false; 1799 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); 1800 1801 // Disable tail calls if they're not supported. 1802 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 1803 isTailCall = false; 1804 1805 if (isTailCall) { 1806 // Check if it's really possible to do a tail call. 1807 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1808 isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), 1809 Outs, OutVals, Ins, DAG); 1810 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 1811 report_fatal_error("failed to perform tail call elimination on a call " 1812 "site marked musttail"); 1813 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1814 // detected sibcalls. 1815 if (isTailCall) { 1816 ++NumTailCalls; 1817 isSibCall = true; 1818 } 1819 } 1820 1821 // Analyze operands of the call, assigning locations to each operand. 1822 SmallVector<CCValAssign, 16> ArgLocs; 1823 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1824 *DAG.getContext()); 1825 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 1826 1827 // Get a count of how many bytes are to be pushed on the stack. 1828 unsigned NumBytes = CCInfo.getNextStackOffset(); 1829 1830 // For tail calls, memory operands are available in our caller's stack. 1831 if (isSibCall) 1832 NumBytes = 0; 1833 1834 // Adjust the stack pointer for the new arguments... 1835 // These operations are automatically eliminated by the prolog/epilog pass 1836 if (!isSibCall) 1837 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 1838 1839 SDValue StackPtr = 1840 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 1841 1842 RegsToPassVector RegsToPass; 1843 SmallVector<SDValue, 8> MemOpChains; 1844 1845 // Walk the register/memloc assignments, inserting copies/loads. In the case 1846 // of tail call optimization, arguments are handled later. 1847 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1848 i != e; 1849 ++i, ++realArgIdx) { 1850 CCValAssign &VA = ArgLocs[i]; 1851 SDValue Arg = OutVals[realArgIdx]; 1852 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1853 bool isByVal = Flags.isByVal(); 1854 1855 // Promote the value if needed. 
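    // Example of what the switch below does (a sketch, not an exhaustive
    // list): an i8 or i16 value passed as an i32 per the calling convention
    // is widened with SIGN_EXTEND / ZERO_EXTEND / ANY_EXTEND, and a value
    // passed in a differently-typed location is re-typed with BITCAST.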
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(0, dl, MVT::i32));
        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(1, dl, MVT::i32));

        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

        VA = ArgLocs[++i]; // skip ahead to next loc
        if (VA.isRegLoc()) {
          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
        } else {
          assert(VA.isMemLoc());

          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                 dl, DAG, VA, Flags));
        }
      } else {
        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, Flags);
      }
    } else if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
          Outs[0].VT == MVT::i32) {
        assert(VA.getLocVT() == MVT::i32 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
               "unexpected use of 'returned'");
        isThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (isByVal) {
      assert(VA.isMemLoc());
      unsigned offset = 0;

      // True if this byval aggregate will be split between registers
      // and memory.
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {

        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     DAG.InferPtrAlignment(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If the parameter size exceeds the register area, the "offset" value
        // helps us to calculate the stack slot for the remaining part properly.
1937 offset = RegEnd - RegBegin; 1938 1939 CCInfo.nextInRegsParam(); 1940 } 1941 1942 if (Flags.getByValSize() > 4*offset) { 1943 auto PtrVT = getPointerTy(DAG.getDataLayout()); 1944 unsigned LocMemOffset = VA.getLocMemOffset(); 1945 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1946 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 1947 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 1948 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 1949 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 1950 MVT::i32); 1951 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 1952 MVT::i32); 1953 1954 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 1955 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 1956 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 1957 Ops)); 1958 } 1959 } else if (!isSibCall) { 1960 assert(VA.isMemLoc()); 1961 1962 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1963 dl, DAG, VA, Flags)); 1964 } 1965 } 1966 1967 if (!MemOpChains.empty()) 1968 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 1969 1970 // Build a sequence of copy-to-reg nodes chained together with token chain 1971 // and flag operands which copy the outgoing args into the appropriate regs. 1972 SDValue InFlag; 1973 // Tail call byval lowering might overwrite argument registers so in case of 1974 // tail call optimization the copies to registers are lowered later. 1975 if (!isTailCall) 1976 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1977 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1978 RegsToPass[i].second, InFlag); 1979 InFlag = Chain.getValue(1); 1980 } 1981 1982 // For tail calls lower the arguments to the 'real' stack slot. 1983 if (isTailCall) { 1984 // Force all the incoming stack arguments to be loaded from the stack 1985 // before any new outgoing arguments are stored to the stack, because the 1986 // outgoing stack slots may alias the incoming argument stack slots, and 1987 // the alias isn't otherwise explicit. This is slightly more conservative 1988 // than necessary, because it means that each store effectively depends 1989 // on every argument instead of just those arguments it would clobber. 1990 1991 // Do not flag preceding copytoreg stuff together with the following stuff. 1992 InFlag = SDValue(); 1993 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1994 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1995 RegsToPass[i].second, InFlag); 1996 InFlag = Chain.getValue(1); 1997 } 1998 InFlag = SDValue(); 1999 } 2000 2001 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2002 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2003 // node so that legalize doesn't hack it. 
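  // A sketch of the cases handled below: with -mlong-calls the callee address
  // is materialized via a constant-pool load; otherwise a direct global call
  // becomes a TargetGlobalAddress (possibly loaded indirectly for MachO
  // non-lazy pointers, Windows dllimport, or when an indirect BLX is smaller
  // at minsize), and external symbols become TargetExternalSymbol nodes.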
2004 bool isDirect = false; 2005 2006 const TargetMachine &TM = getTargetMachine(); 2007 const Module *Mod = MF.getFunction().getParent(); 2008 const GlobalValue *GV = nullptr; 2009 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2010 GV = G->getGlobal(); 2011 bool isStub = 2012 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2013 2014 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2015 bool isLocalARMFunc = false; 2016 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2017 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2018 2019 if (Subtarget->genLongCalls()) { 2020 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2021 "long-calls codegen is not position independent!"); 2022 // Handle a global address or an external symbol. If it's not one of 2023 // those, the target's already in a register, so we don't need to do 2024 // anything extra. 2025 if (isa<GlobalAddressSDNode>(Callee)) { 2026 // Create a constant pool entry for the callee address 2027 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2028 ARMConstantPoolValue *CPV = 2029 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2030 2031 // Get the address of the callee into a register 2032 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2033 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2034 Callee = DAG.getLoad( 2035 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2036 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2037 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2038 const char *Sym = S->getSymbol(); 2039 2040 // Create a constant pool entry for the callee address 2041 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2042 ARMConstantPoolValue *CPV = 2043 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2044 ARMPCLabelIndex, 0); 2045 // Get the address of the callee into a register 2046 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2047 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2048 Callee = DAG.getLoad( 2049 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2050 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2051 } 2052 } else if (isa<GlobalAddressSDNode>(Callee)) { 2053 // If we're optimizing for minimum size and the function is called three or 2054 // more times in this block, we can improve codesize by calling indirectly 2055 // as BLXr has a 16-bit encoding. 2056 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2057 auto *BB = CLI.CS.getParent(); 2058 bool PreferIndirect = 2059 Subtarget->isThumb() && MF.getFunction().optForMinSize() && 2060 count_if(GV->users(), [&BB](const User *U) { 2061 return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; 2062 }) > 2; 2063 2064 if (!PreferIndirect) { 2065 isDirect = true; 2066 bool isDef = GV->isStrongDefinitionForLinker(); 2067 2068 // ARM call to a local ARM function is predicable. 2069 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2070 // tBX takes a register source operand. 
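      // (Sketch: pre-v5T Thumb has no BLX, so when calling through a MachO
      // stub the callee address is loaded from its non-lazy pointer below and
      // the call is made through a register.)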
2071 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2072 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2073 Callee = DAG.getNode( 2074 ARMISD::WrapperPIC, dl, PtrVt, 2075 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2076 Callee = DAG.getLoad( 2077 PtrVt, dl, DAG.getEntryNode(), Callee, 2078 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2079 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2080 MachineMemOperand::MOInvariant); 2081 } else if (Subtarget->isTargetCOFF()) { 2082 assert(Subtarget->isTargetWindows() && 2083 "Windows is the only supported COFF target"); 2084 unsigned TargetFlags = GV->hasDLLImportStorageClass() 2085 ? ARMII::MO_DLLIMPORT 2086 : ARMII::MO_NO_FLAG; 2087 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, 2088 TargetFlags); 2089 if (GV->hasDLLImportStorageClass()) 2090 Callee = 2091 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2092 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2093 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2094 } else { 2095 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2096 } 2097 } 2098 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2099 isDirect = true; 2100 // tBX takes a register source operand. 2101 const char *Sym = S->getSymbol(); 2102 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2103 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2104 ARMConstantPoolValue *CPV = 2105 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2106 ARMPCLabelIndex, 4); 2107 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2108 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2109 Callee = DAG.getLoad( 2110 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2111 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2112 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2113 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2114 } else { 2115 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2116 } 2117 } 2118 2119 // FIXME: handle tail calls differently. 2120 unsigned CallOpc; 2121 if (Subtarget->isThumb()) { 2122 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2123 CallOpc = ARMISD::CALL_NOLINK; 2124 else 2125 CallOpc = ARMISD::CALL; 2126 } else { 2127 if (!isDirect && !Subtarget->hasV5TOps()) 2128 CallOpc = ARMISD::CALL_NOLINK; 2129 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2130 // Emit regular call when code size is the priority 2131 !MF.getFunction().optForMinSize()) 2132 // "mov lr, pc; b _foo" to avoid confusing the RSP 2133 CallOpc = ARMISD::CALL_NOLINK; 2134 else 2135 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2136 } 2137 2138 std::vector<SDValue> Ops; 2139 Ops.push_back(Chain); 2140 Ops.push_back(Callee); 2141 2142 // Add argument registers to the end of the list so that they are known live 2143 // into the call. 2144 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2145 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2146 RegsToPass[i].second.getValueType())); 2147 2148 // Add a register mask operand representing the call-preserved registers. 
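  // (Illustrative only; the exact contents depend on the subtarget and
  // convention: for a normal AAPCS C call the mask covers roughly r4-r11 plus
  // d8-d15, letting later passes know which values survive the call.)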
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR regs. In that case we can't split the parameter;
  // we must send it to the stack. We also must set the NCRN to R4, so all
  // remaining registers are wasted.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // otherwise the parameter is split between registers and the stack, and the
  // end register would be r4 in this case.
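  // Worked example (a sketch, assuming no stack arguments have been assigned
  // yet so NSAA == SP): for a 20-byte byval with 8-byte alignment when R1 is
  // the next free register, R1 is wasted to reach the alignment boundary, the
  // byval starts in R2 (Excess = 8), occupies R2-R3, and its remaining
  // 12 bytes go on the stack.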
2234 unsigned ByValRegBegin = Reg; 2235 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2236 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2237 // Note, first register is allocated in the beginning of function already, 2238 // allocate remained amount of registers we need. 2239 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2240 State->AllocateReg(GPRArgRegs); 2241 // A byval parameter that is split between registers and memory needs its 2242 // size truncated here. 2243 // In the case where the entire structure fits in registers, we set the 2244 // size in memory to zero. 2245 Size = std::max<int>(Size - Excess, 0); 2246 } 2247 2248 /// MatchingStackOffset - Return true if the given stack call argument is 2249 /// already available in the same position (relatively) of the caller's 2250 /// incoming argument stack. 2251 static 2252 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2253 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2254 const TargetInstrInfo *TII) { 2255 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2256 int FI = std::numeric_limits<int>::max(); 2257 if (Arg.getOpcode() == ISD::CopyFromReg) { 2258 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2259 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2260 return false; 2261 MachineInstr *Def = MRI->getVRegDef(VR); 2262 if (!Def) 2263 return false; 2264 if (!Flags.isByVal()) { 2265 if (!TII->isLoadFromStackSlot(*Def, FI)) 2266 return false; 2267 } else { 2268 return false; 2269 } 2270 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2271 if (Flags.isByVal()) 2272 // ByVal argument is passed in as a pointer but it's now being 2273 // dereferenced. e.g. 2274 // define @foo(%struct.X* %A) { 2275 // tail call @bar(%struct.X* byval %A) 2276 // } 2277 return false; 2278 SDValue Ptr = Ld->getBasePtr(); 2279 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2280 if (!FINode) 2281 return false; 2282 FI = FINode->getIndex(); 2283 } else 2284 return false; 2285 2286 assert(FI != std::numeric_limits<int>::max()); 2287 if (!MFI.isFixedObjectIndex(FI)) 2288 return false; 2289 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2290 } 2291 2292 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2293 /// for tail call optimization. Targets which want to do tail call 2294 /// optimization should implement this function. 2295 bool 2296 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2297 CallingConv::ID CalleeCC, 2298 bool isVarArg, 2299 bool isCalleeStructRet, 2300 bool isCallerStructRet, 2301 const SmallVectorImpl<ISD::OutputArg> &Outs, 2302 const SmallVectorImpl<SDValue> &OutVals, 2303 const SmallVectorImpl<ISD::InputArg> &Ins, 2304 SelectionDAG& DAG) const { 2305 MachineFunction &MF = DAG.getMachineFunction(); 2306 const Function &CallerF = MF.getFunction(); 2307 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2308 2309 assert(Subtarget->supportsTailCall()); 2310 2311 // Tail calls to function pointers cannot be optimized for Thumb1 if the args 2312 // to the call take up r0-r3. The reason is that there are no legal registers 2313 // left to hold the pointer to the function to be called. 2314 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2315 !isa<GlobalAddressSDNode>(Callee.getNode())) 2316 return false; 2317 2318 // Look for obvious safe cases to perform tail call optimization that do not 2319 // require ABI changes. This is what gcc calls sibcall. 
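  // For instance (a sketch): a function whose body is just "return g(x);"
  // with a matching calling convention and no stack-passed pieces can reuse
  // the caller's frame and be emitted as a simple branch, provided none of
  // the checks below reject it.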
2320 2321 // Exception-handling functions need a special set of instructions to indicate 2322 // a return to the hardware. Tail-calling another function would probably 2323 // break this. 2324 if (CallerF.hasFnAttribute("interrupt")) 2325 return false; 2326 2327 // Also avoid sibcall optimization if either caller or callee uses struct 2328 // return semantics. 2329 if (isCalleeStructRet || isCallerStructRet) 2330 return false; 2331 2332 // Externally-defined functions with weak linkage should not be 2333 // tail-called on ARM when the OS does not support dynamic 2334 // pre-emption of symbols, as the AAELF spec requires normal calls 2335 // to undefined weak functions to be replaced with a NOP or jump to the 2336 // next instruction. The behaviour of branch instructions in this 2337 // situation (as used for tail calls) is implementation-defined, so we 2338 // cannot rely on the linker replacing the tail call with a return. 2339 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2340 const GlobalValue *GV = G->getGlobal(); 2341 const Triple &TT = getTargetMachine().getTargetTriple(); 2342 if (GV->hasExternalWeakLinkage() && 2343 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2344 return false; 2345 } 2346 2347 // Check that the call results are passed in the same way. 2348 LLVMContext &C = *DAG.getContext(); 2349 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2350 CCAssignFnForReturn(CalleeCC, isVarArg), 2351 CCAssignFnForReturn(CallerCC, isVarArg))) 2352 return false; 2353 // The callee has to preserve all registers the caller needs to preserve. 2354 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2355 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2356 if (CalleeCC != CallerCC) { 2357 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2358 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2359 return false; 2360 } 2361 2362 // If Caller's vararg or byval argument has been split between registers and 2363 // stack, do not perform tail call, since part of the argument is in caller's 2364 // local frame. 2365 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2366 if (AFI_Caller->getArgRegsSaveSize()) 2367 return false; 2368 2369 // If the callee takes no arguments then go on to check the results of the 2370 // call. 2371 if (!Outs.empty()) { 2372 // Check if stack adjustment is needed. For now, do not do this if any 2373 // argument is passed on the stack. 2374 SmallVector<CCValAssign, 16> ArgLocs; 2375 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2376 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2377 if (CCInfo.getNextStackOffset()) { 2378 // Check if the arguments are already laid out in the right way as 2379 // the caller's fixed stack objects. 2380 MachineFrameInfo &MFI = MF.getFrameInfo(); 2381 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2382 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2383 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2384 i != e; 2385 ++i, ++realArgIdx) { 2386 CCValAssign &VA = ArgLocs[i]; 2387 EVT RegVT = VA.getLocVT(); 2388 SDValue Arg = OutVals[realArgIdx]; 2389 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2390 if (VA.getLocInfo() == CCValAssign::Indirect) 2391 return false; 2392 if (VA.needsCustom()) { 2393 // f64 and vector types are split into multiple registers or 2394 // register/stack-slot combinations. 
The types will not match 2395 // the registers; give up on memory f64 refs until we figure 2396 // out what to do about this. 2397 if (!VA.isRegLoc()) 2398 return false; 2399 if (!ArgLocs[++i].isRegLoc()) 2400 return false; 2401 if (RegVT == MVT::v2f64) { 2402 if (!ArgLocs[++i].isRegLoc()) 2403 return false; 2404 if (!ArgLocs[++i].isRegLoc()) 2405 return false; 2406 } 2407 } else if (!VA.isRegLoc()) { 2408 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2409 MFI, MRI, TII)) 2410 return false; 2411 } 2412 } 2413 } 2414 2415 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2416 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2417 return false; 2418 } 2419 2420 return true; 2421 } 2422 2423 bool 2424 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2425 MachineFunction &MF, bool isVarArg, 2426 const SmallVectorImpl<ISD::OutputArg> &Outs, 2427 LLVMContext &Context) const { 2428 SmallVector<CCValAssign, 16> RVLocs; 2429 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2430 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2431 } 2432 2433 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2434 const SDLoc &DL, SelectionDAG &DAG) { 2435 const MachineFunction &MF = DAG.getMachineFunction(); 2436 const Function &F = MF.getFunction(); 2437 2438 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2439 2440 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2441 // version of the "preferred return address". These offsets affect the return 2442 // instruction if this is a return from PL1 without hypervisor extensions. 2443 // IRQ/FIQ: +4 "subs pc, lr, #4" 2444 // SWI: 0 "subs pc, lr, #0" 2445 // ABORT: +4 "subs pc, lr, #4" 2446 // UNDEF: +4/+2 "subs pc, lr, #0" 2447 // UNDEF varies depending on where the exception came from ARM or Thumb 2448 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2449 2450 int64_t LROffset; 2451 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2452 IntKind == "ABORT") 2453 LROffset = 4; 2454 else if (IntKind == "SWI" || IntKind == "UNDEF") 2455 LROffset = 0; 2456 else 2457 report_fatal_error("Unsupported interrupt attribute. If present, value " 2458 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2459 2460 RetOps.insert(RetOps.begin() + 1, 2461 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2462 2463 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2464 } 2465 2466 SDValue 2467 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2468 bool isVarArg, 2469 const SmallVectorImpl<ISD::OutputArg> &Outs, 2470 const SmallVectorImpl<SDValue> &OutVals, 2471 const SDLoc &dl, SelectionDAG &DAG) const { 2472 // CCValAssign - represent the assignment of the return value to a location. 2473 SmallVector<CCValAssign, 16> RVLocs; 2474 2475 // CCState - Info about the registers and stack slots. 2476 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2477 *DAG.getContext()); 2478 2479 // Analyze outgoing return values. 
2480 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2481 2482 SDValue Flag; 2483 SmallVector<SDValue, 4> RetOps; 2484 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2485 bool isLittleEndian = Subtarget->isLittle(); 2486 2487 MachineFunction &MF = DAG.getMachineFunction(); 2488 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2489 AFI->setReturnRegsCount(RVLocs.size()); 2490 2491 // Copy the result values into the output registers. 2492 for (unsigned i = 0, realRVLocIdx = 0; 2493 i != RVLocs.size(); 2494 ++i, ++realRVLocIdx) { 2495 CCValAssign &VA = RVLocs[i]; 2496 assert(VA.isRegLoc() && "Can only return in registers!"); 2497 2498 SDValue Arg = OutVals[realRVLocIdx]; 2499 bool ReturnF16 = false; 2500 2501 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2502 // Half-precision return values can be returned like this: 2503 // 2504 // t11 f16 = fadd ... 2505 // t12: i16 = bitcast t11 2506 // t13: i32 = zero_extend t12 2507 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2508 // 2509 // to avoid code generation for bitcasts, we simply set Arg to the node 2510 // that produces the f16 value, t11 in this case. 2511 // 2512 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2513 SDValue ZE = Arg.getOperand(0); 2514 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2515 SDValue BC = ZE.getOperand(0); 2516 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2517 Arg = BC.getOperand(0); 2518 ReturnF16 = true; 2519 } 2520 } 2521 } 2522 } 2523 2524 switch (VA.getLocInfo()) { 2525 default: llvm_unreachable("Unknown loc info!"); 2526 case CCValAssign::Full: break; 2527 case CCValAssign::BCvt: 2528 if (!ReturnF16) 2529 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2530 break; 2531 } 2532 2533 if (VA.needsCustom()) { 2534 if (VA.getLocVT() == MVT::v2f64) { 2535 // Extract the first half and return it in two registers. 2536 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2537 DAG.getConstant(0, dl, MVT::i32)); 2538 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2539 DAG.getVTList(MVT::i32, MVT::i32), Half); 2540 2541 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2542 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2543 Flag); 2544 Flag = Chain.getValue(1); 2545 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2546 VA = RVLocs[++i]; // skip ahead to next loc 2547 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2548 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2549 Flag); 2550 Flag = Chain.getValue(1); 2551 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2552 VA = RVLocs[++i]; // skip ahead to next loc 2553 2554 // Extract the 2nd half and fall through to handle it as an f64 value. 2555 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2556 DAG.getConstant(1, dl, MVT::i32)); 2557 } 2558 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2559 // available. 2560 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2561 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2562 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2563 fmrrd.getValue(isLittleEndian ? 0 : 1), 2564 Flag); 2565 Flag = Chain.getValue(1); 2566 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2567 VA = RVLocs[++i]; // skip ahead to next loc 2568 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2569 fmrrd.getValue(isLittleEndian ? 
1 : 0), 2570 Flag); 2571 } else 2572 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2573 2574 // Guarantee that all emitted copies are 2575 // stuck together, avoiding something bad. 2576 Flag = Chain.getValue(1); 2577 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2578 ReturnF16 ? MVT::f16 : VA.getLocVT())); 2579 } 2580 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2581 const MCPhysReg *I = 2582 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2583 if (I) { 2584 for (; *I; ++I) { 2585 if (ARM::GPRRegClass.contains(*I)) 2586 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2587 else if (ARM::DPRRegClass.contains(*I)) 2588 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2589 else 2590 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2591 } 2592 } 2593 2594 // Update chain and glue. 2595 RetOps[0] = Chain; 2596 if (Flag.getNode()) 2597 RetOps.push_back(Flag); 2598 2599 // CPUs which aren't M-class use a special sequence to return from 2600 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2601 // though we use "subs pc, lr, #N"). 2602 // 2603 // M-class CPUs actually use a normal return sequence with a special 2604 // (hardware-provided) value in LR, so the normal code path works. 2605 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 2606 !Subtarget->isMClass()) { 2607 if (Subtarget->isThumb1Only()) 2608 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2609 return LowerInterruptReturn(RetOps, dl, DAG); 2610 } 2611 2612 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2613 } 2614 2615 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2616 if (N->getNumValues() != 1) 2617 return false; 2618 if (!N->hasNUsesOfValue(1, 0)) 2619 return false; 2620 2621 SDValue TCChain = Chain; 2622 SDNode *Copy = *N->use_begin(); 2623 if (Copy->getOpcode() == ISD::CopyToReg) { 2624 // If the copy has a glue operand, we conservatively assume it isn't safe to 2625 // perform a tail call. 2626 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2627 return false; 2628 TCChain = Copy->getOperand(0); 2629 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2630 SDNode *VMov = Copy; 2631 // f64 returned in a pair of GPRs. 2632 SmallPtrSet<SDNode*, 2> Copies; 2633 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2634 UI != UE; ++UI) { 2635 if (UI->getOpcode() != ISD::CopyToReg) 2636 return false; 2637 Copies.insert(*UI); 2638 } 2639 if (Copies.size() > 2) 2640 return false; 2641 2642 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2643 UI != UE; ++UI) { 2644 SDValue UseChain = UI->getOperand(0); 2645 if (Copies.count(UseChain.getNode())) 2646 // Second CopyToReg 2647 Copy = *UI; 2648 else { 2649 // We are at the top of this chain. 2650 // If the copy has a glue operand, we conservatively assume it 2651 // isn't safe to perform a tail call. 2652 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2653 return false; 2654 // First CopyToReg 2655 TCChain = UseChain; 2656 } 2657 } 2658 } else if (Copy->getOpcode() == ISD::BITCAST) { 2659 // f32 returned in a single GPR. 
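    // (Sketch: with a soft-float return an f32 value is bitcast to i32 and
    // then copied into R0, so the expected pattern here is
    // BITCAST -> CopyToReg -> ARMISD::RET_FLAG.)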
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  if (!Subtarget->supportsTailCall())
    return false;

  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  return true;
}

// We are trying to write a 64-bit value, so we need to split it into two
// 32-bit values first and pass the low and high parts through.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(0, DL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                           DAG.getConstant(1, DL, MVT::i32));
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code, constant pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correctly with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
2743 getFunction().getParent()); 2744 auto GV = new GlobalVariable( 2745 *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, 2746 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 2747 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 2748 Twine(AFI->createPICLabelUId()) 2749 ); 2750 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 2751 dl, PtrVT); 2752 return LowerGlobalAddress(GA, DAG); 2753 } 2754 2755 if (CP->isMachineConstantPoolEntry()) 2756 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 2757 CP->getAlignment()); 2758 else 2759 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 2760 CP->getAlignment()); 2761 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 2762 } 2763 2764 unsigned ARMTargetLowering::getJumpTableEncoding() const { 2765 return MachineJumpTableInfo::EK_Inline; 2766 } 2767 2768 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 2769 SelectionDAG &DAG) const { 2770 MachineFunction &MF = DAG.getMachineFunction(); 2771 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2772 unsigned ARMPCLabelIndex = 0; 2773 SDLoc DL(Op); 2774 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2775 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 2776 SDValue CPAddr; 2777 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 2778 if (!IsPositionIndependent) { 2779 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 2780 } else { 2781 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 2782 ARMPCLabelIndex = AFI->createPICLabelUId(); 2783 ARMConstantPoolValue *CPV = 2784 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 2785 ARMCP::CPBlockAddress, PCAdj); 2786 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2787 } 2788 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 2789 SDValue Result = DAG.getLoad( 2790 PtrVT, DL, DAG.getEntryNode(), CPAddr, 2791 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2792 if (!IsPositionIndependent) 2793 return Result; 2794 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 2795 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 2796 } 2797 2798 /// \brief Convert a TLS address reference into the correct sequence of loads 2799 /// and calls to compute the variable's address for Darwin, and return an 2800 /// SDValue containing the final node. 2801 2802 /// Darwin only has one TLS scheme which must be capable of dealing with the 2803 /// fully general situation, in the worst case. This means: 2804 /// + "extern __thread" declaration. 2805 /// + Defined in a possibly unknown dynamic library. 2806 /// 2807 /// The general system is that each __thread variable has a [3 x i32] descriptor 2808 /// which contains information used by the runtime to calculate the address. The 2809 /// only part of this the compiler needs to know about is the first word, which 2810 /// contains a function pointer that must be called with the address of the 2811 /// entire descriptor in "r0". 2812 /// 2813 /// Since this descriptor may be in a different unit, in general access must 2814 /// proceed along the usual ARM rules. A common sequence to produce is: 2815 /// 2816 /// movw rT1, :lower16:_var$non_lazy_ptr 2817 /// movt rT1, :upper16:_var$non_lazy_ptr 2818 /// ldr r0, [rT1] 2819 /// ldr rT2, [r0] 2820 /// blx rT2 2821 /// [...address now in r0...] 
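///
/// The call clobbers only r0, lr and cpsr (see getTLSCallPreservedMask), so
/// other values can stay live across it.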
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // The first step is to get the address of the actual global symbol. This
  // is where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block).
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getConstant(15, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(13, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB.
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is found at the offset
  // TLS Index * 4 into the TLS array.
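  //
  // In total, the address computed below is:
  //   *( *(TEB + 0x2c) + _tls_index * 4 ) + <SECREL offset of the variable>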
2898 2899 // Load the TLS index from the C runtime 2900 SDValue TLSIndex = 2901 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 2902 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 2903 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 2904 2905 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 2906 DAG.getConstant(2, DL, MVT::i32)); 2907 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 2908 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 2909 MachinePointerInfo()); 2910 2911 // Get the offset of the start of the .tls section (section base) 2912 const auto *GA = cast<GlobalAddressSDNode>(Op); 2913 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 2914 SDValue Offset = DAG.getLoad( 2915 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 2916 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 2917 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2918 2919 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 2920 } 2921 2922 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 2923 SDValue 2924 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 2925 SelectionDAG &DAG) const { 2926 SDLoc dl(GA); 2927 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2928 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 2929 MachineFunction &MF = DAG.getMachineFunction(); 2930 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2931 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2932 ARMConstantPoolValue *CPV = 2933 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2934 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 2935 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2936 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 2937 Argument = DAG.getLoad( 2938 PtrVT, dl, DAG.getEntryNode(), Argument, 2939 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2940 SDValue Chain = Argument.getValue(1); 2941 2942 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2943 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 2944 2945 // call __tls_get_addr. 2946 ArgListTy Args; 2947 ArgListEntry Entry; 2948 Entry.Node = Argument; 2949 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 2950 Args.push_back(Entry); 2951 2952 // FIXME: is there useful debug info available here? 2953 TargetLowering::CallLoweringInfo CLI(DAG); 2954 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 2955 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 2956 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 2957 2958 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2959 return CallResult.first; 2960 } 2961 2962 // Lower ISD::GlobalTLSAddress using the "initial exec" or 2963 // "local exec" model. 2964 SDValue 2965 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 2966 SelectionDAG &DAG, 2967 TLSModel::Model model) const { 2968 const GlobalValue *GV = GA->getGlobal(); 2969 SDLoc dl(GA); 2970 SDValue Offset; 2971 SDValue Chain = DAG.getEntryNode(); 2972 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2973 // Get the Thread Pointer 2974 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2975 2976 if (model == TLSModel::InitialExec) { 2977 MachineFunction &MF = DAG.getMachineFunction(); 2978 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2979 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2980 // Initial exec model. 
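    // The GOT entry holding the variable's offset from the thread pointer is
    // addressed PC-relatively via a constant-pool entry and then loaded; the
    // loaded offset is added to the thread pointer at the end of this
    // function.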
2981 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 2982 ARMConstantPoolValue *CPV = 2983 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2984 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 2985 true); 2986 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2987 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 2988 Offset = DAG.getLoad( 2989 PtrVT, dl, Chain, Offset, 2990 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2991 Chain = Offset.getValue(1); 2992 2993 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2994 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 2995 2996 Offset = DAG.getLoad( 2997 PtrVT, dl, Chain, Offset, 2998 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2999 } else { 3000 // local exec model 3001 assert(model == TLSModel::LocalExec); 3002 ARMConstantPoolValue *CPV = 3003 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3004 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3005 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3006 Offset = DAG.getLoad( 3007 PtrVT, dl, Chain, Offset, 3008 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3009 } 3010 3011 // The address of the thread local variable is the add of the thread 3012 // pointer with the offset of the variable. 3013 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3014 } 3015 3016 SDValue 3017 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3018 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3019 if (DAG.getTarget().useEmulatedTLS()) 3020 return LowerToTLSEmulatedModel(GA, DAG); 3021 3022 if (Subtarget->isTargetDarwin()) 3023 return LowerGlobalTLSAddressDarwin(Op, DAG); 3024 3025 if (Subtarget->isTargetWindows()) 3026 return LowerGlobalTLSAddressWindows(Op, DAG); 3027 3028 // TODO: implement the "local dynamic" model 3029 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3030 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3031 3032 switch (model) { 3033 case TLSModel::GeneralDynamic: 3034 case TLSModel::LocalDynamic: 3035 return LowerToTLSGeneralDynamicModel(GA, DAG); 3036 case TLSModel::InitialExec: 3037 case TLSModel::LocalExec: 3038 return LowerToTLSExecModels(GA, DAG, model); 3039 } 3040 llvm_unreachable("bogus TLS model"); 3041 } 3042 3043 /// Return true if all users of V are within function F, looking through 3044 /// ConstantExprs. 3045 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3046 SmallVector<const User*,4> Worklist; 3047 for (auto *U : V->users()) 3048 Worklist.push_back(U); 3049 while (!Worklist.empty()) { 3050 auto *U = Worklist.pop_back_val(); 3051 if (isa<ConstantExpr>(U)) { 3052 for (auto *UU : U->users()) 3053 Worklist.push_back(UU); 3054 continue; 3055 } 3056 3057 auto *I = dyn_cast<Instruction>(U); 3058 if (!I || I->getParent()->getParent() != F) 3059 return false; 3060 } 3061 return true; 3062 } 3063 3064 /// Return true if all users of V are within some (any) function, looking through 3065 /// ConstantExprs. In other words, are there any global constant users? 
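/// A use from another global's initializer (even through a chain of
/// ConstantExprs) makes this return false.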
static bool allUsersAreInFunctions(const Value *V) {
  SmallVector<const User*,4> Worklist;
  for (auto *U : V->users())
    Worklist.push_back(U);
  while (!Worklist.empty()) {
    auto *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      for (auto *UU : U->users())
        Worklist.push_back(UU);
      continue;
    }

    if (!isa<Instruction>(U))
      return false;
  }
  return true;
}

// Return true if T is an integer, float or an array/vector of either.
static bool isSimpleType(Type *T) {
  if (T->isIntegerTy() || T->isFloatingPointTy())
    return true;
  Type *SubT = nullptr;
  if (T->isArrayTy())
    SubT = T->getArrayElementType();
  else if (T->isVectorTy())
    SubT = T->getVectorElementType();
  else
    return false;
  return SubT->isIntegerTy() || SubT->isFloatingPointTy();
}

static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant
  // pool to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled;
  // otherwise we could decide to inline here (and thus never emit the GV)
  // while fast-isel generated code still requires the GV.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // Ensure that we don't try to inline any type that contains pointers. If
  // we inline a value that contains relocations, we move the relocations from
  // .data to .text, which is not ideal.
  auto *Init = GVar->getInitializer();
  if (!isSimpleType(Init->getType()))
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type that requires alignment greater than 4 bytes. We also can only
  // promote constants that are multiples of 4 bytes in size or are paddable
  // to a multiple of 4. Currently we only try to pad constants that are
  // strings, for simplicity.
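  // For example, a 6-byte string initializer would be padded with two zero
  // bytes to form an 8-byte pool entry before being inlined.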
3138 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3139 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3140 unsigned Align = GVar->getAlignment(); 3141 unsigned RequiredPadding = 4 - (Size % 4); 3142 bool PaddingPossible = 3143 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3144 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3145 Size == 0) 3146 return SDValue(); 3147 3148 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3149 MachineFunction &MF = DAG.getMachineFunction(); 3150 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3151 3152 // We can't bloat the constant pool too much, else the ConstantIslands pass 3153 // may fail to converge. If we haven't promoted this global yet (it may have 3154 // multiple uses), and promoting it would increase the constant pool size (Sz 3155 // > 4), ensure we have space to do so up to MaxTotal. 3156 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3157 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3158 ConstpoolPromotionMaxTotal) 3159 return SDValue(); 3160 3161 // This is only valid if all users are in a single function OR it has users 3162 // in multiple functions but it no larger than a pointer. We also check if 3163 // GVar has constant (non-ConstantExpr) users. If so, it essentially has its 3164 // address taken. 3165 if (!allUsersAreInFunction(GVar, &F) && 3166 !(Size <= 4 && allUsersAreInFunctions(GVar))) 3167 return SDValue(); 3168 3169 // We're going to inline this global. Pad it out if needed. 3170 if (RequiredPadding != 4) { 3171 StringRef S = CDAInit->getAsString(); 3172 3173 SmallVector<uint8_t,16> V(S.size()); 3174 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3175 while (RequiredPadding--) 3176 V.push_back(0); 3177 Init = ConstantDataArray::get(*DAG.getContext(), V); 3178 } 3179 3180 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3181 SDValue CPAddr = 3182 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3183 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3184 AFI->markGlobalAsPromotedToConstantPool(GVar); 3185 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3186 PaddedSize - 4); 3187 } 3188 ++NumConstpoolPromoted; 3189 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3190 } 3191 3192 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3193 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3194 GV = GA->getBaseObject(); 3195 return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) || 3196 isa<Function>(GV); 3197 } 3198 3199 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3200 SelectionDAG &DAG) const { 3201 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3202 default: llvm_unreachable("unknown object format"); 3203 case Triple::COFF: 3204 return LowerGlobalAddressWindows(Op, DAG); 3205 case Triple::ELF: 3206 return LowerGlobalAddressELF(Op, DAG); 3207 case Triple::MachO: 3208 return LowerGlobalAddressDarwin(Op, DAG); 3209 } 3210 } 3211 3212 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3213 SelectionDAG &DAG) const { 3214 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3215 SDLoc dl(Op); 3216 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3217 const TargetMachine &TM = getTargetMachine(); 3218 bool IsRO = isReadOnly(GV); 3219 3220 // promoteToConstantPool only if not generating XO text section 3221 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) 
&& !Subtarget->genExecuteOnly()) 3222 if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl)) 3223 return V; 3224 3225 if (isPositionIndependent()) { 3226 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3227 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3228 UseGOT_PREL ? ARMII::MO_GOT : 0); 3229 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3230 if (UseGOT_PREL) 3231 Result = 3232 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3233 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3234 return Result; 3235 } else if (Subtarget->isROPI() && IsRO) { 3236 // PC-relative. 3237 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3238 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3239 return Result; 3240 } else if (Subtarget->isRWPI() && !IsRO) { 3241 // SB-relative. 3242 SDValue RelAddr; 3243 if (Subtarget->useMovt(DAG.getMachineFunction())) { 3244 ++NumMovwMovt; 3245 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3246 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3247 } else { // use literal pool for address constant 3248 ARMConstantPoolValue *CPV = 3249 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3250 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3251 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3252 RelAddr = DAG.getLoad( 3253 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3254 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3255 } 3256 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3257 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3258 return Result; 3259 } 3260 3261 // If we have T2 ops, we can materialize the address directly via movt/movw 3262 // pair. This is always cheaper. 3263 if (Subtarget->useMovt(DAG.getMachineFunction())) { 3264 ++NumMovwMovt; 3265 // FIXME: Once remat is capable of dealing with instructions with register 3266 // operands, expand this into two nodes. 3267 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3268 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3269 } else { 3270 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3271 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3272 return DAG.getLoad( 3273 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3274 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3275 } 3276 } 3277 3278 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3279 SelectionDAG &DAG) const { 3280 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3281 "ROPI/RWPI not currently supported for Darwin"); 3282 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3283 SDLoc dl(Op); 3284 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3285 3286 if (Subtarget->useMovt(DAG.getMachineFunction())) 3287 ++NumMovwMovt; 3288 3289 // FIXME: Once remat is capable of dealing with instructions with register 3290 // operands, expand this into multiple nodes 3291 unsigned Wrapper = 3292 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3293 3294 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3295 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3296 3297 if (Subtarget->isGVIndirectSymbol(GV)) 3298 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3299 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3300 return Result; 3301 } 3302 3303 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3304 SelectionDAG &DAG) const { 3305 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3306 assert(Subtarget->useMovt(DAG.getMachineFunction()) && 3307 "Windows on ARM expects to use movw/movt"); 3308 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3309 "ROPI/RWPI not currently supported for Windows"); 3310 3311 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3312 const ARMII::TOF TargetFlags = 3313 (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); 3314 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3315 SDValue Result; 3316 SDLoc DL(Op); 3317 3318 ++NumMovwMovt; 3319 3320 // FIXME: Once remat is capable of dealing with instructions with register 3321 // operands, expand this into two nodes. 3322 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3323 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 3324 TargetFlags)); 3325 if (GV->hasDLLImportStorageClass()) 3326 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3327 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3328 return Result; 3329 } 3330 3331 SDValue 3332 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3333 SDLoc dl(Op); 3334 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3335 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3336 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3337 Op.getOperand(1), Val); 3338 } 3339 3340 SDValue 3341 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3342 SDLoc dl(Op); 3343 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3344 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3345 } 3346 3347 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3348 SelectionDAG &DAG) const { 3349 SDLoc dl(Op); 3350 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3351 Op.getOperand(0)); 3352 } 3353 3354 SDValue 3355 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3356 const ARMSubtarget *Subtarget) const { 3357 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3358 SDLoc dl(Op); 3359 switch (IntNo) { 3360 default: return SDValue(); // Don't custom lower most intrinsics. 3361 case Intrinsic::thread_pointer: { 3362 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3363 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3364 } 3365 case Intrinsic::eh_sjlj_lsda: { 3366 MachineFunction &MF = DAG.getMachineFunction(); 3367 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3368 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3369 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3370 SDValue CPAddr; 3371 bool IsPositionIndependent = isPositionIndependent(); 3372 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 
4 : 8) : 0; 3373 ARMConstantPoolValue *CPV = 3374 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3375 ARMCP::CPLSDA, PCAdj); 3376 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3377 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3378 SDValue Result = DAG.getLoad( 3379 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3380 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3381 3382 if (IsPositionIndependent) { 3383 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3384 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3385 } 3386 return Result; 3387 } 3388 case Intrinsic::arm_neon_vabs: 3389 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3390 Op.getOperand(1)); 3391 case Intrinsic::arm_neon_vmulls: 3392 case Intrinsic::arm_neon_vmullu: { 3393 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3394 ? ARMISD::VMULLs : ARMISD::VMULLu; 3395 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3396 Op.getOperand(1), Op.getOperand(2)); 3397 } 3398 case Intrinsic::arm_neon_vminnm: 3399 case Intrinsic::arm_neon_vmaxnm: { 3400 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3401 ? ISD::FMINNUM : ISD::FMAXNUM; 3402 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3403 Op.getOperand(1), Op.getOperand(2)); 3404 } 3405 case Intrinsic::arm_neon_vminu: 3406 case Intrinsic::arm_neon_vmaxu: { 3407 if (Op.getValueType().isFloatingPoint()) 3408 return SDValue(); 3409 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3410 ? ISD::UMIN : ISD::UMAX; 3411 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3412 Op.getOperand(1), Op.getOperand(2)); 3413 } 3414 case Intrinsic::arm_neon_vmins: 3415 case Intrinsic::arm_neon_vmaxs: { 3416 // v{min,max}s is overloaded between signed integers and floats. 3417 if (!Op.getValueType().isFloatingPoint()) { 3418 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3419 ? ISD::SMIN : ISD::SMAX; 3420 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3421 Op.getOperand(1), Op.getOperand(2)); 3422 } 3423 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3424 ? ISD::FMINNAN : ISD::FMAXNAN; 3425 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3426 Op.getOperand(1), Op.getOperand(2)); 3427 } 3428 case Intrinsic::arm_neon_vtbl1: 3429 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3430 Op.getOperand(1), Op.getOperand(2)); 3431 case Intrinsic::arm_neon_vtbl2: 3432 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3433 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3434 } 3435 } 3436 3437 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3438 const ARMSubtarget *Subtarget) { 3439 SDLoc dl(Op); 3440 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3441 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3442 if (SSID == SyncScope::SingleThread) 3443 return Op; 3444 3445 if (!Subtarget->hasDataBarrier()) { 3446 // Some ARMv6 cpus can support data barriers with an mcr instruction. 3447 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3448 // here. 3449 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3450 "Unexpected ISD::ATOMIC_FENCE encountered. 
Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible
    // with Release semantics but weaker than ISH, so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}

static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM prior to v5TE and Thumb1 do not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // Only ARMv7 with the MP extension has PLDW; otherwise drop the write hint.
    return Op.getOperand(0);

  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}

static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
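  // An f64 formal argument arrives as two i32 halves: the first half is
  // always in a GPR; the second half may be in the next GPR or, if we ran out
  // of registers, in a stack slot (handled below).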
3530 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3531 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3532 3533 SDValue ArgValue2; 3534 if (NextVA.isMemLoc()) { 3535 MachineFrameInfo &MFI = MF.getFrameInfo(); 3536 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3537 3538 // Create load node to retrieve arguments from the stack. 3539 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3540 ArgValue2 = DAG.getLoad( 3541 MVT::i32, dl, Root, FIN, 3542 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3543 } else { 3544 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3545 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3546 } 3547 if (!Subtarget->isLittle()) 3548 std::swap (ArgValue, ArgValue2); 3549 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3550 } 3551 3552 // The remaining GPRs hold either the beginning of variable-argument 3553 // data, or the beginning of an aggregate passed by value (usually 3554 // byval). Either way, we allocate stack slots adjacent to the data 3555 // provided by our caller, and store the unallocated registers there. 3556 // If this is a variadic function, the va_list pointer will begin with 3557 // these values; otherwise, this reassembles a (byval) structure that 3558 // was split between registers and memory. 3559 // Return: The frame index registers were stored into. 3560 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3561 const SDLoc &dl, SDValue &Chain, 3562 const Value *OrigArg, 3563 unsigned InRegsParamRecordIdx, 3564 int ArgOffset, unsigned ArgSize) const { 3565 // Currently, two use-cases possible: 3566 // Case #1. Non-var-args function, and we meet first byval parameter. 3567 // Setup first unallocated register as first byval register; 3568 // eat all remained registers 3569 // (these two actions are performed by HandleByVal method). 3570 // Then, here, we initialize stack frame with 3571 // "store-reg" instructions. 3572 // Case #2. Var-args function, that doesn't contain byval parameters. 3573 // The same: eat all remained unallocated registers, 3574 // initialize stack frame. 3575 3576 MachineFunction &MF = DAG.getMachineFunction(); 3577 MachineFrameInfo &MFI = MF.getFrameInfo(); 3578 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3579 unsigned RBegin, REnd; 3580 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3581 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3582 } else { 3583 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3584 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3585 REnd = ARM::R4; 3586 } 3587 3588 if (REnd != RBegin) 3589 ArgOffset = -4 * (ARM::R4 - RBegin); 3590 3591 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3592 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3593 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3594 3595 SmallVector<SDValue, 4> MemOps; 3596 const TargetRegisterClass *RC = 3597 AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}

// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address after the last
  // argument passed via the stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(), 4);
  AFI->setVarArgsFrameIndex(FrameIndex);
}

SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet a byval parameter.
  // We also increase this value in the case of a varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
  // We need to know this before we allocate the first byval or variadic
  // argument, as they will be allocated a stack slot below the CFA (Canonical
  // Frame Address, the stack pointer at entry to the function).
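  // The first pass over ArgLocs below finds the lowest-numbered register used
  // by a byval (or, for varargs, unallocated) argument, so the register save
  // area can be sized as 4 * (R4 - ArgRegBegin).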
3663 unsigned ArgRegBegin = ARM::R4; 3664 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3665 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3666 break; 3667 3668 CCValAssign &VA = ArgLocs[i]; 3669 unsigned Index = VA.getValNo(); 3670 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3671 if (!Flags.isByVal()) 3672 continue; 3673 3674 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3675 unsigned RBegin, REnd; 3676 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3677 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3678 3679 CCInfo.nextInRegsParam(); 3680 } 3681 CCInfo.rewindByValRegsInfo(); 3682 3683 int lastInsIndex = -1; 3684 if (isVarArg && MFI.hasVAStart()) { 3685 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3686 if (RegIdx != array_lengthof(GPRArgRegs)) 3687 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3688 } 3689 3690 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3691 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3692 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3693 3694 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3695 CCValAssign &VA = ArgLocs[i]; 3696 if (Ins[VA.getValNo()].isOrigArg()) { 3697 std::advance(CurOrigArg, 3698 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3699 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3700 } 3701 // Arguments stored in registers. 3702 if (VA.isRegLoc()) { 3703 EVT RegVT = VA.getLocVT(); 3704 3705 if (VA.needsCustom()) { 3706 // f64 and vector types are split up into multiple registers or 3707 // combinations of registers and stack slots. 3708 if (VA.getLocVT() == MVT::v2f64) { 3709 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3710 Chain, DAG, dl); 3711 VA = ArgLocs[++i]; // skip ahead to next loc 3712 SDValue ArgValue2; 3713 if (VA.isMemLoc()) { 3714 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 3715 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3716 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 3717 MachinePointerInfo::getFixedStack( 3718 DAG.getMachineFunction(), FI)); 3719 } else { 3720 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3721 Chain, DAG, dl); 3722 } 3723 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3724 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3725 ArgValue, ArgValue1, 3726 DAG.getIntPtrConstant(0, dl)); 3727 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3728 ArgValue, ArgValue2, 3729 DAG.getIntPtrConstant(1, dl)); 3730 } else 3731 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3732 } else { 3733 const TargetRegisterClass *RC; 3734 3735 3736 if (RegVT == MVT::f16) 3737 RC = &ARM::HPRRegClass; 3738 else if (RegVT == MVT::f32) 3739 RC = &ARM::SPRRegClass; 3740 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 3741 RC = &ARM::DPRRegClass; 3742 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 3743 RC = &ARM::QPRRegClass; 3744 else if (RegVT == MVT::i32) 3745 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 3746 : &ARM::GPRRegClass; 3747 else 3748 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3749 3750 // Transform the arguments in physical registers into virtual ones. 3751 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3752 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3753 } 3754 3755 // If this is an 8 or 16-bit value, it is really passed promoted 3756 // to 32 bits. Insert an assert[sz]ext to capture this, then 3757 // truncate to the right size. 
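    // For example, an i8 argument arrives as an i32; an AssertSext or
    // AssertZext records the known extension, and a TRUNCATE narrows it back
    // to i8.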
3758 switch (VA.getLocInfo()) { 3759 default: llvm_unreachable("Unknown loc info!"); 3760 case CCValAssign::Full: break; 3761 case CCValAssign::BCvt: 3762 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3763 break; 3764 case CCValAssign::SExt: 3765 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3766 DAG.getValueType(VA.getValVT())); 3767 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3768 break; 3769 case CCValAssign::ZExt: 3770 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3771 DAG.getValueType(VA.getValVT())); 3772 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3773 break; 3774 } 3775 3776 InVals.push_back(ArgValue); 3777 } else { // VA.isRegLoc() 3778 // sanity check 3779 assert(VA.isMemLoc()); 3780 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3781 3782 int index = VA.getValNo(); 3783 3784 // Some Ins[] entries become multiple ArgLoc[] entries. 3785 // Process them only once. 3786 if (index != lastInsIndex) 3787 { 3788 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3789 // FIXME: For now, all byval parameter objects are marked mutable. 3790 // This can be changed with more analysis. 3791 // In case of tail call optimization mark all arguments mutable. 3792 // Since they could be overwritten by lowering of arguments in case of 3793 // a tail call. 3794 if (Flags.isByVal()) { 3795 assert(Ins[index].isOrigArg() && 3796 "Byval arguments cannot be implicit"); 3797 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 3798 3799 int FrameIndex = StoreByValRegs( 3800 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 3801 VA.getLocMemOffset(), Flags.getByValSize()); 3802 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 3803 CCInfo.nextInRegsParam(); 3804 } else { 3805 unsigned FIOffset = VA.getLocMemOffset(); 3806 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 3807 FIOffset, true); 3808 3809 // Create load nodes to retrieve arguments from the stack. 3810 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3811 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 3812 MachinePointerInfo::getFixedStack( 3813 DAG.getMachineFunction(), FI))); 3814 } 3815 lastInsIndex = index; 3816 } 3817 } 3818 } 3819 3820 // varargs 3821 if (isVarArg && MFI.hasVAStart()) 3822 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 3823 CCInfo.getNextStackOffset(), 3824 TotalArgRegsSaveSize); 3825 3826 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 3827 3828 return Chain; 3829 } 3830 3831 /// isFloatingPointZero - Return true if this is +0.0. 3832 static bool isFloatingPointZero(SDValue Op) { 3833 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 3834 return CFP->getValueAPF().isPosZero(); 3835 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 3836 // Maybe this has already been legalized into the constant pool? 3837 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 3838 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 3839 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 3840 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 3841 return CFP->getValueAPF().isPosZero(); 3842 } 3843 } else if (Op->getOpcode() == ISD::BITCAST && 3844 Op->getValueType(0) == MVT::f64) { 3845 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 3846 // created by LowerConstantFP(). 
3847 SDValue BitcastOp = Op->getOperand(0); 3848 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 3849 isNullConstant(BitcastOp->getOperand(0))) 3850 return true; 3851 } 3852 return false; 3853 } 3854 3855 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 3856 /// the given operands. 3857 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3858 SDValue &ARMcc, SelectionDAG &DAG, 3859 const SDLoc &dl) const { 3860 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3861 unsigned C = RHSC->getZExtValue(); 3862 if (!isLegalICmpImmediate(C)) { 3863 // Constant does not fit, try adjusting it by one? 3864 switch (CC) { 3865 default: break; 3866 case ISD::SETLT: 3867 case ISD::SETGE: 3868 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 3869 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3870 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3871 } 3872 break; 3873 case ISD::SETULT: 3874 case ISD::SETUGE: 3875 if (C != 0 && isLegalICmpImmediate(C-1)) { 3876 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3877 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3878 } 3879 break; 3880 case ISD::SETLE: 3881 case ISD::SETGT: 3882 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 3883 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 3884 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3885 } 3886 break; 3887 case ISD::SETULE: 3888 case ISD::SETUGT: 3889 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 3890 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3891 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3892 } 3893 break; 3894 } 3895 } 3896 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 3897 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 3898 // In ARM and Thumb-2, the compare instructions can shift their second 3899 // operand. 3900 CC = ISD::getSetCCSwappedOperands(CC); 3901 std::swap(LHS, RHS); 3902 } 3903 3904 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3905 ARMISD::NodeType CompareType; 3906 switch (CondCode) { 3907 default: 3908 CompareType = ARMISD::CMP; 3909 break; 3910 case ARMCC::EQ: 3911 case ARMCC::NE: 3912 // Uses only Z Flag 3913 CompareType = ARMISD::CMPZ; 3914 break; 3915 } 3916 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3917 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 3918 } 3919 3920 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 3921 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 3922 SelectionDAG &DAG, const SDLoc &dl, 3923 bool InvalidOnQNaN) const { 3924 assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); 3925 SDValue Cmp; 3926 SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); 3927 if (!isFloatingPointZero(RHS)) 3928 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); 3929 else 3930 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); 3931 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 3932 } 3933 3934 /// duplicateCmp - Glue values can have only one use, so this function 3935 /// duplicates a comparison node. 
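/// CMP and CMPZ are re-emitted directly; for FMSTAT the underlying CMPFP or
/// CMPFPw0 is duplicated first and a fresh FMSTAT is created on top of it.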
3936 SDValue 3937 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 3938 unsigned Opc = Cmp.getOpcode(); 3939 SDLoc DL(Cmp); 3940 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 3941 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3942 3943 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 3944 Cmp = Cmp.getOperand(0); 3945 Opc = Cmp.getOpcode(); 3946 if (Opc == ARMISD::CMPFP) 3947 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), 3948 Cmp.getOperand(1), Cmp.getOperand(2)); 3949 else { 3950 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 3951 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), 3952 Cmp.getOperand(1)); 3953 } 3954 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 3955 } 3956 3957 // This function returns three things: the arithmetic computation itself 3958 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 3959 // comparison and the condition code define the case in which the arithmetic 3960 // computation *does not* overflow. 3961 std::pair<SDValue, SDValue> 3962 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 3963 SDValue &ARMcc) const { 3964 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 3965 3966 SDValue Value, OverflowCmp; 3967 SDValue LHS = Op.getOperand(0); 3968 SDValue RHS = Op.getOperand(1); 3969 SDLoc dl(Op); 3970 3971 // FIXME: We are currently always generating CMPs because we don't support 3972 // generating CMN through the backend. This is not as good as the natural 3973 // CMP case because it causes a register dependency and cannot be folded 3974 // later. 3975 3976 switch (Op.getOpcode()) { 3977 default: 3978 llvm_unreachable("Unknown overflow instruction!"); 3979 case ISD::SADDO: 3980 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3981 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3982 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3983 break; 3984 case ISD::UADDO: 3985 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3986 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 3987 // We do not use it in the USUBO case as Value may not be used. 3988 Value = DAG.getNode(ARMISD::ADDC, dl, 3989 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 3990 .getValue(0); 3991 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3992 break; 3993 case ISD::SSUBO: 3994 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3995 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3996 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3997 break; 3998 case ISD::USUBO: 3999 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 4000 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 4001 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 4002 break; 4003 case ISD::UMULO: 4004 // We generate a UMUL_LOHI and then check if the high word is 0. 4005 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4006 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4007 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4008 LHS, RHS); 4009 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4010 DAG.getConstant(0, dl, MVT::i32)); 4011 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4012 break; 4013 case ISD::SMULO: 4014 // We generate a SMUL_LOHI and then check if all the bits of the high word 4015 // are the same as the sign bit of the low word. 
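    // That is, compare Hi with (Lo >> 31) (arithmetic shift); they are equal
    // exactly when the 64-bit product fits in 32 signed bits.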
4016 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4017 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4018 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4019 LHS, RHS); 4020 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4021 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4022 Value.getValue(0), 4023 DAG.getConstant(31, dl, MVT::i32))); 4024 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4025 break; 4026 } // switch (...) 4027 4028 return std::make_pair(Value, OverflowCmp); 4029 } 4030 4031 SDValue 4032 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4033 // Let legalize expand this if it isn't a legal type yet. 4034 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4035 return SDValue(); 4036 4037 SDValue Value, OverflowCmp; 4038 SDValue ARMcc; 4039 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4040 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4041 SDLoc dl(Op); 4042 // We use 0 and 1 as false and true values. 4043 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4044 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4045 EVT VT = Op.getValueType(); 4046 4047 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4048 ARMcc, CCR, OverflowCmp); 4049 4050 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4051 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4052 } 4053 4054 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4055 SelectionDAG &DAG) { 4056 SDLoc DL(BoolCarry); 4057 EVT CarryVT = BoolCarry.getValueType(); 4058 4059 // This converts the boolean value carry into the carry flag by doing 4060 // ARMISD::SUBC Carry, 1 4061 return DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(CarryVT, MVT::i32), 4062 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4063 } 4064 4065 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4066 SelectionDAG &DAG) { 4067 SDLoc DL(Flags); 4068 4069 // Now convert the carry flag into a boolean carry. We do this 4070 // using ARMISD:ADDE 0, 0, Carry 4071 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4072 DAG.getConstant(0, DL, MVT::i32), 4073 DAG.getConstant(0, DL, MVT::i32), Flags); 4074 } 4075 4076 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4077 SelectionDAG &DAG) const { 4078 // Let legalize expand this if it isn't a legal type yet. 4079 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4080 return SDValue(); 4081 4082 SDValue LHS = Op.getOperand(0); 4083 SDValue RHS = Op.getOperand(1); 4084 SDLoc dl(Op); 4085 4086 EVT VT = Op.getValueType(); 4087 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4088 SDValue Value; 4089 SDValue Overflow; 4090 switch (Op.getOpcode()) { 4091 default: 4092 llvm_unreachable("Unknown overflow instruction!"); 4093 case ISD::UADDO: 4094 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4095 // Convert the carry flag into a boolean value. 4096 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4097 break; 4098 case ISD::USUBO: { 4099 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4100 // Convert the carry flag into a boolean value. 4101 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4102 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4103 // value. So compute 1 - C. 
4104 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4105 DAG.getConstant(1, dl, MVT::i32), Overflow); 4106 break; 4107 } 4108 } 4109 4110 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4111 } 4112 4113 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4114 SDValue Cond = Op.getOperand(0); 4115 SDValue SelectTrue = Op.getOperand(1); 4116 SDValue SelectFalse = Op.getOperand(2); 4117 SDLoc dl(Op); 4118 unsigned Opc = Cond.getOpcode(); 4119 4120 if (Cond.getResNo() == 1 && 4121 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4122 Opc == ISD::USUBO)) { 4123 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4124 return SDValue(); 4125 4126 SDValue Value, OverflowCmp; 4127 SDValue ARMcc; 4128 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4129 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4130 EVT VT = Op.getValueType(); 4131 4132 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4133 OverflowCmp, DAG); 4134 } 4135 4136 // Convert: 4137 // 4138 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4139 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4140 // 4141 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4142 const ConstantSDNode *CMOVTrue = 4143 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4144 const ConstantSDNode *CMOVFalse = 4145 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4146 4147 if (CMOVTrue && CMOVFalse) { 4148 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4149 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4150 4151 SDValue True; 4152 SDValue False; 4153 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4154 True = SelectTrue; 4155 False = SelectFalse; 4156 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4157 True = SelectFalse; 4158 False = SelectTrue; 4159 } 4160 4161 if (True.getNode() && False.getNode()) { 4162 EVT VT = Op.getValueType(); 4163 SDValue ARMcc = Cond.getOperand(2); 4164 SDValue CCR = Cond.getOperand(3); 4165 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4166 assert(True.getValueType() == VT); 4167 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4168 } 4169 } 4170 } 4171 4172 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4173 // undefined bits before doing a full-word comparison with zero. 4174 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4175 DAG.getConstant(1, dl, Cond.getValueType())); 4176 4177 return DAG.getSelectCC(dl, Cond, 4178 DAG.getConstant(0, dl, Cond.getValueType()), 4179 SelectTrue, SelectFalse, ISD::SETNE); 4180 } 4181 4182 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4183 bool &swpCmpOps, bool &swpVselOps) { 4184 // Start by selecting the GE condition code for opcodes that return true for 4185 // 'equality' 4186 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4187 CC == ISD::SETULE) 4188 CondCode = ARMCC::GE; 4189 4190 // and GT for opcodes that return false for 'equality'. 4191 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4192 CC == ISD::SETULT) 4193 CondCode = ARMCC::GT; 4194 4195 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4196 // to swap the compare operands. 4197 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4198 CC == ISD::SETULT) 4199 swpCmpOps = true; 4200 4201 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 
4202 // If we have an unordered opcode, we need to swap the operands to the VSEL 4203 // instruction (effectively negating the condition). 4204 // 4205 // This also has the effect of swapping which one of 'less' or 'greater' 4206 // returns true, so we also swap the compare operands. It also switches 4207 // whether we return true for 'equality', so we compensate by picking the 4208 // opposite condition code to our original choice. 4209 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4210 CC == ISD::SETUGT) { 4211 swpCmpOps = !swpCmpOps; 4212 swpVselOps = !swpVselOps; 4213 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4214 } 4215 4216 // 'ordered' is 'anything but unordered', so use the VS condition code and 4217 // swap the VSEL operands. 4218 if (CC == ISD::SETO) { 4219 CondCode = ARMCC::VS; 4220 swpVselOps = true; 4221 } 4222 4223 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4224 // code and swap the VSEL operands. 4225 if (CC == ISD::SETUNE) { 4226 CondCode = ARMCC::EQ; 4227 swpVselOps = true; 4228 } 4229 } 4230 4231 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4232 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4233 SDValue Cmp, SelectionDAG &DAG) const { 4234 if (Subtarget->isFPOnlySP() && VT == MVT::f64) { 4235 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4236 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4237 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4238 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4239 4240 SDValue TrueLow = TrueVal.getValue(0); 4241 SDValue TrueHigh = TrueVal.getValue(1); 4242 SDValue FalseLow = FalseVal.getValue(0); 4243 SDValue FalseHigh = FalseVal.getValue(1); 4244 4245 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4246 ARMcc, CCR, Cmp); 4247 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4248 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4249 4250 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4251 } else { 4252 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4253 Cmp); 4254 } 4255 } 4256 4257 static bool isGTorGE(ISD::CondCode CC) { 4258 return CC == ISD::SETGT || CC == ISD::SETGE; 4259 } 4260 4261 static bool isLTorLE(ISD::CondCode CC) { 4262 return CC == ISD::SETLT || CC == ISD::SETLE; 4263 } 4264 4265 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4266 // All of these conditions (and their <= and >= counterparts) will do: 4267 // x < k ? k : x 4268 // x > k ? x : k 4269 // k < x ? x : k 4270 // k > x ? k : x 4271 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4272 const SDValue TrueVal, const SDValue FalseVal, 4273 const ISD::CondCode CC, const SDValue K) { 4274 return (isGTorGE(CC) && 4275 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4276 (isLTorLE(CC) && 4277 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4278 } 4279 4280 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 4281 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4282 const SDValue TrueVal, const SDValue FalseVal, 4283 const ISD::CondCode CC, const SDValue K) { 4284 return (isGTorGE(CC) && 4285 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4286 (isLTorLE(CC) && 4287 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4288 } 4289 4290 // Check if two chained conditionals could be converted into SSAT or USAT. 
4291 // 4292 // SSAT can replace a set of two conditional selectors that bound a number to an 4293 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4294 // 4295 // x < -k ? -k : (x > k ? k : x) 4296 // x < -k ? -k : (x < k ? x : k) 4297 // x > -k ? (x > k ? k : x) : -k 4298 // x < k ? (x < -k ? -k : x) : k 4299 // etc. 4300 // 4301 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is 4302 // a power of 2. 4303 // 4304 // It returns true if the conversion can be done, false otherwise. 4305 // Additionally, the variable is returned in parameter V, the constant in K and 4306 // usat is set to true if the conditional represents an unsigned saturation 4307 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4308 uint64_t &K, bool &usat) { 4309 SDValue LHS1 = Op.getOperand(0); 4310 SDValue RHS1 = Op.getOperand(1); 4311 SDValue TrueVal1 = Op.getOperand(2); 4312 SDValue FalseVal1 = Op.getOperand(3); 4313 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4314 4315 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4316 if (Op2.getOpcode() != ISD::SELECT_CC) 4317 return false; 4318 4319 SDValue LHS2 = Op2.getOperand(0); 4320 SDValue RHS2 = Op2.getOperand(1); 4321 SDValue TrueVal2 = Op2.getOperand(2); 4322 SDValue FalseVal2 = Op2.getOperand(3); 4323 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4324 4325 // Find out which are the constants and which are the variables 4326 // in each conditional 4327 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4328 ? &RHS1 4329 : nullptr; 4330 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4331 ? &RHS2 4332 : nullptr; 4333 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4334 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4335 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4336 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4337 4338 // We must detect cases where the original operations worked with 16- or 4339 // 8-bit values. In such case, V2Tmp != V2 because the comparison operations 4340 // must work with sign-extended values but the select operations return 4341 // the original non-extended value. 4342 SDValue V2TmpReg = V2Tmp; 4343 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4344 V2TmpReg = V2Tmp->getOperand(0); 4345 4346 // Check that the registers and the constants have the correct values 4347 // in both conditionals 4348 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4349 V2TmpReg != V2) 4350 return false; 4351 4352 // Figure out which conditional is saturating the lower/upper bound. 4353 const SDValue *LowerCheckOp = 4354 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4355 ? &Op 4356 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4357 ? &Op2 4358 : nullptr; 4359 const SDValue *UpperCheckOp = 4360 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4361 ? &Op 4362 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4363 ? &Op2 4364 : nullptr; 4365 4366 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4367 return false; 4368 4369 // Check that the constant in the lower-bound check is 4370 // the opposite of the constant in the upper-bound check 4371 // in 1's complement. 
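  // For example, saturating to [-128, 127] uses the constants 127 and -128,
  // and -128 == ~127. For USAT the lower bound is 0 instead (e.g. [0, 255],
  // where 255 + 1 is a power of 2).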
4372 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4373 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4374 int64_t PosVal = std::max(Val1, Val2); 4375 int64_t NegVal = std::min(Val1, Val2); 4376 4377 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4378 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4379 isPowerOf2_64(PosVal + 1)) { 4380 4381 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4382 if (Val1 == ~Val2) 4383 usat = false; 4384 else if (NegVal == 0) 4385 usat = true; 4386 else 4387 return false; 4388 4389 V = V2; 4390 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4391 4392 return true; 4393 } 4394 4395 return false; 4396 } 4397 4398 // Check if a condition of the type x < k ? k : x can be converted into a 4399 // bit operation instead of conditional moves. 4400 // Currently this is allowed given: 4401 // - The conditions and values match up 4402 // - k is 0 or -1 (all ones) 4403 // This function will not check the last condition, thats up to the caller 4404 // It returns true if the transformation can be made, and in such case 4405 // returns x in V, and k in SatK. 4406 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 4407 SDValue &SatK) 4408 { 4409 SDValue LHS = Op.getOperand(0); 4410 SDValue RHS = Op.getOperand(1); 4411 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4412 SDValue TrueVal = Op.getOperand(2); 4413 SDValue FalseVal = Op.getOperand(3); 4414 4415 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 4416 ? &RHS 4417 : nullptr; 4418 4419 // No constant operation in comparison, early out 4420 if (!K) 4421 return false; 4422 4423 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 4424 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 4425 SDValue VTmp = (K && *K == LHS) ? RHS : LHS; 4426 4427 // If the constant on left and right side, or variable on left and right, 4428 // does not match, early out 4429 if (*K != KTmp || V != VTmp) 4430 return false; 4431 4432 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 4433 SatK = *K; 4434 return true; 4435 } 4436 4437 return false; 4438 } 4439 4440 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4441 EVT VT = Op.getValueType(); 4442 SDLoc dl(Op); 4443 4444 // Try to convert two saturating conditional selects into a single SSAT 4445 SDValue SatValue; 4446 uint64_t SatConstant; 4447 bool SatUSat; 4448 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4449 isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { 4450 if (SatUSat) 4451 return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, 4452 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4453 else 4454 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4455 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4456 } 4457 4458 // Try to convert expressions of the form x < k ? k : x (and similar forms) 4459 // into more efficient bit operations, which is possible when k is 0 or -1 4460 // On ARM and Thumb-2 which have flexible operand 2 this will result in 4461 // single instructions. On Thumb the shift and the bit operation will be two 4462 // instructions. 
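  // For example, max(x, 0) lowers to x & ~(x >> 31), and max(x, -1) lowers
  // to x | (x >> 31).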
4463 // Only allow this transformation on full-width (32-bit) operations 4464 SDValue LowerSatConstant; 4465 if (VT == MVT::i32 && 4466 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 4467 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 4468 DAG.getConstant(31, dl, VT)); 4469 if (isNullConstant(LowerSatConstant)) { 4470 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 4471 DAG.getAllOnesConstant(dl, VT)); 4472 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 4473 } else if (isAllOnesConstant(LowerSatConstant)) 4474 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 4475 } 4476 4477 SDValue LHS = Op.getOperand(0); 4478 SDValue RHS = Op.getOperand(1); 4479 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4480 SDValue TrueVal = Op.getOperand(2); 4481 SDValue FalseVal = Op.getOperand(3); 4482 4483 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4484 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4485 dl); 4486 4487 // If softenSetCCOperands only returned one value, we should compare it to 4488 // zero. 4489 if (!RHS.getNode()) { 4490 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4491 CC = ISD::SETNE; 4492 } 4493 } 4494 4495 if (LHS.getValueType() == MVT::i32) { 4496 // Try to generate VSEL on ARMv8. 4497 // The VSEL instruction can't use all the usual ARM condition 4498 // codes: it only has two bits to select the condition code, so it's 4499 // constrained to use only GE, GT, VS and EQ. 4500 // 4501 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 4502 // swap the operands of the previous compare instruction (effectively 4503 // inverting the compare condition, swapping 'less' and 'greater') and 4504 // sometimes need to swap the operands to the VSEL (which inverts the 4505 // condition in the sense of firing whenever the previous condition didn't) 4506 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || 4507 TrueVal.getValueType() == MVT::f64)) { 4508 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4509 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 4510 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 4511 CC = ISD::getSetCCInverse(CC, true); 4512 std::swap(TrueVal, FalseVal); 4513 } 4514 } 4515 4516 SDValue ARMcc; 4517 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4518 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4519 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4520 } 4521 4522 ARMCC::CondCodes CondCode, CondCode2; 4523 bool InvalidOnQNaN; 4524 FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); 4525 4526 // Normalize the fp compare. If RHS is zero we keep it there so we match 4527 // CMPFPw0 instead of CMPFP. 
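  // As noted below for the integer case, VSEL can only encode GE, GT, VS and
  // EQ, so e.g. (select (setolt a, b), x, y) is emitted as a GT compare of
  // (b, a) from which a VSELGT can pick x.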
4528 if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) && 4529 (TrueVal.getValueType() == MVT::f16 || 4530 TrueVal.getValueType() == MVT::f32 || 4531 TrueVal.getValueType() == MVT::f64)) { 4532 bool swpCmpOps = false; 4533 bool swpVselOps = false; 4534 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 4535 4536 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 4537 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 4538 if (swpCmpOps) 4539 std::swap(LHS, RHS); 4540 if (swpVselOps) 4541 std::swap(TrueVal, FalseVal); 4542 } 4543 } 4544 4545 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4546 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4547 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4548 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4549 if (CondCode2 != ARMCC::AL) { 4550 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 4551 // FIXME: Needs another CMP because flag can have but one use. 4552 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4553 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 4554 } 4555 return Result; 4556 } 4557 4558 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 4559 /// to morph to an integer compare sequence. 4560 static bool canChangeToInt(SDValue Op, bool &SeenZero, 4561 const ARMSubtarget *Subtarget) { 4562 SDNode *N = Op.getNode(); 4563 if (!N->hasOneUse()) 4564 // Otherwise it requires moving the value from fp to integer registers. 4565 return false; 4566 if (!N->getNumValues()) 4567 return false; 4568 EVT VT = Op.getValueType(); 4569 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 4570 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 4571 // vmrs are very slow, e.g. cortex-a8. 4572 return false; 4573 4574 if (isFloatingPointZero(Op)) { 4575 SeenZero = true; 4576 return true; 4577 } 4578 return ISD::isNormalLoad(N); 4579 } 4580 4581 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 4582 if (isFloatingPointZero(Op)) 4583 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 4584 4585 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 4586 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 4587 Ld->getPointerInfo(), Ld->getAlignment(), 4588 Ld->getMemOperand()->getFlags()); 4589 4590 llvm_unreachable("Unknown VFP cmp argument!"); 4591 } 4592 4593 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 4594 SDValue &RetVal1, SDValue &RetVal2) { 4595 SDLoc dl(Op); 4596 4597 if (isFloatingPointZero(Op)) { 4598 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 4599 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 4600 return; 4601 } 4602 4603 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 4604 SDValue Ptr = Ld->getBasePtr(); 4605 RetVal1 = 4606 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 4607 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 4608 4609 EVT PtrType = Ptr.getValueType(); 4610 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 4611 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 4612 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 4613 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 4614 Ld->getPointerInfo().getWithOffset(4), NewAlign, 4615 Ld->getMemOperand()->getFlags()); 4616 return; 4617 } 4618 4619 llvm_unreachable("Unknown VFP cmp argument!"); 4620 } 4621 4622 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 4623 /// f32 and even f64 comparisons to integer ones. 
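/// For example, "a == 0.0f" can be checked as "((bitcast a) & 0x7fffffff) == 0",
/// which also treats -0.0 as equal to +0.0; this is why one operand of the
/// compare must be a floating-point zero.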
4624 SDValue 4625 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 4626 SDValue Chain = Op.getOperand(0); 4627 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4628 SDValue LHS = Op.getOperand(2); 4629 SDValue RHS = Op.getOperand(3); 4630 SDValue Dest = Op.getOperand(4); 4631 SDLoc dl(Op); 4632 4633 bool LHSSeenZero = false; 4634 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 4635 bool RHSSeenZero = false; 4636 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 4637 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 4638 // If unsafe fp math optimization is enabled and there are no other uses of 4639 // the CMP operands, and the condition code is EQ or NE, we can optimize it 4640 // to an integer comparison. 4641 if (CC == ISD::SETOEQ) 4642 CC = ISD::SETEQ; 4643 else if (CC == ISD::SETUNE) 4644 CC = ISD::SETNE; 4645 4646 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4647 SDValue ARMcc; 4648 if (LHS.getValueType() == MVT::f32) { 4649 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4650 bitcastf32Toi32(LHS, DAG), Mask); 4651 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4652 bitcastf32Toi32(RHS, DAG), Mask); 4653 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4654 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4655 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4656 Chain, Dest, ARMcc, CCR, Cmp); 4657 } 4658 4659 SDValue LHS1, LHS2; 4660 SDValue RHS1, RHS2; 4661 expandf64Toi32(LHS, DAG, LHS1, LHS2); 4662 expandf64Toi32(RHS, DAG, RHS1, RHS2); 4663 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 4664 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 4665 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4666 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4667 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4668 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 4669 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 4670 } 4671 4672 return SDValue(); 4673 } 4674 4675 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 4676 SDValue Chain = Op.getOperand(0); 4677 SDValue Cond = Op.getOperand(1); 4678 SDValue Dest = Op.getOperand(2); 4679 SDLoc dl(Op); 4680 4681 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 4682 // instruction. 4683 unsigned Opc = Cond.getOpcode(); 4684 if (Cond.getResNo() == 1 && 4685 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4686 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { 4687 // Only lower legal XALUO ops. 4688 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4689 return SDValue(); 4690 4691 // The actual operation with overflow check. 4692 SDValue Value, OverflowCmp; 4693 SDValue ARMcc; 4694 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4695 4696 // Reverse the condition code. 
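    // getARMXALUOOp returns the condition under which the operation does
    // *not* overflow (e.g. EQ for SMULO above), while BRCOND must branch
    // when it does, hence the inversion.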
4697 ARMCC::CondCodes CondCode = 4698 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 4699 CondCode = ARMCC::getOppositeCondition(CondCode); 4700 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 4701 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4702 4703 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 4704 OverflowCmp); 4705 } 4706 4707 return SDValue(); 4708 } 4709 4710 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 4711 SDValue Chain = Op.getOperand(0); 4712 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4713 SDValue LHS = Op.getOperand(2); 4714 SDValue RHS = Op.getOperand(3); 4715 SDValue Dest = Op.getOperand(4); 4716 SDLoc dl(Op); 4717 4718 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4719 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4720 dl); 4721 4722 // If softenSetCCOperands only returned one value, we should compare it to 4723 // zero. 4724 if (!RHS.getNode()) { 4725 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4726 CC = ISD::SETNE; 4727 } 4728 } 4729 4730 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 4731 // instruction. 4732 unsigned Opc = LHS.getOpcode(); 4733 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 4734 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4735 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO) && 4736 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 4737 // Only lower legal XALUO ops. 4738 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 4739 return SDValue(); 4740 4741 // The actual operation with overflow check. 4742 SDValue Value, OverflowCmp; 4743 SDValue ARMcc; 4744 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 4745 4746 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 4747 // Reverse the condition code. 
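      // We get here for (ne overflow, 0) and (eq overflow, 1), i.e. when the
      // branch should be taken on overflow, so flip the no-overflow condition
      // computed by getARMXALUOOp.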
4748 ARMCC::CondCodes CondCode = 4749 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 4750 CondCode = ARMCC::getOppositeCondition(CondCode); 4751 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 4752 } 4753 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4754 4755 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 4756 OverflowCmp); 4757 } 4758 4759 if (LHS.getValueType() == MVT::i32) { 4760 SDValue ARMcc; 4761 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4762 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4763 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4764 Chain, Dest, ARMcc, CCR, Cmp); 4765 } 4766 4767 if (getTargetMachine().Options.UnsafeFPMath && 4768 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 4769 CC == ISD::SETNE || CC == ISD::SETUNE)) { 4770 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 4771 return Result; 4772 } 4773 4774 ARMCC::CondCodes CondCode, CondCode2; 4775 bool InvalidOnQNaN; 4776 FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); 4777 4778 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4779 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4780 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4781 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4782 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 4783 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4784 if (CondCode2 != ARMCC::AL) { 4785 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 4786 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 4787 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4788 } 4789 return Res; 4790 } 4791 4792 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 4793 SDValue Chain = Op.getOperand(0); 4794 SDValue Table = Op.getOperand(1); 4795 SDValue Index = Op.getOperand(2); 4796 SDLoc dl(Op); 4797 4798 EVT PTy = getPointerTy(DAG.getDataLayout()); 4799 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 4800 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 4801 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 4802 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 4803 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 4804 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 4805 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 4806 // which does another jump to the destination. This also makes it easier 4807 // to translate it to TBB / TBH later (Thumb2 only). 4808 // FIXME: This might not work if the function is extremely large. 
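    // This typically becomes TBB/TBH, whose table entries are roughly
    // (DestAddress - TableBase) / 2 stored as a byte or halfword, which is
    // why very large functions are a concern (see the FIXME above).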
4809 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 4810 Addr, Op.getOperand(2), JTI); 4811 } 4812 if (isPositionIndependent() || Subtarget->isROPI()) { 4813 Addr = 4814 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 4815 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4816 Chain = Addr.getValue(1); 4817 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 4818 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4819 } else { 4820 Addr = 4821 DAG.getLoad(PTy, dl, Chain, Addr, 4822 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4823 Chain = Addr.getValue(1); 4824 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4825 } 4826 } 4827 4828 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 4829 EVT VT = Op.getValueType(); 4830 SDLoc dl(Op); 4831 4832 if (Op.getValueType().getVectorElementType() == MVT::i32) { 4833 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 4834 return Op; 4835 return DAG.UnrollVectorOp(Op.getNode()); 4836 } 4837 4838 assert(Op.getOperand(0).getValueType() == MVT::v4f32 && 4839 "Invalid type for custom lowering!"); 4840 if (VT != MVT::v4i16) 4841 return DAG.UnrollVectorOp(Op.getNode()); 4842 4843 Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); 4844 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 4845 } 4846 4847 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 4848 EVT VT = Op.getValueType(); 4849 if (VT.isVector()) 4850 return LowerVectorFP_TO_INT(Op, DAG); 4851 if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { 4852 RTLIB::Libcall LC; 4853 if (Op.getOpcode() == ISD::FP_TO_SINT) 4854 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 4855 Op.getValueType()); 4856 else 4857 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 4858 Op.getValueType()); 4859 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4860 /*isSigned*/ false, SDLoc(Op)).first; 4861 } 4862 4863 return Op; 4864 } 4865 4866 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4867 EVT VT = Op.getValueType(); 4868 SDLoc dl(Op); 4869 4870 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 4871 if (VT.getVectorElementType() == MVT::f32) 4872 return Op; 4873 return DAG.UnrollVectorOp(Op.getNode()); 4874 } 4875 4876 assert(Op.getOperand(0).getValueType() == MVT::v4i16 && 4877 "Invalid type for custom lowering!"); 4878 if (VT != MVT::v4f32) 4879 return DAG.UnrollVectorOp(Op.getNode()); 4880 4881 unsigned CastOpc; 4882 unsigned Opc; 4883 switch (Op.getOpcode()) { 4884 default: llvm_unreachable("Invalid opcode!"); 4885 case ISD::SINT_TO_FP: 4886 CastOpc = ISD::SIGN_EXTEND; 4887 Opc = ISD::SINT_TO_FP; 4888 break; 4889 case ISD::UINT_TO_FP: 4890 CastOpc = ISD::ZERO_EXTEND; 4891 Opc = ISD::UINT_TO_FP; 4892 break; 4893 } 4894 4895 Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); 4896 return DAG.getNode(Opc, dl, VT, Op); 4897 } 4898 4899 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 4900 EVT VT = Op.getValueType(); 4901 if (VT.isVector()) 4902 return LowerVectorINT_TO_FP(Op, DAG); 4903 if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { 4904 RTLIB::Libcall LC; 4905 if (Op.getOpcode() == ISD::SINT_TO_FP) 4906 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 4907 Op.getValueType()); 4908 else 4909 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 4910 Op.getValueType()); 4911 return makeLibCall(DAG, 
LC, Op.getValueType(), Op.getOperand(0), 4912 /*isSigned*/ false, SDLoc(Op)).first; 4913 } 4914 4915 return Op; 4916 } 4917 4918 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 4919 // Implement fcopysign with a fabs and a conditional fneg. 4920 SDValue Tmp0 = Op.getOperand(0); 4921 SDValue Tmp1 = Op.getOperand(1); 4922 SDLoc dl(Op); 4923 EVT VT = Op.getValueType(); 4924 EVT SrcVT = Tmp1.getValueType(); 4925 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 4926 Tmp0.getOpcode() == ARMISD::VMOVDRR; 4927 bool UseNEON = !InGPR && Subtarget->hasNEON(); 4928 4929 if (UseNEON) { 4930 // Use VBSL to copy the sign bit. 4931 unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); 4932 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 4933 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 4934 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 4935 if (VT == MVT::f64) 4936 Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4937 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 4938 DAG.getConstant(32, dl, MVT::i32)); 4939 else /*if (VT == MVT::f32)*/ 4940 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 4941 if (SrcVT == MVT::f32) { 4942 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 4943 if (VT == MVT::f64) 4944 Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4945 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 4946 DAG.getConstant(32, dl, MVT::i32)); 4947 } else if (VT == MVT::f32) 4948 Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, 4949 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 4950 DAG.getConstant(32, dl, MVT::i32)); 4951 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 4952 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 4953 4954 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), 4955 dl, MVT::i32); 4956 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 4957 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 4958 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 4959 4960 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 4961 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 4962 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 4963 if (VT == MVT::f32) { 4964 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 4965 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 4966 DAG.getConstant(0, dl, MVT::i32)); 4967 } else { 4968 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 4969 } 4970 4971 return Res; 4972 } 4973 4974 // Bitcast operand 1 to i32. 4975 if (SrcVT == MVT::f64) 4976 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4977 Tmp1).getValue(1); 4978 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 4979 4980 // Or in the signbit with integer operations. 4981 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 4982 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4983 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 4984 if (VT == MVT::f32) { 4985 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 4986 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 4987 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 4988 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 4989 } 4990 4991 // f64: Or the high part with signbit and then combine two parts. 
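  // For example, copysign(1.0, -2.0): the high word of 1.0 is 0x3ff00000;
  // masking with 0x7fffffff and OR-ing in the 0x80000000 sign bit taken from
  // -2.0 gives 0xbff00000, i.e. -1.0 once the halves are recombined.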
4992 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 4993 Tmp0); 4994 SDValue Lo = Tmp0.getValue(0); 4995 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 4996 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 4997 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 4998 } 4999 5000 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5001 MachineFunction &MF = DAG.getMachineFunction(); 5002 MachineFrameInfo &MFI = MF.getFrameInfo(); 5003 MFI.setReturnAddressIsTaken(true); 5004 5005 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5006 return SDValue(); 5007 5008 EVT VT = Op.getValueType(); 5009 SDLoc dl(Op); 5010 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5011 if (Depth) { 5012 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5013 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5014 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5015 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5016 MachinePointerInfo()); 5017 } 5018 5019 // Return LR, which contains the return address. Mark it an implicit live-in. 5020 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5021 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5022 } 5023 5024 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5025 const ARMBaseRegisterInfo &ARI = 5026 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5027 MachineFunction &MF = DAG.getMachineFunction(); 5028 MachineFrameInfo &MFI = MF.getFrameInfo(); 5029 MFI.setFrameAddressIsTaken(true); 5030 5031 EVT VT = Op.getValueType(); 5032 SDLoc dl(Op); // FIXME probably not meaningful 5033 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5034 unsigned FrameReg = ARI.getFrameRegister(MF); 5035 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5036 while (Depth--) 5037 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5038 MachinePointerInfo()); 5039 return FrameAddr; 5040 } 5041 5042 // FIXME? Maybe this could be a TableGen attribute on some registers and 5043 // this table could be generated automatically from RegInfo. 5044 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 5045 SelectionDAG &DAG) const { 5046 unsigned Reg = StringSwitch<unsigned>(RegName) 5047 .Case("sp", ARM::SP) 5048 .Default(0); 5049 if (Reg) 5050 return Reg; 5051 report_fatal_error(Twine("Invalid register name \"" 5052 + StringRef(RegName) + "\".")); 5053 } 5054 5055 // Result is 64 bit value so split into two 32 bit values and return as a 5056 // pair of values. 5057 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5058 SelectionDAG &DAG) { 5059 SDLoc DL(N); 5060 5061 // This function is only supposed to be called for i64 type destination. 5062 assert(N->getValueType(0) == MVT::i64 5063 && "ExpandREAD_REGISTER called for non-i64 type result."); 5064 5065 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5066 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5067 N->getOperand(0), 5068 N->getOperand(1)); 5069 5070 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5071 Read.getValue(1))); 5072 Results.push_back(Read.getOperand(0)); 5073 } 5074 5075 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 
5076 /// When \p DstVT, the destination type of \p BC, is on the vector 5077 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5078 /// it might be possible to combine them, such that everything stays on the 5079 /// vector register bank. 5080 /// \p return The node that would replace \p BT, if the combine 5081 /// is possible. 5082 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5083 SelectionDAG &DAG) { 5084 SDValue Op = BC->getOperand(0); 5085 EVT DstVT = BC->getValueType(0); 5086 5087 // The only vector instruction that can produce a scalar (remember, 5088 // since the bitcast was about to be turned into VMOVDRR, the source 5089 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5090 // Moreover, we can do this combine only if there is one use. 5091 // Finally, if the destination type is not a vector, there is not 5092 // much point on forcing everything on the vector bank. 5093 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5094 !Op.hasOneUse()) 5095 return SDValue(); 5096 5097 // If the index is not constant, we will introduce an additional 5098 // multiply that will stick. 5099 // Give up in that case. 5100 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5101 if (!Index) 5102 return SDValue(); 5103 unsigned DstNumElt = DstVT.getVectorNumElements(); 5104 5105 // Compute the new index. 5106 const APInt &APIntIndex = Index->getAPIntValue(); 5107 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5108 NewIndex *= APIntIndex; 5109 // Check if the new constant index fits into i32. 5110 if (NewIndex.getBitWidth() > 32) 5111 return SDValue(); 5112 5113 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5114 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5115 SDLoc dl(Op); 5116 SDValue ExtractSrc = Op.getOperand(0); 5117 EVT VecVT = EVT::getVectorVT( 5118 *DAG.getContext(), DstVT.getScalarType(), 5119 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5120 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5121 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5122 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5123 } 5124 5125 /// ExpandBITCAST - If the target supports VFP, this function is called to 5126 /// expand a bit convert where either the source or destination type is i64 to 5127 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5128 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5129 /// vectors), since the legalizer won't know what to do with that. 5130 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5131 const ARMSubtarget *Subtarget) { 5132 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5133 SDLoc dl(N); 5134 SDValue Op = N->getOperand(0); 5135 5136 // This function is only supposed to be called for i64 types, either as the 5137 // source or destination of the bit convert. 
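  // (It is also reached for the f32->i32 and i16<->f16 bitcasts that arise
  // for half-precision values; those cases are handled first below.)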
5138 EVT SrcVT = Op.getValueType(); 5139 EVT DstVT = N->getValueType(0); 5140 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5141 5142 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5143 // FullFP16: half values are passed in S-registers, and we don't 5144 // need any of the bitcast and moves: 5145 // 5146 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5147 // t5: i32 = bitcast t2 5148 // t18: f16 = ARMISD::VMOVhr t5 5149 if (Op.getOpcode() != ISD::CopyFromReg || 5150 Op.getValueType() != MVT::f32) 5151 return SDValue(); 5152 5153 auto Move = N->use_begin(); 5154 if (Move->getOpcode() != ARMISD::VMOVhr) 5155 return SDValue(); 5156 5157 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5158 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5159 DAG.ReplaceAllUsesWith(*Move, &Copy); 5160 return Copy; 5161 } 5162 5163 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5164 if (!HasFullFP16) 5165 return SDValue(); 5166 // SoftFP: read half-precision arguments: 5167 // 5168 // t2: i32,ch = ... 5169 // t7: i16 = truncate t2 <~~~~ Op 5170 // t8: f16 = bitcast t7 <~~~~ N 5171 // 5172 if (Op.getOperand(0).getValueType() == MVT::i32) 5173 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5174 MVT::f16, Op.getOperand(0)); 5175 5176 return SDValue(); 5177 } 5178 5179 // Half-precision return values 5180 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5181 if (!HasFullFP16) 5182 return SDValue(); 5183 // 5184 // t11: f16 = fadd t8, t10 5185 // t12: i16 = bitcast t11 <~~~ SDNode N 5186 // t13: i32 = zero_extend t12 5187 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5188 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5189 // 5190 // transform this into: 5191 // 5192 // t20: i32 = ARMISD::VMOVrh t11 5193 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5194 // 5195 auto ZeroExtend = N->use_begin(); 5196 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5197 ZeroExtend->getValueType(0) != MVT::i32) 5198 return SDValue(); 5199 5200 auto Copy = ZeroExtend->use_begin(); 5201 if (Copy->getOpcode() == ISD::CopyToReg && 5202 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5203 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5204 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5205 return Cvt; 5206 } 5207 return SDValue(); 5208 } 5209 5210 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5211 return SDValue(); 5212 5213 // Turn i64->f64 into VMOVDRR. 5214 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5215 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5216 // if we can combine the bitcast with its source. 5217 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5218 return Val; 5219 5220 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5221 DAG.getConstant(0, dl, MVT::i32)); 5222 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5223 DAG.getConstant(1, dl, MVT::i32)); 5224 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5225 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5226 } 5227 5228 // Turn f64->i64 into VMOVRRD. 
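  // For example, (i64 (bitcast (f64 X))) becomes
  // (build_pair (VMOVRRD X):0, (VMOVRRD X):1), with a VREV64 applied first
  // for big-endian multi-element vector sources.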
5229 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5230 SDValue Cvt; 5231 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5232 SrcVT.getVectorNumElements() > 1) 5233 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5234 DAG.getVTList(MVT::i32, MVT::i32), 5235 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5236 else 5237 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5238 DAG.getVTList(MVT::i32, MVT::i32), Op); 5239 // Merge the pieces into a single i64 value. 5240 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5241 } 5242 5243 return SDValue(); 5244 } 5245 5246 /// getZeroVector - Returns a vector of specified type with all zero elements. 5247 /// Zero vectors are used to represent vector negation and in those cases 5248 /// will be implemented with the NEON VNEG instruction. However, VNEG does 5249 /// not support i64 elements, so sometimes the zero vectors will need to be 5250 /// explicitly constructed. Regardless, use a canonical VMOV to create the 5251 /// zero vector. 5252 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5253 assert(VT.isVector() && "Expected a vector type"); 5254 // The canonical modified immediate encoding of a zero vector is....0! 5255 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5256 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 5257 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 5258 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5259 } 5260 5261 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5262 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5263 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 5264 SelectionDAG &DAG) const { 5265 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5266 EVT VT = Op.getValueType(); 5267 unsigned VTBits = VT.getSizeInBits(); 5268 SDLoc dl(Op); 5269 SDValue ShOpLo = Op.getOperand(0); 5270 SDValue ShOpHi = Op.getOperand(1); 5271 SDValue ShAmt = Op.getOperand(2); 5272 SDValue ARMcc; 5273 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5274 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5275 5276 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5277 5278 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5279 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5280 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5281 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5282 DAG.getConstant(VTBits, dl, MVT::i32)); 5283 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5284 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5285 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5286 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5287 ISD::SETGE, ARMcc, DAG, dl); 5288 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 5289 ARMcc, CCR, CmpLo); 5290 5291 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5292 SDValue HiBigShift = Opc == ISD::SRA 5293 ? 
DAG.getNode(Opc, dl, VT, ShOpHi, 5294 DAG.getConstant(VTBits - 1, dl, VT)) 5295 : DAG.getConstant(0, dl, VT); 5296 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5297 ISD::SETGE, ARMcc, DAG, dl); 5298 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5299 ARMcc, CCR, CmpHi); 5300 5301 SDValue Ops[2] = { Lo, Hi }; 5302 return DAG.getMergeValues(Ops, dl); 5303 } 5304 5305 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5306 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5307 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 5308 SelectionDAG &DAG) const { 5309 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5310 EVT VT = Op.getValueType(); 5311 unsigned VTBits = VT.getSizeInBits(); 5312 SDLoc dl(Op); 5313 SDValue ShOpLo = Op.getOperand(0); 5314 SDValue ShOpHi = Op.getOperand(1); 5315 SDValue ShAmt = Op.getOperand(2); 5316 SDValue ARMcc; 5317 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5318 5319 assert(Op.getOpcode() == ISD::SHL_PARTS); 5320 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5321 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5322 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5323 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 5324 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5325 5326 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5327 DAG.getConstant(VTBits, dl, MVT::i32)); 5328 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 5329 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5330 ISD::SETGE, ARMcc, DAG, dl); 5331 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5332 ARMcc, CCR, CmpHi); 5333 5334 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5335 ISD::SETGE, ARMcc, DAG, dl); 5336 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5337 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 5338 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 5339 5340 SDValue Ops[2] = { Lo, Hi }; 5341 return DAG.getMergeValues(Ops, dl); 5342 } 5343 5344 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 5345 SelectionDAG &DAG) const { 5346 // The rounding mode is in bits 23:22 of the FPSCR. 5347 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 5348 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 5349 // so that the shift + and get folded into a bitfield extract. 
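  // That is, ((FPSCR + (1 << 22)) >> 22) & 3: adding 1 at bit 22 increments
  // the two-bit RMode field modulo 4, which yields exactly the 0->1, 1->2,
  // 2->3, 3->0 mapping described above.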
5350 SDLoc dl(Op); 5351 SDValue Ops[] = { DAG.getEntryNode(), 5352 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5353 5354 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5355 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5356 DAG.getConstant(1U << 22, dl, MVT::i32)); 5357 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5358 DAG.getConstant(22, dl, MVT::i32)); 5359 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5360 DAG.getConstant(3, dl, MVT::i32)); 5361 } 5362 5363 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5364 const ARMSubtarget *ST) { 5365 SDLoc dl(N); 5366 EVT VT = N->getValueType(0); 5367 if (VT.isVector()) { 5368 assert(ST->hasNEON()); 5369 5370 // Compute the least significant set bit: LSB = X & -X 5371 SDValue X = N->getOperand(0); 5372 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5373 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5374 5375 EVT ElemTy = VT.getVectorElementType(); 5376 5377 if (ElemTy == MVT::i8) { 5378 // Compute with: cttz(x) = ctpop(lsb - 1) 5379 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5380 DAG.getTargetConstant(1, dl, ElemTy)); 5381 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5382 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5383 } 5384 5385 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5386 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5387 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5388 unsigned NumBits = ElemTy.getSizeInBits(); 5389 SDValue WidthMinus1 = 5390 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5391 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5392 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5393 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5394 } 5395 5396 // Compute with: cttz(x) = ctpop(lsb - 1) 5397 5398 // Since we can only compute the number of bits in a byte with vcnt.8, we 5399 // have to gather the result with pairwise addition (vpaddl) for i16, i32, 5400 // and i64. 5401 5402 // Compute LSB - 1. 5403 SDValue Bits; 5404 if (ElemTy == MVT::i64) { 5405 // Load constant 0xffff'ffff'ffff'ffff to register. 5406 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5407 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5408 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5409 } else { 5410 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5411 DAG.getTargetConstant(1, dl, ElemTy)); 5412 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5413 } 5414 5415 // Count #bits with vcnt.8. 5416 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 5417 SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); 5418 SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); 5419 5420 // Gather the #bits with vpaddl (pairwise add.) 5421 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 5422 SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, 5423 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 5424 Cnt8); 5425 if (ElemTy == MVT::i16) 5426 return Cnt16; 5427 5428 EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; 5429 SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, 5430 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 5431 Cnt16); 5432 if (ElemTy == MVT::i32) 5433 return Cnt32; 5434 5435 assert(ElemTy == MVT::i64); 5436 SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5437 DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), 5438 Cnt32); 5439 return Cnt64; 5440 } 5441 5442 if (!ST->hasV6T2Ops()) 5443 return SDValue(); 5444 5445 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5446 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 5447 } 5448 5449 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count 5450 /// for each 16-bit element from operand, repeated. The basic idea is to 5451 /// leverage vcnt to get the 8-bit counts, gather and add the results. 5452 /// 5453 /// Trace for v4i16: 5454 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 5455 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) 5456 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) 5457 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] 5458 /// [b0 b1 b2 b3 b4 b5 b6 b7] 5459 /// +[b1 b0 b3 b2 b5 b4 b7 b6] 5460 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, 5461 /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) 5462 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { 5463 EVT VT = N->getValueType(0); 5464 SDLoc DL(N); 5465 5466 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 5467 SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); 5468 SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); 5469 SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); 5470 SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); 5471 return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); 5472 } 5473 5474 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the 5475 /// bit-count for each 16-bit element from the operand. We need slightly 5476 /// different sequencing for v4i16 and v8i16 to stay within NEON's available 5477 /// 64/128-bit registers. 5478 /// 5479 /// Trace for v4i16: 5480 /// input = [v0 v1 v2 v3 ] (vi 16-bit element) 5481 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) 5482 /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] 5483 /// v4i16:Extracted = [k0 k1 k2 k3 ] 5484 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { 5485 EVT VT = N->getValueType(0); 5486 SDLoc DL(N); 5487 5488 SDValue BitCounts = getCTPOP16BitCounts(N, DAG); 5489 if (VT.is64BitVector()) { 5490 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); 5491 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, 5492 DAG.getIntPtrConstant(0, DL)); 5493 } else { 5494 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, 5495 BitCounts, DAG.getIntPtrConstant(0, DL)); 5496 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); 5497 } 5498 } 5499 5500 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the 5501 /// bit-count for each 32-bit element from the operand. The idea here is 5502 /// to split the vector into 16-bit elements, leverage the 16-bit count 5503 /// routine, and then combine the results. 
5504 /// 5505 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): 5506 /// input = [v0 v1 ] (vi: 32-bit elements) 5507 /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) 5508 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) 5509 /// vrev: N0 = [k1 k0 k3 k2 ] 5510 /// [k0 k1 k2 k3 ] 5511 /// N1 =+[k1 k0 k3 k2 ] 5512 /// [k0 k2 k1 k3 ] 5513 /// N2 =+[k1 k3 k0 k2 ] 5514 /// [k0 k2 k1 k3 ] 5515 /// Extended =+[k1 k3 k0 k2 ] 5516 /// [k0 k2 ] 5517 /// Extracted=+[k1 k3 ] 5518 /// 5519 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { 5520 EVT VT = N->getValueType(0); 5521 SDLoc DL(N); 5522 5523 EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; 5524 5525 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); 5526 SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); 5527 SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); 5528 SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); 5529 SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); 5530 5531 if (VT.is64BitVector()) { 5532 SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); 5533 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, 5534 DAG.getIntPtrConstant(0, DL)); 5535 } else { 5536 SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, 5537 DAG.getIntPtrConstant(0, DL)); 5538 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); 5539 } 5540 } 5541 5542 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 5543 const ARMSubtarget *ST) { 5544 EVT VT = N->getValueType(0); 5545 5546 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 5547 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || 5548 VT == MVT::v4i16 || VT == MVT::v8i16) && 5549 "Unexpected type for custom ctpop lowering"); 5550 5551 if (VT.getVectorElementType() == MVT::i32) 5552 return lowerCTPOP32BitElements(N, DAG); 5553 else 5554 return lowerCTPOP16BitElements(N, DAG); 5555 } 5556 5557 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 5558 const ARMSubtarget *ST) { 5559 EVT VT = N->getValueType(0); 5560 SDLoc dl(N); 5561 5562 if (!VT.isVector()) 5563 return SDValue(); 5564 5565 // Lower vector shifts on NEON to use VSHL. 5566 assert(ST->hasNEON() && "unexpected vector shift"); 5567 5568 // Left shifts translate directly to the vshiftu intrinsic. 5569 if (N->getOpcode() == ISD::SHL) 5570 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5571 DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, 5572 MVT::i32), 5573 N->getOperand(0), N->getOperand(1)); 5574 5575 assert((N->getOpcode() == ISD::SRA || 5576 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); 5577 5578 // NEON uses the same intrinsics for both left and right shifts. For 5579 // right shifts, the shift amounts are negative, so negate the vector of 5580 // shift amounts. 5581 EVT ShiftVT = N->getOperand(1).getValueType(); 5582 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, 5583 getZeroVector(ShiftVT, DAG, dl), 5584 N->getOperand(1)); 5585 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? 
5586 Intrinsic::arm_neon_vshifts : 5587 Intrinsic::arm_neon_vshiftu); 5588 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5589 DAG.getConstant(vshiftInt, dl, MVT::i32), 5590 N->getOperand(0), NegatedCount); 5591 } 5592 5593 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 5594 const ARMSubtarget *ST) { 5595 EVT VT = N->getValueType(0); 5596 SDLoc dl(N); 5597 5598 // We can get here for a node like i32 = ISD::SHL i32, i64 5599 if (VT != MVT::i64) 5600 return SDValue(); 5601 5602 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 5603 "Unknown shift to lower!"); 5604 5605 // We only lower SRA, SRL of 1 here, all others use generic lowering. 5606 if (!isOneConstant(N->getOperand(1))) 5607 return SDValue(); 5608 5609 // If we are in thumb mode, we don't have RRX. 5610 if (ST->isThumb1Only()) return SDValue(); 5611 5612 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 5613 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5614 DAG.getConstant(0, dl, MVT::i32)); 5615 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5616 DAG.getConstant(1, dl, MVT::i32)); 5617 5618 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 5619 // captures the result into a carry flag. 5620 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 5621 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 5622 5623 // The low part is an ARMISD::RRX operand, which shifts the carry in. 5624 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 5625 5626 // Merge the pieces into a single i64 value. 5627 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5628 } 5629 5630 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5631 SDValue TmpOp0, TmpOp1; 5632 bool Invert = false; 5633 bool Swap = false; 5634 unsigned Opc = 0; 5635 5636 SDValue Op0 = Op.getOperand(0); 5637 SDValue Op1 = Op.getOperand(1); 5638 SDValue CC = Op.getOperand(2); 5639 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 5640 EVT VT = Op.getValueType(); 5641 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5642 SDLoc dl(Op); 5643 5644 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 5645 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 5646 // Special-case integer 64-bit equality comparisons. They aren't legal, 5647 // but they can be lowered with a few vector instructions. 5648 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 5649 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 5650 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 5651 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 5652 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 5653 DAG.getCondCode(ISD::SETEQ)); 5654 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 5655 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 5656 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 5657 if (SetCCOpcode == ISD::SETNE) 5658 Merged = DAG.getNOT(dl, Merged, CmpVT); 5659 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 5660 return Merged; 5661 } 5662 5663 if (CmpVT.getVectorElementType() == MVT::i64) 5664 // 64-bit comparisons are not legal in general. 
5665 return SDValue(); 5666 5667 if (Op1.getValueType().isFloatingPoint()) { 5668 switch (SetCCOpcode) { 5669 default: llvm_unreachable("Illegal FP comparison"); 5670 case ISD::SETUNE: 5671 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5672 case ISD::SETOEQ: 5673 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5674 case ISD::SETOLT: 5675 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5676 case ISD::SETOGT: 5677 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5678 case ISD::SETOLE: 5679 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5680 case ISD::SETOGE: 5681 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5682 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 5683 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 5684 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 5685 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 5686 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 5687 case ISD::SETONE: 5688 // Expand this to (OLT | OGT). 5689 TmpOp0 = Op0; 5690 TmpOp1 = Op1; 5691 Opc = ISD::OR; 5692 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5693 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 5694 break; 5695 case ISD::SETUO: 5696 Invert = true; 5697 LLVM_FALLTHROUGH; 5698 case ISD::SETO: 5699 // Expand this to (OLT | OGE). 5700 TmpOp0 = Op0; 5701 TmpOp1 = Op1; 5702 Opc = ISD::OR; 5703 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5704 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 5705 break; 5706 } 5707 } else { 5708 // Integer comparisons. 5709 switch (SetCCOpcode) { 5710 default: llvm_unreachable("Illegal integer comparison"); 5711 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5712 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5713 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5714 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5715 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5716 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5717 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 5718 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 5719 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 5720 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 5721 } 5722 5723 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 5724 if (Opc == ARMISD::VCEQ) { 5725 SDValue AndOp; 5726 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5727 AndOp = Op0; 5728 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 5729 AndOp = Op1; 5730 5731 // Ignore bitconvert. 5732 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 5733 AndOp = AndOp.getOperand(0); 5734 5735 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 5736 Opc = ARMISD::VTST; 5737 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 5738 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 5739 Invert = !Invert; 5740 } 5741 } 5742 } 5743 5744 if (Swap) 5745 std::swap(Op0, Op1); 5746 5747 // If one of the operands is a constant vector zero, attempt to fold the 5748 // comparison to a specialized compare-against-zero form. 
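// (Illustrative example: for a v4i32 signed "x >= 0" compare, i.e. setcc sge %x, zeroinitializer, this folds to a single ARMISD::VCGEZ node, which selects to "vcge.s32 qd, qm, #0" instead of materializing a zero vector for the two-register form.)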
5749 SDValue SingleOp; 5750 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5751 SingleOp = Op0; 5752 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 5753 if (Opc == ARMISD::VCGE) 5754 Opc = ARMISD::VCLEZ; 5755 else if (Opc == ARMISD::VCGT) 5756 Opc = ARMISD::VCLTZ; 5757 SingleOp = Op1; 5758 } 5759 5760 SDValue Result; 5761 if (SingleOp.getNode()) { 5762 switch (Opc) { 5763 case ARMISD::VCEQ: 5764 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 5765 case ARMISD::VCGE: 5766 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 5767 case ARMISD::VCLEZ: 5768 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 5769 case ARMISD::VCGT: 5770 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 5771 case ARMISD::VCLTZ: 5772 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 5773 default: 5774 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5775 } 5776 } else { 5777 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5778 } 5779 5780 Result = DAG.getSExtOrTrunc(Result, dl, VT); 5781 5782 if (Invert) 5783 Result = DAG.getNOT(dl, Result, VT); 5784 5785 return Result; 5786 } 5787 5788 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) { 5789 SDValue LHS = Op.getOperand(0); 5790 SDValue RHS = Op.getOperand(1); 5791 SDValue Carry = Op.getOperand(2); 5792 SDValue Cond = Op.getOperand(3); 5793 SDLoc DL(Op); 5794 5795 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); 5796 5797 assert(Carry.getOpcode() != ISD::CARRY_FALSE); 5798 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 5799 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 5800 5801 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 5802 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 5803 SDValue ARMcc = DAG.getConstant( 5804 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 5805 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5806 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 5807 Cmp.getValue(1), SDValue()); 5808 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 5809 CCR, Chain.getValue(1)); 5810 } 5811 5812 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 5813 /// valid vector constant for a NEON instruction with a "modified immediate" 5814 /// operand (e.g., VMOV). If so, return the encoded value. 5815 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 5816 unsigned SplatBitSize, SelectionDAG &DAG, 5817 const SDLoc &dl, EVT &VT, bool is128Bits, 5818 NEONModImmType type) { 5819 unsigned OpCmode, Imm; 5820 5821 // SplatBitSize is set to the smallest size that splats the vector, so a 5822 // zero vector will always have SplatBitSize == 8. However, NEON modified 5823 // immediate instructions others than VMOV do not support the 8-bit encoding 5824 // of a zero vector, and the default encoding of zero is supposed to be the 5825 // 32-bit version. 5826 if (SplatBits == 0) 5827 SplatBitSize = 32; 5828 5829 switch (SplatBitSize) { 5830 case 8: 5831 if (type != VMOVModImm) 5832 return SDValue(); 5833 // Any 1-byte value is OK. Op=0, Cmode=1110. 5834 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 5835 OpCmode = 0xe; 5836 Imm = SplatBits; 5837 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 5838 break; 5839 5840 case 16: 5841 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 5842 VT = is128Bits ? 
MVT::v8i16 : MVT::v4i16; 5843 if ((SplatBits & ~0xff) == 0) { 5844 // Value = 0x00nn: Op=x, Cmode=100x. 5845 OpCmode = 0x8; 5846 Imm = SplatBits; 5847 break; 5848 } 5849 if ((SplatBits & ~0xff00) == 0) { 5850 // Value = 0xnn00: Op=x, Cmode=101x. 5851 OpCmode = 0xa; 5852 Imm = SplatBits >> 8; 5853 break; 5854 } 5855 return SDValue(); 5856 5857 case 32: 5858 // NEON's 32-bit VMOV supports splat values where: 5859 // * only one byte is nonzero, or 5860 // * the least significant byte is 0xff and the second byte is nonzero, or 5861 // * the least significant 2 bytes are 0xff and the third is nonzero. 5862 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 5863 if ((SplatBits & ~0xff) == 0) { 5864 // Value = 0x000000nn: Op=x, Cmode=000x. 5865 OpCmode = 0; 5866 Imm = SplatBits; 5867 break; 5868 } 5869 if ((SplatBits & ~0xff00) == 0) { 5870 // Value = 0x0000nn00: Op=x, Cmode=001x. 5871 OpCmode = 0x2; 5872 Imm = SplatBits >> 8; 5873 break; 5874 } 5875 if ((SplatBits & ~0xff0000) == 0) { 5876 // Value = 0x00nn0000: Op=x, Cmode=010x. 5877 OpCmode = 0x4; 5878 Imm = SplatBits >> 16; 5879 break; 5880 } 5881 if ((SplatBits & ~0xff000000) == 0) { 5882 // Value = 0xnn000000: Op=x, Cmode=011x. 5883 OpCmode = 0x6; 5884 Imm = SplatBits >> 24; 5885 break; 5886 } 5887 5888 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 5889 if (type == OtherModImm) return SDValue(); 5890 5891 if ((SplatBits & ~0xffff) == 0 && 5892 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 5893 // Value = 0x0000nnff: Op=x, Cmode=1100. 5894 OpCmode = 0xc; 5895 Imm = SplatBits >> 8; 5896 break; 5897 } 5898 5899 if ((SplatBits & ~0xffffff) == 0 && 5900 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 5901 // Value = 0x00nnffff: Op=x, Cmode=1101. 5902 OpCmode = 0xd; 5903 Imm = SplatBits >> 16; 5904 break; 5905 } 5906 5907 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 5908 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 5909 // VMOV.I32. A (very) minor optimization would be to replicate the value 5910 // and fall through here to test for a valid 64-bit splat. But, then the 5911 // caller would also need to check and handle the change in size. 5912 return SDValue(); 5913 5914 case 64: { 5915 if (type != VMOVModImm) 5916 return SDValue(); 5917 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 5918 uint64_t BitMask = 0xff; 5919 uint64_t Val = 0; 5920 unsigned ImmMask = 1; 5921 Imm = 0; 5922 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 5923 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 5924 Val |= BitMask; 5925 Imm |= ImmMask; 5926 } else if ((SplatBits & BitMask) != 0) { 5927 return SDValue(); 5928 } 5929 BitMask <<= 8; 5930 ImmMask <<= 1; 5931 } 5932 5933 if (DAG.getDataLayout().isBigEndian()) 5934 // swap higher and lower 32 bit word 5935 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 5936 5937 // Op=1, Cmode=1110. 5938 OpCmode = 0x1e; 5939 VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; 5940 break; 5941 } 5942 5943 default: 5944 llvm_unreachable("unexpected size for isNEONModifiedImm"); 5945 } 5946 5947 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 5948 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 5949 } 5950 5951 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 5952 const ARMSubtarget *ST) const { 5953 EVT VT = Op.getValueType(); 5954 bool IsDouble = (VT == MVT::f64); 5955 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 5956 const APFloat &FPVal = CFP->getValueAPF(); 5957 5958 // Prevent floating-point constants from using literal loads 5959 // when execute-only is enabled. 5960 if (ST->genExecuteOnly()) { 5961 // If we can represent the constant as an immediate, don't lower it 5962 if (isFPImmLegal(FPVal, VT)) 5963 return Op; 5964 // Otherwise, construct as integer, and move to float register 5965 APInt INTVal = FPVal.bitcastToAPInt(); 5966 SDLoc DL(CFP); 5967 switch (VT.getSimpleVT().SimpleTy) { 5968 default: 5969 llvm_unreachable("Unknown floating point type!"); 5970 break; 5971 case MVT::f64: { 5972 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 5973 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 5974 if (!ST->isLittle()) 5975 std::swap(Lo, Hi); 5976 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 5977 } 5978 case MVT::f32: 5979 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 5980 DAG.getConstant(INTVal, DL, MVT::i32)); 5981 } 5982 } 5983 5984 if (!ST->hasVFP3()) 5985 return SDValue(); 5986 5987 // Use the default (constant pool) lowering for double constants when we have 5988 // an SP-only FPU 5989 if (IsDouble && Subtarget->isFPOnlySP()) 5990 return SDValue(); 5991 5992 // Try splatting with a VMOV.f32... 5993 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 5994 5995 if (ImmVal != -1) { 5996 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 5997 // We have code in place to select a valid ConstantFP already, no need to 5998 // do any mangling. 5999 return Op; 6000 } 6001 6002 // It's a float and we are trying to use NEON operations where 6003 // possible. Lower it to a splat followed by an extract. 6004 SDLoc DL(Op); 6005 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6006 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6007 NewVal); 6008 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6009 DAG.getConstant(0, DL, MVT::i32)); 6010 } 6011 6012 // The rest of our options are NEON only, make sure that's allowed before 6013 // proceeding.. 6014 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 6015 return SDValue(); 6016 6017 EVT VMovVT; 6018 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 6019 6020 // It wouldn't really be worth bothering for doubles except for one very 6021 // important value, which does happen to match: 0.0. So make sure we don't do 6022 // anything stupid. 6023 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 6024 return SDValue(); 6025 6026 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 6027 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 6028 VMovVT, false, VMOVModImm); 6029 if (NewVal != SDValue()) { 6030 SDLoc DL(Op); 6031 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 6032 NewVal); 6033 if (IsDouble) 6034 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6035 6036 // It's a float: cast and extract a vector element. 
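// (Illustrative example, assuming NEON is used for single-precision FP: +0.0f has no VMOV.F32 encoding, so it is built as a v2i32 VMOVIMM of zero, bitcast to v2f32 below, and the f32 result is read back from lane 0.)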
6037 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6038 VecConstant); 6039 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6040 DAG.getConstant(0, DL, MVT::i32)); 6041 } 6042 6043 // Finally, try a VMVN.i32 6044 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 6045 false, VMVNModImm); 6046 if (NewVal != SDValue()) { 6047 SDLoc DL(Op); 6048 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 6049 6050 if (IsDouble) 6051 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6052 6053 // It's a float: cast and extract a vector element. 6054 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6055 VecConstant); 6056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6057 DAG.getConstant(0, DL, MVT::i32)); 6058 } 6059 6060 return SDValue(); 6061 } 6062 6063 // check if an VEXT instruction can handle the shuffle mask when the 6064 // vector sources of the shuffle are the same. 6065 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6066 unsigned NumElts = VT.getVectorNumElements(); 6067 6068 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6069 if (M[0] < 0) 6070 return false; 6071 6072 Imm = M[0]; 6073 6074 // If this is a VEXT shuffle, the immediate value is the index of the first 6075 // element. The other shuffle indices must be the successive elements after 6076 // the first one. 6077 unsigned ExpectedElt = Imm; 6078 for (unsigned i = 1; i < NumElts; ++i) { 6079 // Increment the expected index. If it wraps around, just follow it 6080 // back to index zero and keep going. 6081 ++ExpectedElt; 6082 if (ExpectedElt == NumElts) 6083 ExpectedElt = 0; 6084 6085 if (M[i] < 0) continue; // ignore UNDEF indices 6086 if (ExpectedElt != static_cast<unsigned>(M[i])) 6087 return false; 6088 } 6089 6090 return true; 6091 } 6092 6093 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6094 bool &ReverseVEXT, unsigned &Imm) { 6095 unsigned NumElts = VT.getVectorNumElements(); 6096 ReverseVEXT = false; 6097 6098 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6099 if (M[0] < 0) 6100 return false; 6101 6102 Imm = M[0]; 6103 6104 // If this is a VEXT shuffle, the immediate value is the index of the first 6105 // element. The other shuffle indices must be the successive elements after 6106 // the first one. 6107 unsigned ExpectedElt = Imm; 6108 for (unsigned i = 1; i < NumElts; ++i) { 6109 // Increment the expected index. If it wraps around, it may still be 6110 // a VEXT but the source vectors must be swapped. 6111 ExpectedElt += 1; 6112 if (ExpectedElt == NumElts * 2) { 6113 ExpectedElt = 0; 6114 ReverseVEXT = true; 6115 } 6116 6117 if (M[i] < 0) continue; // ignore UNDEF indices 6118 if (ExpectedElt != static_cast<unsigned>(M[i])) 6119 return false; 6120 } 6121 6122 // Adjust the index value if the source operands will be swapped. 6123 if (ReverseVEXT) 6124 Imm -= NumElts; 6125 6126 return true; 6127 } 6128 6129 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 6130 /// instruction with the specified blocksize. (The order of the elements 6131 /// within each block of the vector is reversed.) 
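/// For example (illustrative): on a v8i16 shuffle, a VREV32 corresponds to the mask <1, 0, 3, 2, 5, 4, 7, 6>, i.e. the two 16-bit elements within each 32-bit block are swapped.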
6132 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6133 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6134 "Only possible block sizes for VREV are: 16, 32, 64"); 6135 6136 unsigned EltSz = VT.getScalarSizeInBits(); 6137 if (EltSz == 64) 6138 return false; 6139 6140 unsigned NumElts = VT.getVectorNumElements(); 6141 unsigned BlockElts = M[0] + 1; 6142 // If the first shuffle index is UNDEF, be optimistic. 6143 if (M[0] < 0) 6144 BlockElts = BlockSize / EltSz; 6145 6146 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6147 return false; 6148 6149 for (unsigned i = 0; i < NumElts; ++i) { 6150 if (M[i] < 0) continue; // ignore UNDEF indices 6151 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6152 return false; 6153 } 6154 6155 return true; 6156 } 6157 6158 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6159 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6160 // range, then 0 is placed into the resulting vector. So pretty much any mask 6161 // of 8 elements can work here. 6162 return VT == MVT::v8i8 && M.size() == 8; 6163 } 6164 6165 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6166 unsigned Index) { 6167 if (Mask.size() == Elements * 2) 6168 return Index / Elements; 6169 return Mask[Index] == 0 ? 0 : 1; 6170 } 6171 6172 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6173 // checking that pairs of elements in the shuffle mask represent the same index 6174 // in each vector, incrementing the expected index by 2 at each step. 6175 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6176 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6177 // v2={e,f,g,h} 6178 // WhichResult gives the offset for each element in the mask based on which 6179 // of the two results it belongs to. 6180 // 6181 // The transpose can be represented either as: 6182 // result1 = shufflevector v1, v2, result1_shuffle_mask 6183 // result2 = shufflevector v1, v2, result2_shuffle_mask 6184 // where v1/v2 and the shuffle masks have the same number of elements 6185 // (here WhichResult (see below) indicates which result is being checked) 6186 // 6187 // or as: 6188 // results = shufflevector v1, v2, shuffle_mask 6189 // where both results are returned in one vector and the shuffle mask has twice 6190 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6191 // want to check the low half and high half of the shuffle mask as if it were 6192 // the other case 6193 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6194 unsigned EltSz = VT.getScalarSizeInBits(); 6195 if (EltSz == 64) 6196 return false; 6197 6198 unsigned NumElts = VT.getVectorNumElements(); 6199 if (M.size() != NumElts && M.size() != NumElts*2) 6200 return false; 6201 6202 // If the mask is twice as long as the input vector then we need to check the 6203 // upper and lower parts of the mask with a matching value for WhichResult 6204 // FIXME: A mask with only even values will be rejected in case the first 6205 // element is undefined, e.g. 
[-1, 4, 2, 6] will be rejected, because only 6206 // M[0] is used to determine WhichResult 6207 for (unsigned i = 0; i < M.size(); i += NumElts) { 6208 WhichResult = SelectPairHalf(NumElts, M, i); 6209 for (unsigned j = 0; j < NumElts; j += 2) { 6210 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6211 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 6212 return false; 6213 } 6214 } 6215 6216 if (M.size() == NumElts*2) 6217 WhichResult = 0; 6218 6219 return true; 6220 } 6221 6222 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 6223 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6224 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 6225 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6226 unsigned EltSz = VT.getScalarSizeInBits(); 6227 if (EltSz == 64) 6228 return false; 6229 6230 unsigned NumElts = VT.getVectorNumElements(); 6231 if (M.size() != NumElts && M.size() != NumElts*2) 6232 return false; 6233 6234 for (unsigned i = 0; i < M.size(); i += NumElts) { 6235 WhichResult = SelectPairHalf(NumElts, M, i); 6236 for (unsigned j = 0; j < NumElts; j += 2) { 6237 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 6238 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 6239 return false; 6240 } 6241 } 6242 6243 if (M.size() == NumElts*2) 6244 WhichResult = 0; 6245 6246 return true; 6247 } 6248 6249 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 6250 // that the mask elements are either all even and in steps of size 2 or all odd 6251 // and in steps of size 2. 6252 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 6253 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 6254 // v2={e,f,g,h} 6255 // Requires similar checks to that of isVTRNMask with 6256 // respect the how results are returned. 6257 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6258 unsigned EltSz = VT.getScalarSizeInBits(); 6259 if (EltSz == 64) 6260 return false; 6261 6262 unsigned NumElts = VT.getVectorNumElements(); 6263 if (M.size() != NumElts && M.size() != NumElts*2) 6264 return false; 6265 6266 for (unsigned i = 0; i < M.size(); i += NumElts) { 6267 WhichResult = SelectPairHalf(NumElts, M, i); 6268 for (unsigned j = 0; j < NumElts; ++j) { 6269 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6270 return false; 6271 } 6272 } 6273 6274 if (M.size() == NumElts*2) 6275 WhichResult = 0; 6276 6277 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6278 if (VT.is64BitVector() && EltSz == 32) 6279 return false; 6280 6281 return true; 6282 } 6283 6284 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6285 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 
6286 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6287 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6288 unsigned EltSz = VT.getScalarSizeInBits(); 6289 if (EltSz == 64) 6290 return false; 6291 6292 unsigned NumElts = VT.getVectorNumElements(); 6293 if (M.size() != NumElts && M.size() != NumElts*2) 6294 return false; 6295 6296 unsigned Half = NumElts / 2; 6297 for (unsigned i = 0; i < M.size(); i += NumElts) { 6298 WhichResult = SelectPairHalf(NumElts, M, i); 6299 for (unsigned j = 0; j < NumElts; j += Half) { 6300 unsigned Idx = WhichResult; 6301 for (unsigned k = 0; k < Half; ++k) { 6302 int MIdx = M[i + j + k]; 6303 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6304 return false; 6305 Idx += 2; 6306 } 6307 } 6308 } 6309 6310 if (M.size() == NumElts*2) 6311 WhichResult = 0; 6312 6313 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6314 if (VT.is64BitVector() && EltSz == 32) 6315 return false; 6316 6317 return true; 6318 } 6319 6320 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6321 // that pairs of elements of the shufflemask represent the same index in each 6322 // vector incrementing sequentially through the vectors. 6323 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6324 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6325 // v2={e,f,g,h} 6326 // Requires similar checks to that of isVTRNMask with respect the how results 6327 // are returned. 6328 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6329 unsigned EltSz = VT.getScalarSizeInBits(); 6330 if (EltSz == 64) 6331 return false; 6332 6333 unsigned NumElts = VT.getVectorNumElements(); 6334 if (M.size() != NumElts && M.size() != NumElts*2) 6335 return false; 6336 6337 for (unsigned i = 0; i < M.size(); i += NumElts) { 6338 WhichResult = SelectPairHalf(NumElts, M, i); 6339 unsigned Idx = WhichResult * NumElts / 2; 6340 for (unsigned j = 0; j < NumElts; j += 2) { 6341 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6342 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 6343 return false; 6344 Idx += 1; 6345 } 6346 } 6347 6348 if (M.size() == NumElts*2) 6349 WhichResult = 0; 6350 6351 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6352 if (VT.is64BitVector() && EltSz == 32) 6353 return false; 6354 6355 return true; 6356 } 6357 6358 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 6359 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6360 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 6361 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6362 unsigned EltSz = VT.getScalarSizeInBits(); 6363 if (EltSz == 64) 6364 return false; 6365 6366 unsigned NumElts = VT.getVectorNumElements(); 6367 if (M.size() != NumElts && M.size() != NumElts*2) 6368 return false; 6369 6370 for (unsigned i = 0; i < M.size(); i += NumElts) { 6371 WhichResult = SelectPairHalf(NumElts, M, i); 6372 unsigned Idx = WhichResult * NumElts / 2; 6373 for (unsigned j = 0; j < NumElts; j += 2) { 6374 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6375 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 6376 return false; 6377 Idx += 1; 6378 } 6379 } 6380 6381 if (M.size() == NumElts*2) 6382 WhichResult = 0; 6383 6384 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
6385 if (VT.is64BitVector() && EltSz == 32) 6386 return false; 6387 6388 return true; 6389 } 6390 6391 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 6392 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 6393 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 6394 unsigned &WhichResult, 6395 bool &isV_UNDEF) { 6396 isV_UNDEF = false; 6397 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 6398 return ARMISD::VTRN; 6399 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 6400 return ARMISD::VUZP; 6401 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 6402 return ARMISD::VZIP; 6403 6404 isV_UNDEF = true; 6405 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6406 return ARMISD::VTRN; 6407 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6408 return ARMISD::VUZP; 6409 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 6410 return ARMISD::VZIP; 6411 6412 return 0; 6413 } 6414 6415 /// \return true if this is a reverse operation on an vector. 6416 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 6417 unsigned NumElts = VT.getVectorNumElements(); 6418 // Make sure the mask has the right size. 6419 if (NumElts != M.size()) 6420 return false; 6421 6422 // Look for <15, ..., 3, -1, 1, 0>. 6423 for (unsigned i = 0; i != NumElts; ++i) 6424 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 6425 return false; 6426 6427 return true; 6428 } 6429 6430 // If N is an integer constant that can be moved into a register in one 6431 // instruction, return an SDValue of such a constant (will become a MOV 6432 // instruction). Otherwise return null. 6433 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 6434 const ARMSubtarget *ST, const SDLoc &dl) { 6435 uint64_t Val; 6436 if (!isa<ConstantSDNode>(N)) 6437 return SDValue(); 6438 Val = cast<ConstantSDNode>(N)->getZExtValue(); 6439 6440 if (ST->isThumb1Only()) { 6441 if (Val <= 255 || ~Val <= 255) 6442 return DAG.getConstant(Val, dl, MVT::i32); 6443 } else { 6444 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 6445 return DAG.getConstant(Val, dl, MVT::i32); 6446 } 6447 return SDValue(); 6448 } 6449 6450 // If this is a case we can't handle, return null and let the default 6451 // expansion code take care of it. 6452 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 6453 const ARMSubtarget *ST) const { 6454 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 6455 SDLoc dl(Op); 6456 EVT VT = Op.getValueType(); 6457 6458 APInt SplatBits, SplatUndef; 6459 unsigned SplatBitSize; 6460 bool HasAnyUndefs; 6461 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 6462 if (SplatUndef.isAllOnesValue()) 6463 return DAG.getUNDEF(VT); 6464 6465 if (SplatBitSize <= 64) { 6466 // Check if an immediate VMOV works. 6467 EVT VmovVT; 6468 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 6469 SplatUndef.getZExtValue(), SplatBitSize, 6470 DAG, dl, VmovVT, VT.is128BitVector(), 6471 VMOVModImm); 6472 if (Val.getNode()) { 6473 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 6474 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6475 } 6476 6477 // Try an immediate VMVN. 
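// (Illustrative example: a splat of 0xffffff00 has no VMOV encoding, but its complement 0x000000ff does, so the splat is materialized as a VMVN.i32 of the complemented immediate instead.)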
6478 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 6479 Val = isNEONModifiedImm(NegatedImm, 6480 SplatUndef.getZExtValue(), SplatBitSize, 6481 DAG, dl, VmovVT, VT.is128BitVector(), 6482 VMVNModImm); 6483 if (Val.getNode()) { 6484 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 6485 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6486 } 6487 6488 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 6489 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 6490 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 6491 if (ImmVal != -1) { 6492 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 6493 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 6494 } 6495 } 6496 } 6497 } 6498 6499 // Scan through the operands to see if only one value is used. 6500 // 6501 // As an optimisation, even if more than one value is used it may be more 6502 // profitable to splat with one value then change some lanes. 6503 // 6504 // Heuristically we decide to do this if the vector has a "dominant" value, 6505 // defined as splatted to more than half of the lanes. 6506 unsigned NumElts = VT.getVectorNumElements(); 6507 bool isOnlyLowElement = true; 6508 bool usesOnlyOneValue = true; 6509 bool hasDominantValue = false; 6510 bool isConstant = true; 6511 6512 // Map of the number of times a particular SDValue appears in the 6513 // element list. 6514 DenseMap<SDValue, unsigned> ValueCounts; 6515 SDValue Value; 6516 for (unsigned i = 0; i < NumElts; ++i) { 6517 SDValue V = Op.getOperand(i); 6518 if (V.isUndef()) 6519 continue; 6520 if (i > 0) 6521 isOnlyLowElement = false; 6522 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6523 isConstant = false; 6524 6525 ValueCounts.insert(std::make_pair(V, 0)); 6526 unsigned &Count = ValueCounts[V]; 6527 6528 // Is this value dominant? (takes up more than half of the lanes) 6529 if (++Count > (NumElts / 2)) { 6530 hasDominantValue = true; 6531 Value = V; 6532 } 6533 } 6534 if (ValueCounts.size() != 1) 6535 usesOnlyOneValue = false; 6536 if (!Value.getNode() && !ValueCounts.empty()) 6537 Value = ValueCounts.begin()->first; 6538 6539 if (ValueCounts.empty()) 6540 return DAG.getUNDEF(VT); 6541 6542 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 6543 // Keep going if we are hitting this case. 6544 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 6545 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6546 6547 unsigned EltSize = VT.getScalarSizeInBits(); 6548 6549 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 6550 // i32 and try again. 6551 if (hasDominantValue && EltSize <= 32) { 6552 if (!isConstant) { 6553 SDValue N; 6554 6555 // If we are VDUPing a value that comes directly from a vector, that will 6556 // cause an unnecessary move to and from a GPR, where instead we could 6557 // just use VDUPLANE. We can only do this if the lane being extracted 6558 // is at a constant index, as the VDUP from lane instructions only have 6559 // constant-index forms. 6560 ConstantSDNode *constIndex; 6561 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6562 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 6563 // We need to create a new undef vector to use for the VDUPLANE if the 6564 // size of the vector from which we get the value is different than the 6565 // size of the vector that we need to create. We will insert the element 6566 // such that the register coalescer will remove unnecessary copies. 
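// (Illustrative example: splatting an element extracted from a v8i16 source into a v4i16 result first inserts the extracted value into an undef v4i16 at lane "index % 4" and then VDUPLANEs from that lane.)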
6567 if (VT != Value->getOperand(0).getValueType()) { 6568 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 6569 VT.getVectorNumElements(); 6570 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6571 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 6572 Value, DAG.getConstant(index, dl, MVT::i32)), 6573 DAG.getConstant(index, dl, MVT::i32)); 6574 } else 6575 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6576 Value->getOperand(0), Value->getOperand(1)); 6577 } else 6578 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 6579 6580 if (!usesOnlyOneValue) { 6581 // The dominant value was splatted as 'N', but we now have to insert 6582 // all differing elements. 6583 for (unsigned I = 0; I < NumElts; ++I) { 6584 if (Op.getOperand(I) == Value) 6585 continue; 6586 SmallVector<SDValue, 3> Ops; 6587 Ops.push_back(N); 6588 Ops.push_back(Op.getOperand(I)); 6589 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 6590 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 6591 } 6592 } 6593 return N; 6594 } 6595 if (VT.getVectorElementType().isFloatingPoint()) { 6596 SmallVector<SDValue, 8> Ops; 6597 for (unsigned i = 0; i < NumElts; ++i) 6598 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 6599 Op.getOperand(i))); 6600 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 6601 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6602 Val = LowerBUILD_VECTOR(Val, DAG, ST); 6603 if (Val.getNode()) 6604 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6605 } 6606 if (usesOnlyOneValue) { 6607 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 6608 if (isConstant && Val.getNode()) 6609 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 6610 } 6611 } 6612 6613 // If all elements are constants and the case above didn't get hit, fall back 6614 // to the default expansion, which will generate a load from the constant 6615 // pool. 6616 if (isConstant) 6617 return SDValue(); 6618 6619 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6620 if (NumElts >= 4) { 6621 SDValue shuffle = ReconstructShuffle(Op, DAG); 6622 if (shuffle != SDValue()) 6623 return shuffle; 6624 } 6625 6626 if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 6627 // If we haven't found an efficient lowering, try splitting a 128-bit vector 6628 // into two 64-bit vectors; we might discover a better way to lower it. 6629 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 6630 EVT ExtVT = VT.getVectorElementType(); 6631 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 6632 SDValue Lower = 6633 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 6634 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 6635 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 6636 SDValue Upper = DAG.getBuildVector( 6637 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 6638 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 6639 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 6640 if (Lower && Upper) 6641 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 6642 } 6643 6644 // Vectors with 32- or 64-bit elements can be built by directly assigning 6645 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 6646 // will be legalized. 6647 if (EltSize >= 32) { 6648 // Do the expansion with floating-point types, since that is what the VFP 6649 // registers are defined to use, and since i64 is not legal. 
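// (Sketch of the idea: a v2i64 build_vector becomes an ARMISD::BUILD_VECTOR of two f64 bitcasts of the operands, and the result is bitcast back to v2i64 afterwards.)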
6650 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6651 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6652 SmallVector<SDValue, 8> Ops; 6653 for (unsigned i = 0; i < NumElts; ++i) 6654 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 6655 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6656 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6657 } 6658 6659 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6660 // know the default expansion would otherwise fall back on something even 6661 // worse. For a vector with one or two non-undef values, that's 6662 // scalar_to_vector for the elements followed by a shuffle (provided the 6663 // shuffle is valid for the target) and materialization element by element 6664 // on the stack followed by a load for everything else. 6665 if (!isConstant && !usesOnlyOneValue) { 6666 SDValue Vec = DAG.getUNDEF(VT); 6667 for (unsigned i = 0 ; i < NumElts; ++i) { 6668 SDValue V = Op.getOperand(i); 6669 if (V.isUndef()) 6670 continue; 6671 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 6672 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6673 } 6674 return Vec; 6675 } 6676 6677 return SDValue(); 6678 } 6679 6680 // Gather data to see if the operation can be modelled as a 6681 // shuffle in combination with VEXTs. 6682 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 6683 SelectionDAG &DAG) const { 6684 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6685 SDLoc dl(Op); 6686 EVT VT = Op.getValueType(); 6687 unsigned NumElts = VT.getVectorNumElements(); 6688 6689 struct ShuffleSourceInfo { 6690 SDValue Vec; 6691 unsigned MinElt = std::numeric_limits<unsigned>::max(); 6692 unsigned MaxElt = 0; 6693 6694 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 6695 // be compatible with the shuffle we intend to construct. As a result 6696 // ShuffleVec will be some sliding window into the original Vec. 6697 SDValue ShuffleVec; 6698 6699 // Code should guarantee that element i in Vec starts at element "WindowBase 6700 // + i * WindowScale in ShuffleVec". 6701 int WindowBase = 0; 6702 int WindowScale = 1; 6703 6704 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 6705 6706 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6707 }; 6708 6709 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6710 // node. 6711 SmallVector<ShuffleSourceInfo, 2> Sources; 6712 for (unsigned i = 0; i < NumElts; ++i) { 6713 SDValue V = Op.getOperand(i); 6714 if (V.isUndef()) 6715 continue; 6716 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 6717 // A shuffle can only come from building a vector from various 6718 // elements of other vectors. 6719 return SDValue(); 6720 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 6721 // Furthermore, shuffles require a constant mask, whereas extractelts 6722 // accept variable indices. 6723 return SDValue(); 6724 } 6725 6726 // Add this element source to the list if it's not already there. 6727 SDValue SourceVec = V.getOperand(0); 6728 auto Source = llvm::find(Sources, SourceVec); 6729 if (Source == Sources.end()) 6730 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 6731 6732 // Update the minimum and maximum lane number seen. 
6733 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 6734 Source->MinElt = std::min(Source->MinElt, EltNo); 6735 Source->MaxElt = std::max(Source->MaxElt, EltNo); 6736 } 6737 6738 // Currently only do something sane when at most two source vectors 6739 // are involved. 6740 if (Sources.size() > 2) 6741 return SDValue(); 6742 6743 // Find out the smallest element size among result and two sources, and use 6744 // it as element size to build the shuffle_vector. 6745 EVT SmallestEltTy = VT.getVectorElementType(); 6746 for (auto &Source : Sources) { 6747 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 6748 if (SrcEltTy.bitsLT(SmallestEltTy)) 6749 SmallestEltTy = SrcEltTy; 6750 } 6751 unsigned ResMultiplier = 6752 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 6753 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6754 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 6755 6756 // If the source vector is too wide or too narrow, we may nevertheless be able 6757 // to construct a compatible shuffle either by concatenating it with UNDEF or 6758 // extracting a suitable range of elements. 6759 for (auto &Src : Sources) { 6760 EVT SrcVT = Src.ShuffleVec.getValueType(); 6761 6762 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 6763 continue; 6764 6765 // This stage of the search produces a source with the same element type as 6766 // the original, but with a total width matching the BUILD_VECTOR output. 6767 EVT EltVT = SrcVT.getVectorElementType(); 6768 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 6769 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 6770 6771 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 6772 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 6773 return SDValue(); 6774 // We can pad out the smaller vector for free, so if it's part of a 6775 // shuffle... 6776 Src.ShuffleVec = 6777 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 6778 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 6779 continue; 6780 } 6781 6782 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 6783 return SDValue(); 6784 6785 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 6786 // Span too large for a VEXT to cope 6787 return SDValue(); 6788 } 6789 6790 if (Src.MinElt >= NumSrcElts) { 6791 // The extraction can just take the second half 6792 Src.ShuffleVec = 6793 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6794 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6795 Src.WindowBase = -NumSrcElts; 6796 } else if (Src.MaxElt < NumSrcElts) { 6797 // The extraction can just take the first half 6798 Src.ShuffleVec = 6799 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6800 DAG.getConstant(0, dl, MVT::i32)); 6801 } else { 6802 // An actual VEXT is needed 6803 SDValue VEXTSrc1 = 6804 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6805 DAG.getConstant(0, dl, MVT::i32)); 6806 SDValue VEXTSrc2 = 6807 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6808 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6809 6810 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 6811 VEXTSrc2, 6812 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 6813 Src.WindowBase = -Src.MinElt; 6814 } 6815 } 6816 6817 // Another possible incompatibility occurs from the vector element types. We 6818 // can fix this by bitcasting the source vectors to the same type we intend 6819 // for the shuffle. 
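// (Illustrative example: if the result type is v8i8 and one source is v4i16, that source is bitcast to v8i8 here and its WindowBase/WindowScale are adjusted so the lane mask built below still addresses the intended elements.)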
6820 for (auto &Src : Sources) { 6821 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 6822 if (SrcEltTy == SmallestEltTy) 6823 continue; 6824 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6825 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6826 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6827 Src.WindowBase *= Src.WindowScale; 6828 } 6829 6830 // Final sanity check before we try to actually produce a shuffle. 6831 DEBUG( 6832 for (auto Src : Sources) 6833 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 6834 ); 6835 6836 // The stars all align, our next step is to produce the mask for the shuffle. 6837 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6838 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 6839 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6840 SDValue Entry = Op.getOperand(i); 6841 if (Entry.isUndef()) 6842 continue; 6843 6844 auto Src = llvm::find(Sources, Entry.getOperand(0)); 6845 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6846 6847 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6848 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6849 // segment. 6850 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6851 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 6852 VT.getScalarSizeInBits()); 6853 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6854 6855 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6856 // starting at the appropriate offset. 6857 int *LaneMask = &Mask[i * ResMultiplier]; 6858 6859 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6860 ExtractBase += NumElts * (Src - Sources.begin()); 6861 for (int j = 0; j < LanesDefined; ++j) 6862 LaneMask[j] = ExtractBase + j; 6863 } 6864 6865 // Final check before we try to produce nonsense... 6866 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 6867 return SDValue(); 6868 6869 // We can't handle more than two sources. This should have already 6870 // been checked before this point. 6871 assert(Sources.size() <= 2 && "Too many sources!"); 6872 6873 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6874 for (unsigned i = 0; i < Sources.size(); ++i) 6875 ShuffleOps[i] = Sources[i].ShuffleVec; 6876 6877 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6878 ShuffleOps[1], Mask); 6879 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6880 } 6881 6882 /// isShuffleMaskLegal - Targets can use this to indicate that they only 6883 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6884 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6885 /// are assumed to be legal. 6886 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 6887 if (VT.getVectorNumElements() == 4 && 6888 (VT.is128BitVector() || VT.is64BitVector())) { 6889 unsigned PFIndexes[4]; 6890 for (unsigned i = 0; i != 4; ++i) { 6891 if (M[i] < 0) 6892 PFIndexes[i] = 8; 6893 else 6894 PFIndexes[i] = M[i]; 6895 } 6896 6897 // Compute the index in the perfect shuffle table. 
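// Each mask index is treated as a base-9 digit (0-7 for a concrete lane, 8 for undef); e.g. the mask <0, 4, 2, 6> maps to entry ((0*9+4)*9+2)*9+6 (illustrative note).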
6898 unsigned PFTableIndex = 6899 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6900 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6901 unsigned Cost = (PFEntry >> 30); 6902 6903 if (Cost <= 4) 6904 return true; 6905 } 6906 6907 bool ReverseVEXT, isV_UNDEF; 6908 unsigned Imm, WhichResult; 6909 6910 unsigned EltSize = VT.getScalarSizeInBits(); 6911 return (EltSize >= 32 || 6912 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6913 isVREVMask(M, VT, 64) || 6914 isVREVMask(M, VT, 32) || 6915 isVREVMask(M, VT, 16) || 6916 isVEXTMask(M, VT, ReverseVEXT, Imm) || 6917 isVTBLMask(M, VT) || 6918 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 6919 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 6920 } 6921 6922 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6923 /// the specified operations to build the shuffle. 6924 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6925 SDValue RHS, SelectionDAG &DAG, 6926 const SDLoc &dl) { 6927 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6928 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 6929 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 6930 6931 enum { 6932 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6933 OP_VREV, 6934 OP_VDUP0, 6935 OP_VDUP1, 6936 OP_VDUP2, 6937 OP_VDUP3, 6938 OP_VEXT1, 6939 OP_VEXT2, 6940 OP_VEXT3, 6941 OP_VUZPL, // VUZP, left result 6942 OP_VUZPR, // VUZP, right result 6943 OP_VZIPL, // VZIP, left result 6944 OP_VZIPR, // VZIP, right result 6945 OP_VTRNL, // VTRN, left result 6946 OP_VTRNR // VTRN, right result 6947 }; 6948 6949 if (OpNum == OP_COPY) { 6950 if (LHSID == (1*9+2)*9+3) return LHS; 6951 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 6952 return RHS; 6953 } 6954 6955 SDValue OpLHS, OpRHS; 6956 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6957 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 6958 EVT VT = OpLHS.getValueType(); 6959 6960 switch (OpNum) { 6961 default: llvm_unreachable("Unknown shuffle opcode!"); 6962 case OP_VREV: 6963 // VREV divides the vector in half and swaps within the half. 
6964 if (VT.getVectorElementType() == MVT::i32 || 6965 VT.getVectorElementType() == MVT::f32) 6966 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 6967 // vrev <4 x i16> -> VREV32 6968 if (VT.getVectorElementType() == MVT::i16) 6969 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 6970 // vrev <4 x i8> -> VREV16 6971 assert(VT.getVectorElementType() == MVT::i8); 6972 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 6973 case OP_VDUP0: 6974 case OP_VDUP1: 6975 case OP_VDUP2: 6976 case OP_VDUP3: 6977 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6978 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 6979 case OP_VEXT1: 6980 case OP_VEXT2: 6981 case OP_VEXT3: 6982 return DAG.getNode(ARMISD::VEXT, dl, VT, 6983 OpLHS, OpRHS, 6984 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 6985 case OP_VUZPL: 6986 case OP_VUZPR: 6987 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 6988 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 6989 case OP_VZIPL: 6990 case OP_VZIPR: 6991 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 6992 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 6993 case OP_VTRNL: 6994 case OP_VTRNR: 6995 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 6996 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 6997 } 6998 } 6999 7000 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 7001 ArrayRef<int> ShuffleMask, 7002 SelectionDAG &DAG) { 7003 // Check to see if we can use the VTBL instruction. 7004 SDValue V1 = Op.getOperand(0); 7005 SDValue V2 = Op.getOperand(1); 7006 SDLoc DL(Op); 7007 7008 SmallVector<SDValue, 8> VTBLMask; 7009 for (ArrayRef<int>::iterator 7010 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 7011 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 7012 7013 if (V2.getNode()->isUndef()) 7014 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 7015 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7016 7017 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 7018 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 7019 } 7020 7021 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 7022 SelectionDAG &DAG) { 7023 SDLoc DL(Op); 7024 SDValue OpLHS = Op.getOperand(0); 7025 EVT VT = OpLHS.getValueType(); 7026 7027 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 7028 "Expect a v8i16/v16i8 type"); 7029 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 7030 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 7031 // extract the first 8 bytes into the top double word and the last 8 bytes 7032 // into the bottom double word. The v8i16 case is similar. 7033 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 7034 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 7035 DAG.getConstant(ExtractNum, DL, MVT::i32)); 7036 } 7037 7038 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 7039 SDValue V1 = Op.getOperand(0); 7040 SDValue V2 = Op.getOperand(1); 7041 SDLoc dl(Op); 7042 EVT VT = Op.getValueType(); 7043 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 7044 7045 // Convert shuffles that are directly supported on NEON to target-specific 7046 // DAG nodes, instead of keeping them as shuffles and matching them again 7047 // during code selection. This is more efficient and avoids the possibility 7048 // of inconsistencies between legalization and selection. 7049 // FIXME: floating-point vectors should be canonicalized to integer vectors 7050 // of the same type so that they get CSEd properly.
7051 ArrayRef<int> ShuffleMask = SVN->getMask(); 7052 7053 unsigned EltSize = VT.getScalarSizeInBits(); 7054 if (EltSize <= 32) { 7055 if (SVN->isSplat()) { 7056 int Lane = SVN->getSplatIndex(); 7057 // If this is undef splat, generate it via "just" vdup, if possible. 7058 if (Lane == -1) Lane = 0; 7059 7060 // Test if V1 is a SCALAR_TO_VECTOR. 7061 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7062 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7063 } 7064 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 7065 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 7066 // reaches it). 7067 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 7068 !isa<ConstantSDNode>(V1.getOperand(0))) { 7069 bool IsScalarToVector = true; 7070 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7071 if (!V1.getOperand(i).isUndef()) { 7072 IsScalarToVector = false; 7073 break; 7074 } 7075 if (IsScalarToVector) 7076 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7077 } 7078 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 7079 DAG.getConstant(Lane, dl, MVT::i32)); 7080 } 7081 7082 bool ReverseVEXT; 7083 unsigned Imm; 7084 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7085 if (ReverseVEXT) 7086 std::swap(V1, V2); 7087 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7088 DAG.getConstant(Imm, dl, MVT::i32)); 7089 } 7090 7091 if (isVREVMask(ShuffleMask, VT, 64)) 7092 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7093 if (isVREVMask(ShuffleMask, VT, 32)) 7094 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 7095 if (isVREVMask(ShuffleMask, VT, 16)) 7096 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 7097 7098 if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 7099 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 7100 DAG.getConstant(Imm, dl, MVT::i32)); 7101 } 7102 7103 // Check for Neon shuffles that modify both input vectors in place. 7104 // If both results are used, i.e., if there are two shuffles with the same 7105 // source operands and with masks corresponding to both results of one of 7106 // these operations, DAG memoization will ensure that a single node is 7107 // used for both shuffles. 7108 unsigned WhichResult; 7109 bool isV_UNDEF; 7110 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7111 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 7112 if (isV_UNDEF) 7113 V2 = V1; 7114 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 7115 .getValue(WhichResult); 7116 } 7117 7118 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 7119 // shuffles that produce a result larger than their operands with: 7120 // shuffle(concat(v1, undef), concat(v2, undef)) 7121 // -> 7122 // shuffle(concat(v1, v2), undef) 7123 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 7124 // 7125 // This is useful in the general case, but there are special cases where 7126 // native shuffles produce larger results: the two-result ops. 7127 // 7128 // Look through the concat when lowering them: 7129 // shuffle(concat(v1, v2), undef) 7130 // -> 7131 // concat(VZIP(v1, v2):0, :1) 7132 // 7133 if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 7134 SDValue SubV1 = V1->getOperand(0); 7135 SDValue SubV2 = V1->getOperand(1); 7136 EVT SubVT = SubV1.getValueType(); 7137 7138 // We expect these to have been canonicalized to -1. 
7139 assert(llvm::all_of(ShuffleMask, [&](int i) { 7140 return i < (int)VT.getVectorNumElements(); 7141 }) && "Unexpected shuffle index into UNDEF operand!"); 7142 7143 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7144 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 7145 if (isV_UNDEF) 7146 SubV2 = SubV1; 7147 assert((WhichResult == 0) && 7148 "In-place shuffle of concat can only have one result!"); 7149 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 7150 SubV1, SubV2); 7151 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 7152 Res.getValue(1)); 7153 } 7154 } 7155 } 7156 7157 // If the shuffle is not directly supported and it has 4 elements, use 7158 // the PerfectShuffle-generated table to synthesize it from other shuffles. 7159 unsigned NumElts = VT.getVectorNumElements(); 7160 if (NumElts == 4) { 7161 unsigned PFIndexes[4]; 7162 for (unsigned i = 0; i != 4; ++i) { 7163 if (ShuffleMask[i] < 0) 7164 PFIndexes[i] = 8; 7165 else 7166 PFIndexes[i] = ShuffleMask[i]; 7167 } 7168 7169 // Compute the index in the perfect shuffle table. 7170 unsigned PFTableIndex = 7171 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7172 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7173 unsigned Cost = (PFEntry >> 30); 7174 7175 if (Cost <= 4) 7176 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 7177 } 7178 7179 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 7180 if (EltSize >= 32) { 7181 // Do the expansion with floating-point types, since that is what the VFP 7182 // registers are defined to use, and since i64 is not legal. 7183 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7184 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7185 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 7186 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 7187 SmallVector<SDValue, 8> Ops; 7188 for (unsigned i = 0; i < NumElts; ++i) { 7189 if (ShuffleMask[i] < 0) 7190 Ops.push_back(DAG.getUNDEF(EltVT)); 7191 else 7192 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 7193 ShuffleMask[i] < (int)NumElts ? V1 : V2, 7194 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 7195 dl, MVT::i32))); 7196 } 7197 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7198 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7199 } 7200 7201 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 7202 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 7203 7204 if (VT == MVT::v8i8) 7205 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 7206 return NewOp; 7207 7208 return SDValue(); 7209 } 7210 7211 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7212 // INSERT_VECTOR_ELT is legal only for immediate indexes. 7213 SDValue Lane = Op.getOperand(2); 7214 if (!isa<ConstantSDNode>(Lane)) 7215 return SDValue(); 7216 7217 return Op; 7218 } 7219 7220 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7221 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
7222 SDValue Lane = Op.getOperand(1); 7223 if (!isa<ConstantSDNode>(Lane)) 7224 return SDValue(); 7225 7226 SDValue Vec = Op.getOperand(0); 7227 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 7228 SDLoc dl(Op); 7229 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 7230 } 7231 7232 return Op; 7233 } 7234 7235 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 7236 // The only time a CONCAT_VECTORS operation can have legal types is when 7237 // two 64-bit vectors are concatenated to a 128-bit vector. 7238 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 7239 "unexpected CONCAT_VECTORS"); 7240 SDLoc dl(Op); 7241 SDValue Val = DAG.getUNDEF(MVT::v2f64); 7242 SDValue Op0 = Op.getOperand(0); 7243 SDValue Op1 = Op.getOperand(1); 7244 if (!Op0.isUndef()) 7245 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7246 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 7247 DAG.getIntPtrConstant(0, dl)); 7248 if (!Op1.isUndef()) 7249 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7250 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 7251 DAG.getIntPtrConstant(1, dl)); 7252 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 7253 } 7254 7255 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 7256 /// element has been zero/sign-extended, depending on the isSigned parameter, 7257 /// from an integer type half its size. 7258 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 7259 bool isSigned) { 7260 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 7261 EVT VT = N->getValueType(0); 7262 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 7263 SDNode *BVN = N->getOperand(0).getNode(); 7264 if (BVN->getValueType(0) != MVT::v4i32 || 7265 BVN->getOpcode() != ISD::BUILD_VECTOR) 7266 return false; 7267 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 7268 unsigned HiElt = 1 - LoElt; 7269 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 7270 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 7271 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 7272 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 7273 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 7274 return false; 7275 if (isSigned) { 7276 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 7277 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 7278 return true; 7279 } else { 7280 if (Hi0->isNullValue() && Hi1->isNullValue()) 7281 return true; 7282 } 7283 return false; 7284 } 7285 7286 if (N->getOpcode() != ISD::BUILD_VECTOR) 7287 return false; 7288 7289 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 7290 SDNode *Elt = N->getOperand(i).getNode(); 7291 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 7292 unsigned EltSize = VT.getScalarSizeInBits(); 7293 unsigned HalfSize = EltSize / 2; 7294 if (isSigned) { 7295 if (!isIntN(HalfSize, C->getSExtValue())) 7296 return false; 7297 } else { 7298 if (!isUIntN(HalfSize, C->getZExtValue())) 7299 return false; 7300 } 7301 continue; 7302 } 7303 return false; 7304 } 7305 7306 return true; 7307 } 7308 7309 /// isSignExtended - Check if a node is a vector value that is sign-extended 7310 /// or a constant BUILD_VECTOR with sign-extended elements. 
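/// For example (illustrative only), (v4i32 (sign_extend (v4i16 x))), a
/// v4i16-to-v4i32 sextload, or a constant v4i32 BUILD_VECTOR whose elements
/// all fit in i16 each count as sign-extended here, which is what lets
/// LowerMUL form a VMULL.S16 instead of widening both operands first.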
7311 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 7312 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 7313 return true; 7314 if (isExtendedBUILD_VECTOR(N, DAG, true)) 7315 return true; 7316 return false; 7317 } 7318 7319 /// isZeroExtended - Check if a node is a vector value that is zero-extended 7320 /// or a constant BUILD_VECTOR with zero-extended elements. 7321 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 7322 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) 7323 return true; 7324 if (isExtendedBUILD_VECTOR(N, DAG, false)) 7325 return true; 7326 return false; 7327 } 7328 7329 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 7330 if (OrigVT.getSizeInBits() >= 64) 7331 return OrigVT; 7332 7333 assert(OrigVT.isSimple() && "Expecting a simple value type"); 7334 7335 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 7336 switch (OrigSimpleTy) { 7337 default: llvm_unreachable("Unexpected Vector Type"); 7338 case MVT::v2i8: 7339 case MVT::v2i16: 7340 return MVT::v2i32; 7341 case MVT::v4i8: 7342 return MVT::v4i16; 7343 } 7344 } 7345 7346 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 7347 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 7348 /// We insert the required extension here to get the vector to fill a D register. 7349 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 7350 const EVT &OrigTy, 7351 const EVT &ExtTy, 7352 unsigned ExtOpcode) { 7353 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 7354 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 7355 // 64-bits we need to insert a new extension so that it will be 64-bits. 7356 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 7357 if (OrigTy.getSizeInBits() >= 64) 7358 return N; 7359 7360 // Must extend size to at least 64 bits to be used as an operand for VMULL. 7361 EVT NewVT = getExtensionTo64Bits(OrigTy); 7362 7363 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 7364 } 7365 7366 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 7367 /// does not do any sign/zero extension. If the original vector is less 7368 /// than 64 bits, an appropriate extension will be added after the load to 7369 /// reach a total size of 64 bits. We have to add the extension separately 7370 /// because ARM does not have a sign/zero extending load for vectors. 7371 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 7372 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 7373 7374 // The load already has the right type. 7375 if (ExtendedTy == LD->getMemoryVT()) 7376 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 7377 LD->getBasePtr(), LD->getPointerInfo(), 7378 LD->getAlignment(), LD->getMemOperand()->getFlags()); 7379 7380 // We need to create a zextload/sextload. We cannot just create a load 7381 // followed by a zext/zext node because LowerMUL is also run during normal 7382 // operation legalization where we can't create illegal types. 7383 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 7384 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 7385 LD->getMemoryVT(), LD->getAlignment(), 7386 LD->getMemOperand()->getFlags()); 7387 } 7388 7389 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 7390 /// extending load, or BUILD_VECTOR with extended elements, return the 7391 /// unextended value. 
/// The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
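  // One concrete shape this catches (illustrative only):
  //   (v2i64 (mul (sext (v2i32 a)), (sext (v2i32 b))))
  // becomes a single ARMISD::VMULLs (a VMULL.S32 d-register multiply), and
  // the isMLA path below handles (add/sub of extends) * extend so that a
  // vmull + vmlal/vmlsl pair can be used instead of widening first.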
7470 EVT VT = Op.getValueType(); 7471 assert(VT.is128BitVector() && VT.isInteger() && 7472 "unexpected type for custom-lowering ISD::MUL"); 7473 SDNode *N0 = Op.getOperand(0).getNode(); 7474 SDNode *N1 = Op.getOperand(1).getNode(); 7475 unsigned NewOpc = 0; 7476 bool isMLA = false; 7477 bool isN0SExt = isSignExtended(N0, DAG); 7478 bool isN1SExt = isSignExtended(N1, DAG); 7479 if (isN0SExt && isN1SExt) 7480 NewOpc = ARMISD::VMULLs; 7481 else { 7482 bool isN0ZExt = isZeroExtended(N0, DAG); 7483 bool isN1ZExt = isZeroExtended(N1, DAG); 7484 if (isN0ZExt && isN1ZExt) 7485 NewOpc = ARMISD::VMULLu; 7486 else if (isN1SExt || isN1ZExt) { 7487 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 7488 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 7489 if (isN1SExt && isAddSubSExt(N0, DAG)) { 7490 NewOpc = ARMISD::VMULLs; 7491 isMLA = true; 7492 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 7493 NewOpc = ARMISD::VMULLu; 7494 isMLA = true; 7495 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 7496 std::swap(N0, N1); 7497 NewOpc = ARMISD::VMULLu; 7498 isMLA = true; 7499 } 7500 } 7501 7502 if (!NewOpc) { 7503 if (VT == MVT::v2i64) 7504 // Fall through to expand this. It is not legal. 7505 return SDValue(); 7506 else 7507 // Other vector multiplications are legal. 7508 return Op; 7509 } 7510 } 7511 7512 // Legalize to a VMULL instruction. 7513 SDLoc DL(Op); 7514 SDValue Op0; 7515 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 7516 if (!isMLA) { 7517 Op0 = SkipExtensionForVMULL(N0, DAG); 7518 assert(Op0.getValueType().is64BitVector() && 7519 Op1.getValueType().is64BitVector() && 7520 "unexpected types for extended operands to VMULL"); 7521 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 7522 } 7523 7524 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 7525 // isel lowering to take advantage of no-stall back to back vmul + vmla. 7526 // vmull q0, d4, d6 7527 // vmlal q0, d5, d6 7528 // is faster than 7529 // vaddl q0, d4, d5 7530 // vmovl q1, d6 7531 // vmul q0, q0, q1 7532 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 7533 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 7534 EVT Op1VT = Op1.getValueType(); 7535 return DAG.getNode(N0->getOpcode(), DL, VT, 7536 DAG.getNode(NewOpc, DL, VT, 7537 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 7538 DAG.getNode(NewOpc, DL, VT, 7539 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 7540 } 7541 7542 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 7543 SelectionDAG &DAG) { 7544 // TODO: Should this propagate fast-math-flags? 7545 7546 // Convert to float 7547 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 7548 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 7549 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 7550 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 7551 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 7552 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 7553 // Get reciprocal estimate. 7554 // float4 recip = vrecpeq_f32(yf); 7555 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7556 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7557 Y); 7558 // Because char has a smaller range than uchar, we can actually get away 7559 // without any newton steps. This requires that we use a weird bias 7560 // of 0xb000, however (again, this has been exhaustively tested). 
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}

static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
7606 // return vmovn_s32(vcvt_s32_f32(result)); 7607 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7608 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7609 return N0; 7610 } 7611 7612 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 7613 EVT VT = Op.getValueType(); 7614 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7615 "unexpected type for custom-lowering ISD::SDIV"); 7616 7617 SDLoc dl(Op); 7618 SDValue N0 = Op.getOperand(0); 7619 SDValue N1 = Op.getOperand(1); 7620 SDValue N2, N3; 7621 7622 if (VT == MVT::v8i8) { 7623 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 7624 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 7625 7626 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7627 DAG.getIntPtrConstant(4, dl)); 7628 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7629 DAG.getIntPtrConstant(4, dl)); 7630 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7631 DAG.getIntPtrConstant(0, dl)); 7632 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7633 DAG.getIntPtrConstant(0, dl)); 7634 7635 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 7636 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 7637 7638 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7639 N0 = LowerCONCAT_VECTORS(N0, DAG); 7640 7641 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 7642 return N0; 7643 } 7644 return LowerSDIV_v4i16(N0, N1, dl, DAG); 7645 } 7646 7647 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 7648 // TODO: Should this propagate fast-math-flags? 7649 EVT VT = Op.getValueType(); 7650 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7651 "unexpected type for custom-lowering ISD::UDIV"); 7652 7653 SDLoc dl(Op); 7654 SDValue N0 = Op.getOperand(0); 7655 SDValue N1 = Op.getOperand(1); 7656 SDValue N2, N3; 7657 7658 if (VT == MVT::v8i8) { 7659 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 7660 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 7661 7662 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7663 DAG.getIntPtrConstant(4, dl)); 7664 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7665 DAG.getIntPtrConstant(4, dl)); 7666 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7667 DAG.getIntPtrConstant(0, dl)); 7668 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7669 DAG.getIntPtrConstant(0, dl)); 7670 7671 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 7672 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 7673 7674 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7675 N0 = LowerCONCAT_VECTORS(N0, DAG); 7676 7677 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 7678 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 7679 MVT::i32), 7680 N0); 7681 return N0; 7682 } 7683 7684 // v4i16 sdiv ... Convert to float. 7685 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 7686 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 7687 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 7688 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 7689 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 7690 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 7691 7692 // Use reciprocal estimate and two refinement steps. 
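  // (A sketch of what each refinement step does, for reference only: VRECPS
  // computes 2.0 - d * x, so each
  //   recip = recip * vrecpsq_f32(d, recip)
  // below is one Newton-Raphson iteration x <- x * (2 - d * x). VRECPE by
  // itself only provides roughly 8 correct bits, so the full unsigned 16-bit
  // range needs two iterations here, whereas the signed helpers above get
  // away with zero or one because their input range is smaller.)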
7693 // float4 recip = vrecpeq_f32(yf); 7694 // recip *= vrecpsq_f32(yf, recip); 7695 // recip *= vrecpsq_f32(yf, recip); 7696 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7697 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7698 BN1); 7699 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7700 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7701 BN1, N2); 7702 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7703 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7704 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7705 BN1, N2); 7706 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7707 // Simply multiplying by the reciprocal estimate can leave us a few ulps 7708 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 7709 // and that it will never cause us to return an answer too large). 7710 // float4 result = as_float4(as_int4(xf*recip) + 2); 7711 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 7712 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 7713 N1 = DAG.getConstant(2, dl, MVT::v4i32); 7714 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 7715 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 7716 // Convert back to integer and return. 7717 // return vmovn_u32(vcvt_s32_f32(result)); 7718 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7719 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7720 return N0; 7721 } 7722 7723 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 7724 EVT VT = Op.getNode()->getValueType(0); 7725 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 7726 7727 unsigned Opc; 7728 bool ExtraOp = false; 7729 switch (Op.getOpcode()) { 7730 default: llvm_unreachable("Invalid code"); 7731 case ISD::ADDC: Opc = ARMISD::ADDC; break; 7732 case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break; 7733 case ISD::SUBC: Opc = ARMISD::SUBC; break; 7734 case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break; 7735 } 7736 7737 if (!ExtraOp) 7738 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 7739 Op.getOperand(1)); 7740 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 7741 Op.getOperand(1), Op.getOperand(2)); 7742 } 7743 7744 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 7745 SDNode *N = Op.getNode(); 7746 EVT VT = N->getValueType(0); 7747 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 7748 7749 SDValue Carry = Op.getOperand(2); 7750 7751 SDLoc DL(Op); 7752 7753 SDValue Result; 7754 if (Op.getOpcode() == ISD::ADDCARRY) { 7755 // This converts the boolean value carry into the carry flag. 7756 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 7757 7758 // Do the addition proper using the carry flag we wanted. 7759 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 7760 Op.getOperand(1), Carry.getValue(1)); 7761 7762 // Now convert the carry flag into a boolean value. 7763 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 7764 } else { 7765 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 7766 // have to invert the carry first. 7767 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 7768 DAG.getConstant(1, DL, MVT::i32), Carry); 7769 // This converts the boolean value carry into the carry flag. 7770 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 7771 7772 // Do the subtraction proper using the carry flag we wanted. 
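    // Worked example of the convention (illustrative only): for 0 - 1 with an
    // incoming borrow of 0, ISD::SUBCARRY expects a result of 0xFFFFFFFF and
    // an outgoing borrow of 1. SBC computes a - b - (1 - C); with C = 1 - 0
    // as set up above, the flags come back with C = 0, and the final 1 - C
    // below reports the borrow of 1 that SUBCARRY wants.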
7773 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 7774 Op.getOperand(1), Carry.getValue(1)); 7775 7776 // Now convert the carry flag into a boolean value. 7777 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 7778 // But the carry returned by ARMISD::SUBE is not a borrow as expected 7779 // by ISD::SUBCARRY, so compute 1 - C. 7780 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 7781 DAG.getConstant(1, DL, MVT::i32), Carry); 7782 } 7783 7784 // Return both values. 7785 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 7786 } 7787 7788 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 7789 assert(Subtarget->isTargetDarwin()); 7790 7791 // For iOS, we want to call an alternative entry point: __sincos_stret, 7792 // return values are passed via sret. 7793 SDLoc dl(Op); 7794 SDValue Arg = Op.getOperand(0); 7795 EVT ArgVT = Arg.getValueType(); 7796 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7797 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7798 7799 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7800 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 7801 7802 // Pair of floats / doubles used to pass the result. 7803 Type *RetTy = StructType::get(ArgTy, ArgTy); 7804 auto &DL = DAG.getDataLayout(); 7805 7806 ArgListTy Args; 7807 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 7808 SDValue SRet; 7809 if (ShouldUseSRet) { 7810 // Create stack object for sret. 7811 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 7812 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 7813 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 7814 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 7815 7816 ArgListEntry Entry; 7817 Entry.Node = SRet; 7818 Entry.Ty = RetTy->getPointerTo(); 7819 Entry.IsSExt = false; 7820 Entry.IsZExt = false; 7821 Entry.IsSRet = true; 7822 Args.push_back(Entry); 7823 RetTy = Type::getVoidTy(*DAG.getContext()); 7824 } 7825 7826 ArgListEntry Entry; 7827 Entry.Node = Arg; 7828 Entry.Ty = ArgTy; 7829 Entry.IsSExt = false; 7830 Entry.IsZExt = false; 7831 Args.push_back(Entry); 7832 7833 RTLIB::Libcall LC = 7834 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 7835 const char *LibcallName = getLibcallName(LC); 7836 CallingConv::ID CC = getLibcallCallingConv(LC); 7837 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 7838 7839 TargetLowering::CallLoweringInfo CLI(DAG); 7840 CLI.setDebugLoc(dl) 7841 .setChain(DAG.getEntryNode()) 7842 .setCallee(CC, RetTy, Callee, std::move(Args)) 7843 .setDiscardResult(ShouldUseSRet); 7844 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 7845 7846 if (!ShouldUseSRet) 7847 return CallResult.first; 7848 7849 SDValue LoadSin = 7850 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 7851 7852 // Address of cos field. 
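  // Layout assumed for the sret block, matching the loads here (a sketch of
  // the StructType::get(ArgTy, ArgTy) set up above):
  //   struct { float/double sin; float/double cos; };
  // i.e. sin at offset 0 (loaded above) and cos at offset
  // ArgVT.getStoreSize(), which is what the ADD below computes.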
7853 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 7854 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 7855 SDValue LoadCos = 7856 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 7857 7858 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 7859 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 7860 LoadSin.getValue(0), LoadCos.getValue(0)); 7861 } 7862 7863 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 7864 bool Signed, 7865 SDValue &Chain) const { 7866 EVT VT = Op.getValueType(); 7867 assert((VT == MVT::i32 || VT == MVT::i64) && 7868 "unexpected type for custom lowering DIV"); 7869 SDLoc dl(Op); 7870 7871 const auto &DL = DAG.getDataLayout(); 7872 const auto &TLI = DAG.getTargetLoweringInfo(); 7873 7874 const char *Name = nullptr; 7875 if (Signed) 7876 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 7877 else 7878 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 7879 7880 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 7881 7882 ARMTargetLowering::ArgListTy Args; 7883 7884 for (auto AI : {1, 0}) { 7885 ArgListEntry Arg; 7886 Arg.Node = Op.getOperand(AI); 7887 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 7888 Args.push_back(Arg); 7889 } 7890 7891 CallLoweringInfo CLI(DAG); 7892 CLI.setDebugLoc(dl) 7893 .setChain(Chain) 7894 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 7895 ES, std::move(Args)); 7896 7897 return LowerCallTo(CLI).first; 7898 } 7899 7900 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 7901 bool Signed) const { 7902 assert(Op.getValueType() == MVT::i32 && 7903 "unexpected type for custom lowering DIV"); 7904 SDLoc dl(Op); 7905 7906 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 7907 DAG.getEntryNode(), Op.getOperand(1)); 7908 7909 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7910 } 7911 7912 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 7913 SDLoc DL(N); 7914 SDValue Op = N->getOperand(1); 7915 if (N->getValueType(0) == MVT::i32) 7916 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 7917 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7918 DAG.getConstant(0, DL, MVT::i32)); 7919 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7920 DAG.getConstant(1, DL, MVT::i32)); 7921 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 7922 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 7923 } 7924 7925 void ARMTargetLowering::ExpandDIV_Windows( 7926 SDValue Op, SelectionDAG &DAG, bool Signed, 7927 SmallVectorImpl<SDValue> &Results) const { 7928 const auto &DL = DAG.getDataLayout(); 7929 const auto &TLI = DAG.getTargetLoweringInfo(); 7930 7931 assert(Op.getValueType() == MVT::i64 && 7932 "unexpected type for custom lowering DIV"); 7933 SDLoc dl(Op); 7934 7935 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 7936 7937 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7938 7939 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 7940 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 7941 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 7942 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 7943 7944 Results.push_back(Lower); 7945 Results.push_back(Upper); 7946 } 7947 7948 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 7949 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 7950 // 
Acquire/Release load/store is not legal for targets without a dmb or 7951 // equivalent available. 7952 return SDValue(); 7953 7954 // Monotonic load/store is legal for all targets. 7955 return Op; 7956 } 7957 7958 static void ReplaceREADCYCLECOUNTER(SDNode *N, 7959 SmallVectorImpl<SDValue> &Results, 7960 SelectionDAG &DAG, 7961 const ARMSubtarget *Subtarget) { 7962 SDLoc DL(N); 7963 // Under Power Management extensions, the cycle-count is: 7964 // mrc p15, #0, <Rt>, c9, c13, #0 7965 SDValue Ops[] = { N->getOperand(0), // Chain 7966 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 7967 DAG.getConstant(15, DL, MVT::i32), 7968 DAG.getConstant(0, DL, MVT::i32), 7969 DAG.getConstant(9, DL, MVT::i32), 7970 DAG.getConstant(13, DL, MVT::i32), 7971 DAG.getConstant(0, DL, MVT::i32) 7972 }; 7973 7974 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 7975 DAG.getVTList(MVT::i32, MVT::Other), Ops); 7976 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 7977 DAG.getConstant(0, DL, MVT::i32))); 7978 Results.push_back(Cycles32.getValue(1)); 7979 } 7980 7981 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 7982 SDLoc dl(V.getNode()); 7983 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 7984 SDValue VHi = DAG.getAnyExtOrTrunc( 7985 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 7986 dl, MVT::i32); 7987 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 7988 if (isBigEndian) 7989 std::swap (VLo, VHi); 7990 SDValue RegClass = 7991 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 7992 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 7993 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 7994 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 7995 return SDValue( 7996 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 7997 } 7998 7999 static void ReplaceCMP_SWAP_64Results(SDNode *N, 8000 SmallVectorImpl<SDValue> & Results, 8001 SelectionDAG &DAG) { 8002 assert(N->getValueType(0) == MVT::i64 && 8003 "AtomicCmpSwap on types less than 64 should be legal"); 8004 SDValue Ops[] = {N->getOperand(1), 8005 createGPRPairNode(DAG, N->getOperand(2)), 8006 createGPRPairNode(DAG, N->getOperand(3)), 8007 N->getOperand(0)}; 8008 SDNode *CmpSwap = DAG.getMachineNode( 8009 ARM::CMP_SWAP_64, SDLoc(N), 8010 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 8011 8012 MachineFunction &MF = DAG.getMachineFunction(); 8013 MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); 8014 MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); 8015 cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); 8016 8017 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 8018 8019 Results.push_back( 8020 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 8021 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 8022 Results.push_back( 8023 DAG.getTargetExtractSubreg(isBigEndian ? 
                                 ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
  Results.push_back(SDValue(CmpSwap, 2));
}

static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
                          SelectionDAG &DAG) {
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
         "Custom lowering is MSVCRT specific!");

  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  MVT Ty = Val->getSimpleValueType(0);
  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
  SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
                                         TLI.getPointerTy(DAG.getDataLayout()));

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Val;
  Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Entry.Node = Exponent;
  Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());

  // The in-chain to the call is the entry node. If we are emitting a
  // tail call, the chain will be mutated if the node has a non-entry input
  // chain.
  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;

  const Function &F = DAG.getMachineFunction().getFunction();
  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
              F.getReturnType() == LCRTy;
  if (IsTC)
    InChain = TCChain;

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
      .setTailCall(IsTC);
  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);

  // Return the chain (the DAG root) if it is a tail call.
  return !CI.second.getNode() ?
DAG.getRoot() : CI.first; 8078 } 8079 8080 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8081 DEBUG(dbgs() << "Lowering node: "; Op.dump()); 8082 switch (Op.getOpcode()) { 8083 default: llvm_unreachable("Don't know how to custom lower this!"); 8084 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 8085 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8086 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8087 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8088 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8089 case ISD::SELECT: return LowerSELECT(Op, DAG); 8090 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 8091 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8092 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 8093 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 8094 case ISD::VASTART: return LowerVASTART(Op, DAG); 8095 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 8096 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 8097 case ISD::SINT_TO_FP: 8098 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 8099 case ISD::FP_TO_SINT: 8100 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 8101 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8102 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8103 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8104 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 8105 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 8106 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 8107 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 8108 Subtarget); 8109 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 8110 case ISD::SHL: 8111 case ISD::SRL: 8112 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 8113 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 8114 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 8115 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 8116 case ISD::SRL_PARTS: 8117 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 8118 case ISD::CTTZ: 8119 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 8120 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 8121 case ISD::SETCC: return LowerVSETCC(Op, DAG); 8122 case ISD::SETCCE: return LowerSETCCE(Op, DAG); 8123 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 8124 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 8125 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8126 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8127 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8128 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8129 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8130 case ISD::MUL: return LowerMUL(Op, DAG); 8131 case ISD::SDIV: 8132 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 8133 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 8134 return LowerSDIV(Op, DAG); 8135 case ISD::UDIV: 8136 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 8137 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 8138 return LowerUDIV(Op, DAG); 8139 case ISD::ADDC: 8140 case ISD::ADDE: 8141 case ISD::SUBC: 8142 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 8143 case ISD::ADDCARRY: 8144 case ISD::SUBCARRY: return 
LowerADDSUBCARRY(Op, DAG); 8145 case ISD::SADDO: 8146 case ISD::SSUBO: 8147 return LowerSignedALUO(Op, DAG); 8148 case ISD::UADDO: 8149 case ISD::USUBO: 8150 return LowerUnsignedALUO(Op, DAG); 8151 case ISD::ATOMIC_LOAD: 8152 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 8153 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 8154 case ISD::SDIVREM: 8155 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 8156 case ISD::DYNAMIC_STACKALLOC: 8157 if (Subtarget->isTargetWindows()) 8158 return LowerDYNAMIC_STACKALLOC(Op, DAG); 8159 llvm_unreachable("Don't know how to custom lower this!"); 8160 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 8161 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 8162 case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); 8163 case ARMISD::WIN__DBZCHK: return SDValue(); 8164 } 8165 } 8166 8167 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 8168 SelectionDAG &DAG) { 8169 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 8170 unsigned Opc = 0; 8171 if (IntNo == Intrinsic::arm_smlald) 8172 Opc = ARMISD::SMLALD; 8173 else if (IntNo == Intrinsic::arm_smlaldx) 8174 Opc = ARMISD::SMLALDX; 8175 else if (IntNo == Intrinsic::arm_smlsld) 8176 Opc = ARMISD::SMLSLD; 8177 else if (IntNo == Intrinsic::arm_smlsldx) 8178 Opc = ARMISD::SMLSLDX; 8179 else 8180 return; 8181 8182 SDLoc dl(N); 8183 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8184 N->getOperand(3), 8185 DAG.getConstant(0, dl, MVT::i32)); 8186 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8187 N->getOperand(3), 8188 DAG.getConstant(1, dl, MVT::i32)); 8189 8190 SDValue LongMul = DAG.getNode(Opc, dl, 8191 DAG.getVTList(MVT::i32, MVT::i32), 8192 N->getOperand(1), N->getOperand(2), 8193 Lo, Hi); 8194 Results.push_back(LongMul.getValue(0)); 8195 Results.push_back(LongMul.getValue(1)); 8196 } 8197 8198 /// ReplaceNodeResults - Replace the results of node with an illegal result 8199 /// type with new values built out of custom code. 
8200 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 8201 SmallVectorImpl<SDValue> &Results, 8202 SelectionDAG &DAG) const { 8203 SDValue Res; 8204 switch (N->getOpcode()) { 8205 default: 8206 llvm_unreachable("Don't know how to custom expand this!"); 8207 case ISD::READ_REGISTER: 8208 ExpandREAD_REGISTER(N, Results, DAG); 8209 break; 8210 case ISD::BITCAST: 8211 Res = ExpandBITCAST(N, DAG, Subtarget); 8212 break; 8213 case ISD::SRL: 8214 case ISD::SRA: 8215 Res = Expand64BitShift(N, DAG, Subtarget); 8216 break; 8217 case ISD::SREM: 8218 case ISD::UREM: 8219 Res = LowerREM(N, DAG); 8220 break; 8221 case ISD::SDIVREM: 8222 case ISD::UDIVREM: 8223 Res = LowerDivRem(SDValue(N, 0), DAG); 8224 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 8225 Results.push_back(Res.getValue(0)); 8226 Results.push_back(Res.getValue(1)); 8227 return; 8228 case ISD::READCYCLECOUNTER: 8229 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 8230 return; 8231 case ISD::UDIV: 8232 case ISD::SDIV: 8233 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 8234 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 8235 Results); 8236 case ISD::ATOMIC_CMP_SWAP: 8237 ReplaceCMP_SWAP_64Results(N, Results, DAG); 8238 return; 8239 case ISD::INTRINSIC_WO_CHAIN: 8240 return ReplaceLongIntrinsic(N, Results, DAG); 8241 } 8242 if (Res.getNode()) 8243 Results.push_back(Res); 8244 } 8245 8246 //===----------------------------------------------------------------------===// 8247 // ARM Scheduler Hooks 8248 //===----------------------------------------------------------------------===// 8249 8250 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 8251 /// registers the function context. 8252 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 8253 MachineBasicBlock *MBB, 8254 MachineBasicBlock *DispatchBB, 8255 int FI) const { 8256 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 8257 "ROPI/RWPI not currently supported with SjLj"); 8258 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8259 DebugLoc dl = MI.getDebugLoc(); 8260 MachineFunction *MF = MBB->getParent(); 8261 MachineRegisterInfo *MRI = &MF->getRegInfo(); 8262 MachineConstantPool *MCP = MF->getConstantPool(); 8263 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 8264 const Function &F = MF->getFunction(); 8265 8266 bool isThumb = Subtarget->isThumb(); 8267 bool isThumb2 = Subtarget->isThumb2(); 8268 8269 unsigned PCLabelId = AFI->createPICLabelUId(); 8270 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 8271 ARMConstantPoolValue *CPV = 8272 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 8273 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 8274 8275 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 8276 : &ARM::GPRRegClass; 8277 8278 // Grab constant pool and fixed stack memory operands. 8279 MachineMemOperand *CPMMO = 8280 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 8281 MachineMemOperand::MOLoad, 4, 4); 8282 8283 MachineMemOperand *FIMMOSt = 8284 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 8285 MachineMemOperand::MOStore, 4, 4); 8286 8287 // Load the address of the dispatch MBB into the jump buffer. 
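  // What gets stored (a sketch; the exact instructions per mode follow
  // below): the address of DispatchBB, formed PC-relatively from the
  // constant-pool entry created above, is written to the jump buffer's
  // resume-PC slot at byte offset 36 of the function context (the
  // "&jbuf[1] :: pc" stores). In the Thumb variants the low bit of the
  // address is set first so the eventual indirect branch stays in Thumb
  // state.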
8288 if (isThumb2) { 8289 // Incoming value: jbuf 8290 // ldr.n r5, LCPI1_1 8291 // orr r5, r5, #1 8292 // add r5, pc 8293 // str r5, [$jbuf, #+4] ; &jbuf[1] 8294 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8295 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 8296 .addConstantPoolIndex(CPI) 8297 .addMemOperand(CPMMO) 8298 .add(predOps(ARMCC::AL)); 8299 // Set the low bit because of thumb mode. 8300 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8301 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 8302 .addReg(NewVReg1, RegState::Kill) 8303 .addImm(0x01) 8304 .add(predOps(ARMCC::AL)) 8305 .add(condCodeOp()); 8306 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8307 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 8308 .addReg(NewVReg2, RegState::Kill) 8309 .addImm(PCLabelId); 8310 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 8311 .addReg(NewVReg3, RegState::Kill) 8312 .addFrameIndex(FI) 8313 .addImm(36) // &jbuf[1] :: pc 8314 .addMemOperand(FIMMOSt) 8315 .add(predOps(ARMCC::AL)); 8316 } else if (isThumb) { 8317 // Incoming value: jbuf 8318 // ldr.n r1, LCPI1_4 8319 // add r1, pc 8320 // mov r2, #1 8321 // orrs r1, r2 8322 // add r2, $jbuf, #+4 ; &jbuf[1] 8323 // str r1, [r2] 8324 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8325 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 8326 .addConstantPoolIndex(CPI) 8327 .addMemOperand(CPMMO) 8328 .add(predOps(ARMCC::AL)); 8329 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8330 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 8331 .addReg(NewVReg1, RegState::Kill) 8332 .addImm(PCLabelId); 8333 // Set the low bit because of thumb mode. 8334 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8335 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 8336 .addReg(ARM::CPSR, RegState::Define) 8337 .addImm(1) 8338 .add(predOps(ARMCC::AL)); 8339 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8340 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 8341 .addReg(ARM::CPSR, RegState::Define) 8342 .addReg(NewVReg2, RegState::Kill) 8343 .addReg(NewVReg3, RegState::Kill) 8344 .add(predOps(ARMCC::AL)); 8345 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8346 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 8347 .addFrameIndex(FI) 8348 .addImm(36); // &jbuf[1] :: pc 8349 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 8350 .addReg(NewVReg4, RegState::Kill) 8351 .addReg(NewVReg5, RegState::Kill) 8352 .addImm(0) 8353 .addMemOperand(FIMMOSt) 8354 .add(predOps(ARMCC::AL)); 8355 } else { 8356 // Incoming value: jbuf 8357 // ldr r1, LCPI1_1 8358 // add r1, pc, r1 8359 // str r1, [$jbuf, #+4] ; &jbuf[1] 8360 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8361 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 8362 .addConstantPoolIndex(CPI) 8363 .addImm(0) 8364 .addMemOperand(CPMMO) 8365 .add(predOps(ARMCC::AL)); 8366 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8367 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 8368 .addReg(NewVReg1, RegState::Kill) 8369 .addImm(PCLabelId) 8370 .add(predOps(ARMCC::AL)); 8371 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 8372 .addReg(NewVReg2, RegState::Kill) 8373 .addFrameIndex(FI) 8374 .addImm(36) // &jbuf[1] :: pc 8375 .addMemOperand(FIMMOSt) 8376 .add(predOps(ARMCC::AL)); 8377 } 8378 } 8379 8380 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 8381 MachineBasicBlock *MBB) const { 8382 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8383 DebugLoc dl = MI.getDebugLoc(); 8384 MachineFunction 
*MF = MBB->getParent(); 8385 MachineRegisterInfo *MRI = &MF->getRegInfo(); 8386 MachineFrameInfo &MFI = MF->getFrameInfo(); 8387 int FI = MFI.getFunctionContextIndex(); 8388 8389 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 8390 : &ARM::GPRnopcRegClass; 8391 8392 // Get a mapping of the call site numbers to all of the landing pads they're 8393 // associated with. 8394 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 8395 unsigned MaxCSNum = 0; 8396 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 8397 ++BB) { 8398 if (!BB->isEHPad()) continue; 8399 8400 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 8401 // pad. 8402 for (MachineBasicBlock::iterator 8403 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 8404 if (!II->isEHLabel()) continue; 8405 8406 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 8407 if (!MF->hasCallSiteLandingPad(Sym)) continue; 8408 8409 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 8410 for (SmallVectorImpl<unsigned>::iterator 8411 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 8412 CSI != CSE; ++CSI) { 8413 CallSiteNumToLPad[*CSI].push_back(&*BB); 8414 MaxCSNum = std::max(MaxCSNum, *CSI); 8415 } 8416 break; 8417 } 8418 } 8419 8420 // Get an ordered list of the machine basic blocks for the jump table. 8421 std::vector<MachineBasicBlock*> LPadList; 8422 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 8423 LPadList.reserve(CallSiteNumToLPad.size()); 8424 for (unsigned I = 1; I <= MaxCSNum; ++I) { 8425 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 8426 for (SmallVectorImpl<MachineBasicBlock*>::iterator 8427 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 8428 LPadList.push_back(*II); 8429 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 8430 } 8431 } 8432 8433 assert(!LPadList.empty() && 8434 "No landing pad destinations for the dispatch jump table!"); 8435 8436 // Create the jump table and associated information. 8437 MachineJumpTableInfo *JTI = 8438 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 8439 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 8440 8441 // Create the MBBs for the dispatch code. 8442 8443 // Shove the dispatch's address into the return slot in the function context. 8444 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 8445 DispatchBB->setIsEHPad(); 8446 8447 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 8448 unsigned trap_opcode; 8449 if (Subtarget->isThumb()) 8450 trap_opcode = ARM::tTRAP; 8451 else 8452 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 8453 8454 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 8455 DispatchBB->addSuccessor(TrapBB); 8456 8457 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 8458 DispatchBB->addSuccessor(DispContBB); 8459 8460 // Insert and MBBs. 8461 MF->insert(MF->end(), DispatchBB); 8462 MF->insert(MF->end(), DispContBB); 8463 MF->insert(MF->end(), TrapBB); 8464 8465 // Insert code into the entry block that creates and registers the function 8466 // context. 
8467 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 8468 8469 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 8470 MachinePointerInfo::getFixedStack(*MF, FI), 8471 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 8472 8473 MachineInstrBuilder MIB; 8474 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 8475 8476 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 8477 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 8478 8479 // Add a register mask with no preserved registers. This results in all 8480 // registers being marked as clobbered. This can't work if the dispatch block 8481 // is in a Thumb1 function and is linked with ARM code which uses the FP 8482 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 8483 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 8484 8485 bool IsPositionIndependent = isPositionIndependent(); 8486 unsigned NumLPads = LPadList.size(); 8487 if (Subtarget->isThumb2()) { 8488 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8489 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 8490 .addFrameIndex(FI) 8491 .addImm(4) 8492 .addMemOperand(FIMMOLd) 8493 .add(predOps(ARMCC::AL)); 8494 8495 if (NumLPads < 256) { 8496 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 8497 .addReg(NewVReg1) 8498 .addImm(LPadList.size()) 8499 .add(predOps(ARMCC::AL)); 8500 } else { 8501 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8502 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 8503 .addImm(NumLPads & 0xFFFF) 8504 .add(predOps(ARMCC::AL)); 8505 8506 unsigned VReg2 = VReg1; 8507 if ((NumLPads & 0xFFFF0000) != 0) { 8508 VReg2 = MRI->createVirtualRegister(TRC); 8509 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 8510 .addReg(VReg1) 8511 .addImm(NumLPads >> 16) 8512 .add(predOps(ARMCC::AL)); 8513 } 8514 8515 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 8516 .addReg(NewVReg1) 8517 .addReg(VReg2) 8518 .add(predOps(ARMCC::AL)); 8519 } 8520 8521 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 8522 .addMBB(TrapBB) 8523 .addImm(ARMCC::HI) 8524 .addReg(ARM::CPSR); 8525 8526 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8527 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 8528 .addJumpTableIndex(MJTI) 8529 .add(predOps(ARMCC::AL)); 8530 8531 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8532 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 8533 .addReg(NewVReg3, RegState::Kill) 8534 .addReg(NewVReg1) 8535 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 8536 .add(predOps(ARMCC::AL)) 8537 .add(condCodeOp()); 8538 8539 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 8540 .addReg(NewVReg4, RegState::Kill) 8541 .addReg(NewVReg1) 8542 .addJumpTableIndex(MJTI); 8543 } else if (Subtarget->isThumb()) { 8544 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8545 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 8546 .addFrameIndex(FI) 8547 .addImm(1) 8548 .addMemOperand(FIMMOLd) 8549 .add(predOps(ARMCC::AL)); 8550 8551 if (NumLPads < 256) { 8552 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 8553 .addReg(NewVReg1) 8554 .addImm(NumLPads) 8555 .add(predOps(ARMCC::AL)); 8556 } else { 8557 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8558 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 8559 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 8560 8561 // MachineConstantPool wants an explicit alignment. 
8562 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8563 if (Align == 0) 8564 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8565 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8566 8567 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8568 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 8569 .addReg(VReg1, RegState::Define) 8570 .addConstantPoolIndex(Idx) 8571 .add(predOps(ARMCC::AL)); 8572 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 8573 .addReg(NewVReg1) 8574 .addReg(VReg1) 8575 .add(predOps(ARMCC::AL)); 8576 } 8577 8578 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 8579 .addMBB(TrapBB) 8580 .addImm(ARMCC::HI) 8581 .addReg(ARM::CPSR); 8582 8583 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8584 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 8585 .addReg(ARM::CPSR, RegState::Define) 8586 .addReg(NewVReg1) 8587 .addImm(2) 8588 .add(predOps(ARMCC::AL)); 8589 8590 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8591 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 8592 .addJumpTableIndex(MJTI) 8593 .add(predOps(ARMCC::AL)); 8594 8595 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8596 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 8597 .addReg(ARM::CPSR, RegState::Define) 8598 .addReg(NewVReg2, RegState::Kill) 8599 .addReg(NewVReg3) 8600 .add(predOps(ARMCC::AL)); 8601 8602 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8603 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8604 8605 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8606 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 8607 .addReg(NewVReg4, RegState::Kill) 8608 .addImm(0) 8609 .addMemOperand(JTMMOLd) 8610 .add(predOps(ARMCC::AL)); 8611 8612 unsigned NewVReg6 = NewVReg5; 8613 if (IsPositionIndependent) { 8614 NewVReg6 = MRI->createVirtualRegister(TRC); 8615 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 8616 .addReg(ARM::CPSR, RegState::Define) 8617 .addReg(NewVReg5, RegState::Kill) 8618 .addReg(NewVReg3) 8619 .add(predOps(ARMCC::AL)); 8620 } 8621 8622 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 8623 .addReg(NewVReg6, RegState::Kill) 8624 .addJumpTableIndex(MJTI); 8625 } else { 8626 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8627 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 8628 .addFrameIndex(FI) 8629 .addImm(4) 8630 .addMemOperand(FIMMOLd) 8631 .add(predOps(ARMCC::AL)); 8632 8633 if (NumLPads < 256) { 8634 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 8635 .addReg(NewVReg1) 8636 .addImm(NumLPads) 8637 .add(predOps(ARMCC::AL)); 8638 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 8639 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8640 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 8641 .addImm(NumLPads & 0xFFFF) 8642 .add(predOps(ARMCC::AL)); 8643 8644 unsigned VReg2 = VReg1; 8645 if ((NumLPads & 0xFFFF0000) != 0) { 8646 VReg2 = MRI->createVirtualRegister(TRC); 8647 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 8648 .addReg(VReg1) 8649 .addImm(NumLPads >> 16) 8650 .add(predOps(ARMCC::AL)); 8651 } 8652 8653 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8654 .addReg(NewVReg1) 8655 .addReg(VReg2) 8656 .add(predOps(ARMCC::AL)); 8657 } else { 8658 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8659 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 8660 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 8661 8662 // MachineConstantPool wants an explicit alignment. 
8663 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8664 if (Align == 0) 8665 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8666 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8667 8668 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8669 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 8670 .addReg(VReg1, RegState::Define) 8671 .addConstantPoolIndex(Idx) 8672 .addImm(0) 8673 .add(predOps(ARMCC::AL)); 8674 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8675 .addReg(NewVReg1) 8676 .addReg(VReg1, RegState::Kill) 8677 .add(predOps(ARMCC::AL)); 8678 } 8679 8680 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 8681 .addMBB(TrapBB) 8682 .addImm(ARMCC::HI) 8683 .addReg(ARM::CPSR); 8684 8685 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8686 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 8687 .addReg(NewVReg1) 8688 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 8689 .add(predOps(ARMCC::AL)) 8690 .add(condCodeOp()); 8691 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8692 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 8693 .addJumpTableIndex(MJTI) 8694 .add(predOps(ARMCC::AL)); 8695 8696 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8697 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8698 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8699 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 8700 .addReg(NewVReg3, RegState::Kill) 8701 .addReg(NewVReg4) 8702 .addImm(0) 8703 .addMemOperand(JTMMOLd) 8704 .add(predOps(ARMCC::AL)); 8705 8706 if (IsPositionIndependent) { 8707 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 8708 .addReg(NewVReg5, RegState::Kill) 8709 .addReg(NewVReg4) 8710 .addJumpTableIndex(MJTI); 8711 } else { 8712 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 8713 .addReg(NewVReg5, RegState::Kill) 8714 .addJumpTableIndex(MJTI); 8715 } 8716 } 8717 8718 // Add the jump table entries as successors to the MBB. 8719 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 8720 for (std::vector<MachineBasicBlock*>::iterator 8721 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 8722 MachineBasicBlock *CurMBB = *I; 8723 if (SeenMBBs.insert(CurMBB).second) 8724 DispContBB->addSuccessor(CurMBB); 8725 } 8726 8727 // N.B. the order the invoke BBs are processed in doesn't matter here. 8728 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 8729 SmallVector<MachineBasicBlock*, 64> MBBLPads; 8730 for (MachineBasicBlock *BB : InvokeBBs) { 8731 8732 // Remove the landing pad successor from the invoke block and replace it 8733 // with the new dispatch block. 8734 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 8735 BB->succ_end()); 8736 while (!Successors.empty()) { 8737 MachineBasicBlock *SMBB = Successors.pop_back_val(); 8738 if (SMBB->isEHPad()) { 8739 BB->removeSuccessor(SMBB); 8740 MBBLPads.push_back(SMBB); 8741 } 8742 } 8743 8744 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 8745 BB->normalizeSuccProbs(); 8746 8747 // Find the invoke call and mark all of the callee-saved registers as 8748 // 'implicit defined' so that they're spilled. This prevents code from 8749 // moving instructions to before the EH block, where they will never be 8750 // executed. 
8751 for (MachineBasicBlock::reverse_iterator 8752 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 8753 if (!II->isCall()) continue; 8754 8755 DenseMap<unsigned, bool> DefRegs; 8756 for (MachineInstr::mop_iterator 8757 OI = II->operands_begin(), OE = II->operands_end(); 8758 OI != OE; ++OI) { 8759 if (!OI->isReg()) continue; 8760 DefRegs[OI->getReg()] = true; 8761 } 8762 8763 MachineInstrBuilder MIB(*MF, &*II); 8764 8765 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 8766 unsigned Reg = SavedRegs[i]; 8767 if (Subtarget->isThumb2() && 8768 !ARM::tGPRRegClass.contains(Reg) && 8769 !ARM::hGPRRegClass.contains(Reg)) 8770 continue; 8771 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 8772 continue; 8773 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 8774 continue; 8775 if (!DefRegs[Reg]) 8776 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 8777 } 8778 8779 break; 8780 } 8781 } 8782 8783 // Mark all former landing pads as non-landing pads. The dispatch is the only 8784 // landing pad now. 8785 for (SmallVectorImpl<MachineBasicBlock*>::iterator 8786 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 8787 (*I)->setIsEHPad(false); 8788 8789 // The instruction is gone now. 8790 MI.eraseFromParent(); 8791 } 8792 8793 static 8794 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 8795 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 8796 E = MBB->succ_end(); I != E; ++I) 8797 if (*I != Succ) 8798 return *I; 8799 llvm_unreachable("Expecting a BB with two successors!"); 8800 } 8801 8802 /// Return the load opcode for a given load size. If load size >= 8, 8803 /// neon opcode will be returned. 8804 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 8805 if (LdSize >= 8) 8806 return LdSize == 16 ? ARM::VLD1q32wb_fixed 8807 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 8808 if (IsThumb1) 8809 return LdSize == 4 ? ARM::tLDRi 8810 : LdSize == 2 ? ARM::tLDRHi 8811 : LdSize == 1 ? ARM::tLDRBi : 0; 8812 if (IsThumb2) 8813 return LdSize == 4 ? ARM::t2LDR_POST 8814 : LdSize == 2 ? ARM::t2LDRH_POST 8815 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 8816 return LdSize == 4 ? ARM::LDR_POST_IMM 8817 : LdSize == 2 ? ARM::LDRH_POST 8818 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 8819 } 8820 8821 /// Return the store opcode for a given store size. If store size >= 8, 8822 /// neon opcode will be returned. 8823 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 8824 if (StSize >= 8) 8825 return StSize == 16 ? ARM::VST1q32wb_fixed 8826 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 8827 if (IsThumb1) 8828 return StSize == 4 ? ARM::tSTRi 8829 : StSize == 2 ? ARM::tSTRHi 8830 : StSize == 1 ? ARM::tSTRBi : 0; 8831 if (IsThumb2) 8832 return StSize == 4 ? ARM::t2STR_POST 8833 : StSize == 2 ? ARM::t2STRH_POST 8834 : StSize == 1 ? ARM::t2STRB_POST : 0; 8835 return StSize == 4 ? ARM::STR_POST_IMM 8836 : StSize == 2 ? ARM::STRH_POST 8837 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 8838 } 8839 8840 /// Emit a post-increment load operation with given size. The instructions 8841 /// will be added to BB at Pos. 
8842 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8843 const TargetInstrInfo *TII, const DebugLoc &dl, 8844 unsigned LdSize, unsigned Data, unsigned AddrIn, 8845 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8846 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 8847 assert(LdOpc != 0 && "Should have a load opcode"); 8848 if (LdSize >= 8) { 8849 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8850 .addReg(AddrOut, RegState::Define) 8851 .addReg(AddrIn) 8852 .addImm(0) 8853 .add(predOps(ARMCC::AL)); 8854 } else if (IsThumb1) { 8855 // load + update AddrIn 8856 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8857 .addReg(AddrIn) 8858 .addImm(0) 8859 .add(predOps(ARMCC::AL)); 8860 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8861 .add(t1CondCodeOp()) 8862 .addReg(AddrIn) 8863 .addImm(LdSize) 8864 .add(predOps(ARMCC::AL)); 8865 } else if (IsThumb2) { 8866 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8867 .addReg(AddrOut, RegState::Define) 8868 .addReg(AddrIn) 8869 .addImm(LdSize) 8870 .add(predOps(ARMCC::AL)); 8871 } else { // arm 8872 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8873 .addReg(AddrOut, RegState::Define) 8874 .addReg(AddrIn) 8875 .addReg(0) 8876 .addImm(LdSize) 8877 .add(predOps(ARMCC::AL)); 8878 } 8879 } 8880 8881 /// Emit a post-increment store operation with given size. The instructions 8882 /// will be added to BB at Pos. 8883 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8884 const TargetInstrInfo *TII, const DebugLoc &dl, 8885 unsigned StSize, unsigned Data, unsigned AddrIn, 8886 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8887 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 8888 assert(StOpc != 0 && "Should have a store opcode"); 8889 if (StSize >= 8) { 8890 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8891 .addReg(AddrIn) 8892 .addImm(0) 8893 .addReg(Data) 8894 .add(predOps(ARMCC::AL)); 8895 } else if (IsThumb1) { 8896 // store + update AddrIn 8897 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 8898 .addReg(Data) 8899 .addReg(AddrIn) 8900 .addImm(0) 8901 .add(predOps(ARMCC::AL)); 8902 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8903 .add(t1CondCodeOp()) 8904 .addReg(AddrIn) 8905 .addImm(StSize) 8906 .add(predOps(ARMCC::AL)); 8907 } else if (IsThumb2) { 8908 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8909 .addReg(Data) 8910 .addReg(AddrIn) 8911 .addImm(StSize) 8912 .add(predOps(ARMCC::AL)); 8913 } else { // arm 8914 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8915 .addReg(Data) 8916 .addReg(AddrIn) 8917 .addReg(0) 8918 .addImm(StSize) 8919 .add(predOps(ARMCC::AL)); 8920 } 8921 } 8922 8923 MachineBasicBlock * 8924 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 8925 MachineBasicBlock *BB) const { 8926 // This pseudo instruction has 3 operands: dst, src, size 8927 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 8928 // Otherwise, we will generate unrolled scalar copies. 
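  // Illustrative example only (the actual unit size is chosen from the
  // alignment and NEON availability below): a 10-byte copy with 4-byte
  // alignment that is under the inline threshold becomes two word-sized
  // post-increment LDR/STR pairs followed by two LDRB/STRB pairs for the
  // leftover bytes, all emitted via the emitPostLd/emitPostSt helpers above.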
8929 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8930 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8931 MachineFunction::iterator It = ++BB->getIterator(); 8932 8933 unsigned dest = MI.getOperand(0).getReg(); 8934 unsigned src = MI.getOperand(1).getReg(); 8935 unsigned SizeVal = MI.getOperand(2).getImm(); 8936 unsigned Align = MI.getOperand(3).getImm(); 8937 DebugLoc dl = MI.getDebugLoc(); 8938 8939 MachineFunction *MF = BB->getParent(); 8940 MachineRegisterInfo &MRI = MF->getRegInfo(); 8941 unsigned UnitSize = 0; 8942 const TargetRegisterClass *TRC = nullptr; 8943 const TargetRegisterClass *VecTRC = nullptr; 8944 8945 bool IsThumb1 = Subtarget->isThumb1Only(); 8946 bool IsThumb2 = Subtarget->isThumb2(); 8947 bool IsThumb = Subtarget->isThumb(); 8948 8949 if (Align & 1) { 8950 UnitSize = 1; 8951 } else if (Align & 2) { 8952 UnitSize = 2; 8953 } else { 8954 // Check whether we can use NEON instructions. 8955 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 8956 Subtarget->hasNEON()) { 8957 if ((Align % 16 == 0) && SizeVal >= 16) 8958 UnitSize = 16; 8959 else if ((Align % 8 == 0) && SizeVal >= 8) 8960 UnitSize = 8; 8961 } 8962 // Can't use NEON instructions. 8963 if (UnitSize == 0) 8964 UnitSize = 4; 8965 } 8966 8967 // Select the correct opcode and register class for unit size load/store 8968 bool IsNeon = UnitSize >= 8; 8969 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 8970 if (IsNeon) 8971 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 8972 : UnitSize == 8 ? &ARM::DPRRegClass 8973 : nullptr; 8974 8975 unsigned BytesLeft = SizeVal % UnitSize; 8976 unsigned LoopSize = SizeVal - BytesLeft; 8977 8978 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 8979 // Use LDR and STR to copy. 8980 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 8981 // [destOut] = STR_POST(scratch, destIn, UnitSize) 8982 unsigned srcIn = src; 8983 unsigned destIn = dest; 8984 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 8985 unsigned srcOut = MRI.createVirtualRegister(TRC); 8986 unsigned destOut = MRI.createVirtualRegister(TRC); 8987 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 8988 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 8989 IsThumb1, IsThumb2); 8990 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 8991 IsThumb1, IsThumb2); 8992 srcIn = srcOut; 8993 destIn = destOut; 8994 } 8995 8996 // Handle the leftover bytes with LDRB and STRB. 8997 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 8998 // [destOut] = STRB_POST(scratch, destIn, 1) 8999 for (unsigned i = 0; i < BytesLeft; i++) { 9000 unsigned srcOut = MRI.createVirtualRegister(TRC); 9001 unsigned destOut = MRI.createVirtualRegister(TRC); 9002 unsigned scratch = MRI.createVirtualRegister(TRC); 9003 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 9004 IsThumb1, IsThumb2); 9005 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 9006 IsThumb1, IsThumb2); 9007 srcIn = srcOut; 9008 destIn = destOut; 9009 } 9010 MI.eraseFromParent(); // The instruction is gone now. 9011 return BB; 9012 } 9013 9014 // Expand the pseudo op to a loop. 9015 // thisMBB: 9016 // ... 
9017 // movw varEnd, # --> with thumb2 9018 // movt varEnd, # 9019 // ldrcp varEnd, idx --> without thumb2 9020 // fallthrough --> loopMBB 9021 // loopMBB: 9022 // PHI varPhi, varEnd, varLoop 9023 // PHI srcPhi, src, srcLoop 9024 // PHI destPhi, dst, destLoop 9025 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 9026 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 9027 // subs varLoop, varPhi, #UnitSize 9028 // bne loopMBB 9029 // fallthrough --> exitMBB 9030 // exitMBB: 9031 // epilogue to handle left-over bytes 9032 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 9033 // [destOut] = STRB_POST(scratch, destLoop, 1) 9034 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9035 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9036 MF->insert(It, loopMBB); 9037 MF->insert(It, exitMBB); 9038 9039 // Transfer the remainder of BB and its successor edges to exitMBB. 9040 exitMBB->splice(exitMBB->begin(), BB, 9041 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9042 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9043 9044 // Load an immediate to varEnd. 9045 unsigned varEnd = MRI.createVirtualRegister(TRC); 9046 if (Subtarget->useMovt(*MF)) { 9047 unsigned Vtmp = varEnd; 9048 if ((LoopSize & 0xFFFF0000) != 0) 9049 Vtmp = MRI.createVirtualRegister(TRC); 9050 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 9051 .addImm(LoopSize & 0xFFFF) 9052 .add(predOps(ARMCC::AL)); 9053 9054 if ((LoopSize & 0xFFFF0000) != 0) 9055 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 9056 .addReg(Vtmp) 9057 .addImm(LoopSize >> 16) 9058 .add(predOps(ARMCC::AL)); 9059 } else { 9060 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9061 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9062 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 9063 9064 // MachineConstantPool wants an explicit alignment. 
9065 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9066 if (Align == 0) 9067 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9068 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9069 9070 if (IsThumb) 9071 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 9072 .addReg(varEnd, RegState::Define) 9073 .addConstantPoolIndex(Idx) 9074 .add(predOps(ARMCC::AL)); 9075 else 9076 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 9077 .addReg(varEnd, RegState::Define) 9078 .addConstantPoolIndex(Idx) 9079 .addImm(0) 9080 .add(predOps(ARMCC::AL)); 9081 } 9082 BB->addSuccessor(loopMBB); 9083 9084 // Generate the loop body: 9085 // varPhi = PHI(varLoop, varEnd) 9086 // srcPhi = PHI(srcLoop, src) 9087 // destPhi = PHI(destLoop, dst) 9088 MachineBasicBlock *entryBB = BB; 9089 BB = loopMBB; 9090 unsigned varLoop = MRI.createVirtualRegister(TRC); 9091 unsigned varPhi = MRI.createVirtualRegister(TRC); 9092 unsigned srcLoop = MRI.createVirtualRegister(TRC); 9093 unsigned srcPhi = MRI.createVirtualRegister(TRC); 9094 unsigned destLoop = MRI.createVirtualRegister(TRC); 9095 unsigned destPhi = MRI.createVirtualRegister(TRC); 9096 9097 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 9098 .addReg(varLoop).addMBB(loopMBB) 9099 .addReg(varEnd).addMBB(entryBB); 9100 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 9101 .addReg(srcLoop).addMBB(loopMBB) 9102 .addReg(src).addMBB(entryBB); 9103 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 9104 .addReg(destLoop).addMBB(loopMBB) 9105 .addReg(dest).addMBB(entryBB); 9106 9107 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 9108 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 9109 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 9110 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 9111 IsThumb1, IsThumb2); 9112 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 9113 IsThumb1, IsThumb2); 9114 9115 // Decrement loop variable by UnitSize. 9116 if (IsThumb1) { 9117 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 9118 .add(t1CondCodeOp()) 9119 .addReg(varPhi) 9120 .addImm(UnitSize) 9121 .add(predOps(ARMCC::AL)); 9122 } else { 9123 MachineInstrBuilder MIB = 9124 BuildMI(*BB, BB->end(), dl, 9125 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 9126 MIB.addReg(varPhi) 9127 .addImm(UnitSize) 9128 .add(predOps(ARMCC::AL)) 9129 .add(condCodeOp()); 9130 MIB->getOperand(5).setReg(ARM::CPSR); 9131 MIB->getOperand(5).setIsDef(true); 9132 } 9133 BuildMI(*BB, BB->end(), dl, 9134 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 9135 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 9136 9137 // loopMBB can loop back to loopMBB or fall through to exitMBB. 9138 BB->addSuccessor(loopMBB); 9139 BB->addSuccessor(exitMBB); 9140 9141 // Add epilogue to handle BytesLeft. 
9142   BB = exitMBB;
9143   auto StartOfExit = exitMBB->begin();
9144
9145   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
9146   //   [destOut] = STRB_POST(scratch, destLoop, 1)
9147   unsigned srcIn = srcLoop;
9148   unsigned destIn = destLoop;
9149   for (unsigned i = 0; i < BytesLeft; i++) {
9150     unsigned srcOut = MRI.createVirtualRegister(TRC);
9151     unsigned destOut = MRI.createVirtualRegister(TRC);
9152     unsigned scratch = MRI.createVirtualRegister(TRC);
9153     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
9154                IsThumb1, IsThumb2);
9155     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
9156                IsThumb1, IsThumb2);
9157     srcIn = srcOut;
9158     destIn = destOut;
9159   }
9160
9161   MI.eraseFromParent();   // The instruction is gone now.
9162   return BB;
9163 }
9164
9165 MachineBasicBlock *
9166 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
9167                                        MachineBasicBlock *MBB) const {
9168   const TargetMachine &TM = getTargetMachine();
9169   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
9170   DebugLoc DL = MI.getDebugLoc();
9171
9172   assert(Subtarget->isTargetWindows() &&
9173          "__chkstk is only supported on Windows");
9174   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
9175
9176   // __chkstk takes the number of words to allocate on the stack in R4, and
9177   // returns the stack adjustment in number of bytes in R4. This will not
9178   // clobber any other registers (other than the obvious lr).
9179   //
9180   // Although, technically, IP should be considered a register which may be
9181   // clobbered, the call itself will not touch it. Windows on ARM is a pure
9182   // thumb-2 environment, so there is no interworking required. As a result, we
9183   // do not expect a veneer to be emitted by the linker, clobbering IP.
9184   //
9185   // Each module receives its own copy of __chkstk, so no import thunk is
9186   // required, again, ensuring that IP is not clobbered.
9187   //
9188   // Finally, although some linkers may theoretically provide a trampoline for
9189   // out-of-range calls (which is quite common due to a 32M range limitation of
9190   // branches for Thumb), we can generate the long-call version via
9191   // -mcmodel=large, alleviating the need for the trampoline which may clobber
9192   // IP.
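  // A minimal sketch of the sequence emitted below (small/medium/kernel code
  // models); the mnemonics are illustrative, the authoritative encoding is the
  // BuildMI calls that follow:
  //   bl    __chkstk       ; R4 = words to allocate on entry, bytes on exit
  //   sub.w sp, sp, r4
  // For CodeModel::Large the call goes through a register instead:
  //   t2MOVi32imm rN, __chkstk
  //   blx   rN
  //   sub.w sp, sp, r4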
9193 9194 switch (TM.getCodeModel()) { 9195 case CodeModel::Small: 9196 case CodeModel::Medium: 9197 case CodeModel::Kernel: 9198 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 9199 .add(predOps(ARMCC::AL)) 9200 .addExternalSymbol("__chkstk") 9201 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 9202 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 9203 .addReg(ARM::R12, 9204 RegState::Implicit | RegState::Define | RegState::Dead) 9205 .addReg(ARM::CPSR, 9206 RegState::Implicit | RegState::Define | RegState::Dead); 9207 break; 9208 case CodeModel::Large: { 9209 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9210 unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 9211 9212 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 9213 .addExternalSymbol("__chkstk"); 9214 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 9215 .add(predOps(ARMCC::AL)) 9216 .addReg(Reg, RegState::Kill) 9217 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 9218 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 9219 .addReg(ARM::R12, 9220 RegState::Implicit | RegState::Define | RegState::Dead) 9221 .addReg(ARM::CPSR, 9222 RegState::Implicit | RegState::Define | RegState::Dead); 9223 break; 9224 } 9225 } 9226 9227 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 9228 .addReg(ARM::SP, RegState::Kill) 9229 .addReg(ARM::R4, RegState::Kill) 9230 .setMIFlags(MachineInstr::FrameSetup) 9231 .add(predOps(ARMCC::AL)) 9232 .add(condCodeOp()); 9233 9234 MI.eraseFromParent(); 9235 return MBB; 9236 } 9237 9238 MachineBasicBlock * 9239 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 9240 MachineBasicBlock *MBB) const { 9241 DebugLoc DL = MI.getDebugLoc(); 9242 MachineFunction *MF = MBB->getParent(); 9243 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9244 9245 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 9246 MF->insert(++MBB->getIterator(), ContBB); 9247 ContBB->splice(ContBB->begin(), MBB, 9248 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 9249 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 9250 MBB->addSuccessor(ContBB); 9251 9252 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9253 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 9254 MF->push_back(TrapBB); 9255 MBB->addSuccessor(TrapBB); 9256 9257 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 9258 .addReg(MI.getOperand(0).getReg()) 9259 .addImm(0) 9260 .add(predOps(ARMCC::AL)); 9261 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 9262 .addMBB(TrapBB) 9263 .addImm(ARMCC::EQ) 9264 .addReg(ARM::CPSR); 9265 9266 MI.eraseFromParent(); 9267 return ContBB; 9268 } 9269 9270 MachineBasicBlock * 9271 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 9272 MachineBasicBlock *BB) const { 9273 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9274 DebugLoc dl = MI.getDebugLoc(); 9275 bool isThumb2 = Subtarget->isThumb2(); 9276 switch (MI.getOpcode()) { 9277 default: { 9278 MI.print(errs()); 9279 llvm_unreachable("Unexpected instr type to insert"); 9280 } 9281 9282 // Thumb1 post-indexed loads are really just single-register LDMs. 
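  // For example, "ldr r0, [r1], #4" and "ldmia r1!, {r0}" load the same word
  // and leave r1 with the same post-incremented address, so the pseudo below
  // is simply rewritten as a tLDMIA_UPD.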
9283 case ARM::tLDR_postidx: { 9284 MachineOperand Def(MI.getOperand(1)); 9285 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 9286 .add(Def) // Rn_wb 9287 .add(MI.getOperand(2)) // Rn 9288 .add(MI.getOperand(3)) // PredImm 9289 .add(MI.getOperand(4)) // PredReg 9290 .add(MI.getOperand(0)); // Rt 9291 MI.eraseFromParent(); 9292 return BB; 9293 } 9294 9295 // The Thumb2 pre-indexed stores have the same MI operands, they just 9296 // define them differently in the .td files from the isel patterns, so 9297 // they need pseudos. 9298 case ARM::t2STR_preidx: 9299 MI.setDesc(TII->get(ARM::t2STR_PRE)); 9300 return BB; 9301 case ARM::t2STRB_preidx: 9302 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 9303 return BB; 9304 case ARM::t2STRH_preidx: 9305 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 9306 return BB; 9307 9308 case ARM::STRi_preidx: 9309 case ARM::STRBi_preidx: { 9310 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 9311 : ARM::STRB_PRE_IMM; 9312 // Decode the offset. 9313 unsigned Offset = MI.getOperand(4).getImm(); 9314 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 9315 Offset = ARM_AM::getAM2Offset(Offset); 9316 if (isSub) 9317 Offset = -Offset; 9318 9319 MachineMemOperand *MMO = *MI.memoperands_begin(); 9320 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 9321 .add(MI.getOperand(0)) // Rn_wb 9322 .add(MI.getOperand(1)) // Rt 9323 .add(MI.getOperand(2)) // Rn 9324 .addImm(Offset) // offset (skip GPR==zero_reg) 9325 .add(MI.getOperand(5)) // pred 9326 .add(MI.getOperand(6)) 9327 .addMemOperand(MMO); 9328 MI.eraseFromParent(); 9329 return BB; 9330 } 9331 case ARM::STRr_preidx: 9332 case ARM::STRBr_preidx: 9333 case ARM::STRH_preidx: { 9334 unsigned NewOpc; 9335 switch (MI.getOpcode()) { 9336 default: llvm_unreachable("unexpected opcode!"); 9337 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 9338 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 9339 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 9340 } 9341 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 9342 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 9343 MIB.add(MI.getOperand(i)); 9344 MI.eraseFromParent(); 9345 return BB; 9346 } 9347 9348 case ARM::tMOVCCr_pseudo: { 9349 // To "insert" a SELECT_CC instruction, we actually have to insert the 9350 // diamond control-flow pattern. The incoming instruction knows the 9351 // destination vreg to set, the condition code register to branch on, the 9352 // true/false values to select between, and a branch opcode to use. 9353 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9354 MachineFunction::iterator It = ++BB->getIterator(); 9355 9356 // thisMBB: 9357 // ... 9358 // TrueVal = ... 9359 // cmpTY ccX, r1, r2 9360 // bCC copy1MBB 9361 // fallthrough --> copy0MBB 9362 MachineBasicBlock *thisMBB = BB; 9363 MachineFunction *F = BB->getParent(); 9364 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9365 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9366 F->insert(It, copy0MBB); 9367 F->insert(It, sinkMBB); 9368 9369 // Transfer the remainder of BB and its successor edges to sinkMBB. 9370 sinkMBB->splice(sinkMBB->begin(), BB, 9371 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9372 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9373 9374 BB->addSuccessor(copy0MBB); 9375 BB->addSuccessor(sinkMBB); 9376 9377 BuildMI(BB, dl, TII->get(ARM::tBcc)) 9378 .addMBB(sinkMBB) 9379 .addImm(MI.getOperand(3).getImm()) 9380 .addReg(MI.getOperand(4).getReg()); 9381 9382 // copy0MBB: 9383 // %FalseValue = ... 
9384 // # fallthrough to sinkMBB 9385 BB = copy0MBB; 9386 9387 // Update machine-CFG edges 9388 BB->addSuccessor(sinkMBB); 9389 9390 // sinkMBB: 9391 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9392 // ... 9393 BB = sinkMBB; 9394 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 9395 .addReg(MI.getOperand(1).getReg()) 9396 .addMBB(copy0MBB) 9397 .addReg(MI.getOperand(2).getReg()) 9398 .addMBB(thisMBB); 9399 9400 MI.eraseFromParent(); // The pseudo instruction is gone now. 9401 return BB; 9402 } 9403 9404 case ARM::BCCi64: 9405 case ARM::BCCZi64: { 9406 // If there is an unconditional branch to the other successor, remove it. 9407 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9408 9409 // Compare both parts that make up the double comparison separately for 9410 // equality. 9411 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 9412 9413 unsigned LHS1 = MI.getOperand(1).getReg(); 9414 unsigned LHS2 = MI.getOperand(2).getReg(); 9415 if (RHSisZero) { 9416 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9417 .addReg(LHS1) 9418 .addImm(0) 9419 .add(predOps(ARMCC::AL)); 9420 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9421 .addReg(LHS2).addImm(0) 9422 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 9423 } else { 9424 unsigned RHS1 = MI.getOperand(3).getReg(); 9425 unsigned RHS2 = MI.getOperand(4).getReg(); 9426 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 9427 .addReg(LHS1) 9428 .addReg(RHS1) 9429 .add(predOps(ARMCC::AL)); 9430 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 9431 .addReg(LHS2).addReg(RHS2) 9432 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 9433 } 9434 9435 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 9436 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 9437 if (MI.getOperand(0).getImm() == ARMCC::NE) 9438 std::swap(destMBB, exitMBB); 9439 9440 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) 9441 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 9442 if (isThumb2) 9443 BuildMI(BB, dl, TII->get(ARM::t2B)) 9444 .addMBB(exitMBB) 9445 .add(predOps(ARMCC::AL)); 9446 else 9447 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 9448 9449 MI.eraseFromParent(); // The pseudo instruction is gone now. 9450 return BB; 9451 } 9452 9453 case ARM::Int_eh_sjlj_setjmp: 9454 case ARM::Int_eh_sjlj_setjmp_nofp: 9455 case ARM::tInt_eh_sjlj_setjmp: 9456 case ARM::t2Int_eh_sjlj_setjmp: 9457 case ARM::t2Int_eh_sjlj_setjmp_nofp: 9458 return BB; 9459 9460 case ARM::Int_eh_sjlj_setup_dispatch: 9461 EmitSjLjDispatchBlock(MI, BB); 9462 return BB; 9463 9464 case ARM::ABS: 9465 case ARM::t2ABS: { 9466 // To insert an ABS instruction, we have to insert the 9467 // diamond control-flow pattern. The incoming instruction knows the 9468 // source vreg to test against 0, the destination vreg to set, 9469 // the condition code register to branch on, the 9470 // true/false values to select between, and a branch opcode to use. 
9471 // It transforms 9472 // V1 = ABS V0 9473 // into 9474 // V2 = MOVS V0 9475 // BCC (branch to SinkBB if V0 >= 0) 9476 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 9477 // SinkBB: V1 = PHI(V2, V3) 9478 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9479 MachineFunction::iterator BBI = ++BB->getIterator(); 9480 MachineFunction *Fn = BB->getParent(); 9481 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 9482 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 9483 Fn->insert(BBI, RSBBB); 9484 Fn->insert(BBI, SinkBB); 9485 9486 unsigned int ABSSrcReg = MI.getOperand(1).getReg(); 9487 unsigned int ABSDstReg = MI.getOperand(0).getReg(); 9488 bool ABSSrcKIll = MI.getOperand(1).isKill(); 9489 bool isThumb2 = Subtarget->isThumb2(); 9490 MachineRegisterInfo &MRI = Fn->getRegInfo(); 9491 // In Thumb mode S must not be specified if source register is the SP or 9492 // PC and if destination register is the SP, so restrict register class 9493 unsigned NewRsbDstReg = 9494 MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 9495 9496 // Transfer the remainder of BB and its successor edges to sinkMBB. 9497 SinkBB->splice(SinkBB->begin(), BB, 9498 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9499 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 9500 9501 BB->addSuccessor(RSBBB); 9502 BB->addSuccessor(SinkBB); 9503 9504 // fall through to SinkMBB 9505 RSBBB->addSuccessor(SinkBB); 9506 9507 // insert a cmp at the end of BB 9508 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9509 .addReg(ABSSrcReg) 9510 .addImm(0) 9511 .add(predOps(ARMCC::AL)); 9512 9513 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 9514 BuildMI(BB, dl, 9515 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 9516 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 9517 9518 // insert rsbri in RSBBB 9519 // Note: BCC and rsbri will be converted into predicated rsbmi 9520 // by if-conversion pass 9521 BuildMI(*RSBBB, RSBBB->begin(), dl, 9522 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 9523 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 9524 .addImm(0) 9525 .add(predOps(ARMCC::AL)) 9526 .add(condCodeOp()); 9527 9528 // insert PHI in SinkBB, 9529 // reuse ABSDstReg to not change uses of ABS instruction 9530 BuildMI(*SinkBB, SinkBB->begin(), dl, 9531 TII->get(ARM::PHI), ABSDstReg) 9532 .addReg(NewRsbDstReg).addMBB(RSBBB) 9533 .addReg(ABSSrcReg).addMBB(BB); 9534 9535 // remove ABS instruction 9536 MI.eraseFromParent(); 9537 9538 // return last added BB 9539 return SinkBB; 9540 } 9541 case ARM::COPY_STRUCT_BYVAL_I32: 9542 ++NumLoopByVals; 9543 return EmitStructByval(MI, BB); 9544 case ARM::WIN__CHKSTK: 9545 return EmitLowered__chkstk(MI, BB); 9546 case ARM::WIN__DBZCHK: 9547 return EmitLowered__dbzchk(MI, BB); 9548 } 9549 } 9550 9551 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers 9552 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 9553 /// instead of as a custom inserter because we need the use list from the SDNode. 9554 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 9555 MachineInstr &MI, const SDNode *Node) { 9556 bool isThumb1 = Subtarget->isThumb1Only(); 9557 9558 DebugLoc DL = MI.getDebugLoc(); 9559 MachineFunction *MF = MI.getParent()->getParent(); 9560 MachineRegisterInfo &MRI = MF->getRegInfo(); 9561 MachineInstrBuilder MIB(*MF, MI); 9562 9563 // If the new dst/src is unused mark it as dead. 
9564 if (!Node->hasAnyUseOfValue(0)) { 9565 MI.getOperand(0).setIsDead(true); 9566 } 9567 if (!Node->hasAnyUseOfValue(1)) { 9568 MI.getOperand(1).setIsDead(true); 9569 } 9570 9571 // The MEMCPY both defines and kills the scratch registers. 9572 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 9573 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 9574 : &ARM::GPRRegClass); 9575 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 9576 } 9577 } 9578 9579 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 9580 SDNode *Node) const { 9581 if (MI.getOpcode() == ARM::MEMCPY) { 9582 attachMEMCPYScratchRegs(Subtarget, MI, Node); 9583 return; 9584 } 9585 9586 const MCInstrDesc *MCID = &MI.getDesc(); 9587 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 9588 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 9589 // operand is still set to noreg. If needed, set the optional operand's 9590 // register to CPSR, and remove the redundant implicit def. 9591 // 9592 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 9593 9594 // Rename pseudo opcodes. 9595 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 9596 unsigned ccOutIdx; 9597 if (NewOpc) { 9598 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 9599 MCID = &TII->get(NewOpc); 9600 9601 assert(MCID->getNumOperands() == 9602 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 9603 && "converted opcode should be the same except for cc_out" 9604 " (and, on Thumb1, pred)"); 9605 9606 MI.setDesc(*MCID); 9607 9608 // Add the optional cc_out operand 9609 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 9610 9611 // On Thumb1, move all input operands to the end, then add the predicate 9612 if (Subtarget->isThumb1Only()) { 9613 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 9614 MI.addOperand(MI.getOperand(1)); 9615 MI.RemoveOperand(1); 9616 } 9617 9618 // Restore the ties 9619 for (unsigned i = MI.getNumOperands(); i--;) { 9620 const MachineOperand& op = MI.getOperand(i); 9621 if (op.isReg() && op.isUse()) { 9622 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 9623 if (DefIdx != -1) 9624 MI.tieOperands(DefIdx, i); 9625 } 9626 } 9627 9628 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 9629 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 9630 ccOutIdx = 1; 9631 } else 9632 ccOutIdx = MCID->getNumOperands() - 1; 9633 } else 9634 ccOutIdx = MCID->getNumOperands() - 1; 9635 9636 // Any ARM instruction that sets the 's' bit should specify an optional 9637 // "cc_out" operand in the last operand position. 9638 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 9639 assert(!NewOpc && "Optional cc_out operand required"); 9640 return; 9641 } 9642 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 9643 // since we already have an optional CPSR def. 
9644 bool definesCPSR = false; 9645 bool deadCPSR = false; 9646 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 9647 ++i) { 9648 const MachineOperand &MO = MI.getOperand(i); 9649 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 9650 definesCPSR = true; 9651 if (MO.isDead()) 9652 deadCPSR = true; 9653 MI.RemoveOperand(i); 9654 break; 9655 } 9656 } 9657 if (!definesCPSR) { 9658 assert(!NewOpc && "Optional cc_out operand required"); 9659 return; 9660 } 9661 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 9662 if (deadCPSR) { 9663 assert(!MI.getOperand(ccOutIdx).getReg() && 9664 "expect uninitialized optional cc_out operand"); 9665 // Thumb1 instructions must have the S bit even if the CPSR is dead. 9666 if (!Subtarget->isThumb1Only()) 9667 return; 9668 } 9669 9670 // If this instruction was defined with an optional CPSR def and its dag node 9671 // had a live implicit CPSR def, then activate the optional CPSR def. 9672 MachineOperand &MO = MI.getOperand(ccOutIdx); 9673 MO.setReg(ARM::CPSR); 9674 MO.setIsDef(true); 9675 } 9676 9677 //===----------------------------------------------------------------------===// 9678 // ARM Optimization Hooks 9679 //===----------------------------------------------------------------------===// 9680 9681 // Helper function that checks if N is a null or all ones constant. 9682 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 9683 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 9684 } 9685 9686 // Return true if N is conditionally 0 or all ones. 9687 // Detects these expressions where cc is an i1 value: 9688 // 9689 // (select cc 0, y) [AllOnes=0] 9690 // (select cc y, 0) [AllOnes=0] 9691 // (zext cc) [AllOnes=0] 9692 // (sext cc) [AllOnes=0/1] 9693 // (select cc -1, y) [AllOnes=1] 9694 // (select cc y, -1) [AllOnes=1] 9695 // 9696 // Invert is set when N is the null/all ones constant when CC is false. 9697 // OtherOp is set to the alternative value of N. 9698 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 9699 SDValue &CC, bool &Invert, 9700 SDValue &OtherOp, 9701 SelectionDAG &DAG) { 9702 switch (N->getOpcode()) { 9703 default: return false; 9704 case ISD::SELECT: { 9705 CC = N->getOperand(0); 9706 SDValue N1 = N->getOperand(1); 9707 SDValue N2 = N->getOperand(2); 9708 if (isZeroOrAllOnes(N1, AllOnes)) { 9709 Invert = false; 9710 OtherOp = N2; 9711 return true; 9712 } 9713 if (isZeroOrAllOnes(N2, AllOnes)) { 9714 Invert = true; 9715 OtherOp = N1; 9716 return true; 9717 } 9718 return false; 9719 } 9720 case ISD::ZERO_EXTEND: 9721 // (zext cc) can never be the all ones value. 9722 if (AllOnes) 9723 return false; 9724 LLVM_FALLTHROUGH; 9725 case ISD::SIGN_EXTEND: { 9726 SDLoc dl(N); 9727 EVT VT = N->getValueType(0); 9728 CC = N->getOperand(0); 9729 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 9730 return false; 9731 Invert = !AllOnes; 9732 if (AllOnes) 9733 // When looking for an AllOnes constant, N is an sext, and the 'other' 9734 // value is 0. 9735 OtherOp = DAG.getConstant(0, dl, VT); 9736 else if (N->getOpcode() == ISD::ZERO_EXTEND) 9737 // When looking for a 0 constant, N can be zext or sext. 
9738 OtherOp = DAG.getConstant(1, dl, VT); 9739 else 9740 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 9741 VT); 9742 return true; 9743 } 9744 } 9745 } 9746 9747 // Combine a constant select operand into its use: 9748 // 9749 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9750 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9751 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 9752 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9753 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9754 // 9755 // The transform is rejected if the select doesn't have a constant operand that 9756 // is null, or all ones when AllOnes is set. 9757 // 9758 // Also recognize sext/zext from i1: 9759 // 9760 // (add (zext cc), x) -> (select cc (add x, 1), x) 9761 // (add (sext cc), x) -> (select cc (add x, -1), x) 9762 // 9763 // These transformations eventually create predicated instructions. 9764 // 9765 // @param N The node to transform. 9766 // @param Slct The N operand that is a select. 9767 // @param OtherOp The other N operand (x above). 9768 // @param DCI Context. 9769 // @param AllOnes Require the select constant to be all ones instead of null. 9770 // @returns The new node, or SDValue() on failure. 9771 static 9772 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 9773 TargetLowering::DAGCombinerInfo &DCI, 9774 bool AllOnes = false) { 9775 SelectionDAG &DAG = DCI.DAG; 9776 EVT VT = N->getValueType(0); 9777 SDValue NonConstantVal; 9778 SDValue CCOp; 9779 bool SwapSelectOps; 9780 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 9781 NonConstantVal, DAG)) 9782 return SDValue(); 9783 9784 // Slct is now know to be the desired identity constant when CC is true. 9785 SDValue TrueVal = OtherOp; 9786 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 9787 OtherOp, NonConstantVal); 9788 // Unless SwapSelectOps says CC should be false. 9789 if (SwapSelectOps) 9790 std::swap(TrueVal, FalseVal); 9791 9792 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 9793 CCOp, TrueVal, FalseVal); 9794 } 9795 9796 // Attempt combineSelectAndUse on each operand of a commutative operator N. 9797 static 9798 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 9799 TargetLowering::DAGCombinerInfo &DCI) { 9800 SDValue N0 = N->getOperand(0); 9801 SDValue N1 = N->getOperand(1); 9802 if (N0.getNode()->hasOneUse()) 9803 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 9804 return Result; 9805 if (N1.getNode()->hasOneUse()) 9806 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 9807 return Result; 9808 return SDValue(); 9809 } 9810 9811 static bool IsVUZPShuffleNode(SDNode *N) { 9812 // VUZP shuffle node. 9813 if (N->getOpcode() == ARMISD::VUZP) 9814 return true; 9815 9816 // "VUZP" on i32 is an alias for VTRN. 9817 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 9818 return true; 9819 9820 return false; 9821 } 9822 9823 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 9824 TargetLowering::DAGCombinerInfo &DCI, 9825 const ARMSubtarget *Subtarget) { 9826 // Look for ADD(VUZP.0, VUZP.1). 9827 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 9828 N0 == N1) 9829 return SDValue(); 9830 9831 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 9832 if (!N->getValueType(0).is64BitVector()) 9833 return SDValue(); 9834 9835 // Generate vpadd. 
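  // Rough DAG shape being rewritten here (value names are for exposition
  // only):
  //   %uzp:0, %uzp:1 = ARMISD::VUZP %a, %b
  //   add %uzp:0, %uzp:1
  // Adding the even-lane and odd-lane results pairs up adjacent elements of
  // %a and %b, which is exactly what the arm_neon_vpadd intrinsic computes on
  // %a and %b directly.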
9836 SelectionDAG &DAG = DCI.DAG; 9837 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9838 SDLoc dl(N); 9839 SDNode *Unzip = N0.getNode(); 9840 EVT VT = N->getValueType(0); 9841 9842 SmallVector<SDValue, 8> Ops; 9843 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 9844 TLI.getPointerTy(DAG.getDataLayout()))); 9845 Ops.push_back(Unzip->getOperand(0)); 9846 Ops.push_back(Unzip->getOperand(1)); 9847 9848 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 9849 } 9850 9851 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 9852 TargetLowering::DAGCombinerInfo &DCI, 9853 const ARMSubtarget *Subtarget) { 9854 // Check for two extended operands. 9855 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 9856 N1.getOpcode() == ISD::SIGN_EXTEND) && 9857 !(N0.getOpcode() == ISD::ZERO_EXTEND && 9858 N1.getOpcode() == ISD::ZERO_EXTEND)) 9859 return SDValue(); 9860 9861 SDValue N00 = N0.getOperand(0); 9862 SDValue N10 = N1.getOperand(0); 9863 9864 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 9865 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 9866 N00 == N10) 9867 return SDValue(); 9868 9869 // We only recognize Q register paddl here; this can't be reached until 9870 // after type legalization. 9871 if (!N00.getValueType().is64BitVector() || 9872 !N0.getValueType().is128BitVector()) 9873 return SDValue(); 9874 9875 // Generate vpaddl. 9876 SelectionDAG &DAG = DCI.DAG; 9877 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9878 SDLoc dl(N); 9879 EVT VT = N->getValueType(0); 9880 9881 SmallVector<SDValue, 8> Ops; 9882 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 9883 unsigned Opcode; 9884 if (N0.getOpcode() == ISD::SIGN_EXTEND) 9885 Opcode = Intrinsic::arm_neon_vpaddls; 9886 else 9887 Opcode = Intrinsic::arm_neon_vpaddlu; 9888 Ops.push_back(DAG.getConstant(Opcode, dl, 9889 TLI.getPointerTy(DAG.getDataLayout()))); 9890 EVT ElemTy = N00.getValueType().getVectorElementType(); 9891 unsigned NumElts = VT.getVectorNumElements(); 9892 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 9893 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 9894 N00.getOperand(0), N00.getOperand(1)); 9895 Ops.push_back(Concat); 9896 9897 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 9898 } 9899 9900 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 9901 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 9902 // much easier to match. 9903 static SDValue 9904 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 9905 TargetLowering::DAGCombinerInfo &DCI, 9906 const ARMSubtarget *Subtarget) { 9907 // Only perform optimization if after legalize, and if NEON is available. We 9908 // also expected both operands to be BUILD_VECTORs. 9909 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 9910 || N0.getOpcode() != ISD::BUILD_VECTOR 9911 || N1.getOpcode() != ISD::BUILD_VECTOR) 9912 return SDValue(); 9913 9914 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 9915 EVT VT = N->getValueType(0); 9916 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 9917 return SDValue(); 9918 9919 // Check that the vector operands are of the right form. 9920 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 9921 // operands, where N is the size of the formed vector. 
9922   // Each EXTRACT_VECTOR should have the same input vector and odd or even
9923   // index such that we have a pairwise add pattern.
9924
9925   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9926   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9927     return SDValue();
9928   SDValue Vec = N0->getOperand(0)->getOperand(0);
9929   SDNode *V = Vec.getNode();
9930   unsigned nextIndex = 0;
9931
9932   // For each operand to the ADD (each one a BUILD_VECTOR), check to see if
9933   // each of their operands is an EXTRACT_VECTOR with the same vector and the
9934   // appropriate index.
9935   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9936     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9937         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9938
9939       SDValue ExtVec0 = N0->getOperand(i);
9940       SDValue ExtVec1 = N1->getOperand(i);
9941
9942       // First operand is the vector, verify it's the same.
9943       if (V != ExtVec0->getOperand(0).getNode() ||
9944           V != ExtVec1->getOperand(0).getNode())
9945         return SDValue();
9946
9947       // Second is the constant, verify it's correct.
9948       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9949       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9950
9951       // For the constant, we want to see all the even or all the odd.
9952       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9953           || C1->getZExtValue() != nextIndex+1)
9954         return SDValue();
9955
9956       // Increment index.
9957       nextIndex += 2;
9958     } else
9959       return SDValue();
9960   }
9961
9962   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
9963   // we're using the entire input vector, otherwise there's a size/legality
9964   // mismatch somewhere.
9965   if (nextIndex != Vec.getValueType().getVectorNumElements() ||
9966       Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
9967     return SDValue();
9968
9969   // Create VPADDL node.
9970   SelectionDAG &DAG = DCI.DAG;
9971   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9972
9973   SDLoc dl(N);
9974
9975   // Build operand list.
9976   SmallVector<SDValue, 8> Ops;
9977   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
9978                                 TLI.getPointerTy(DAG.getDataLayout())));
9979
9980   // Input is the vector.
9981   Ops.push_back(Vec);
9982
9983   // Get widened type and narrowed type.
9984   MVT widenType;
9985   unsigned numElem = VT.getVectorNumElements();
9986
9987   EVT inputLaneType = Vec.getValueType().getVectorElementType();
9988   switch (inputLaneType.getSimpleVT().SimpleTy) {
9989     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
9990     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
9991     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
9992     default:
9993       llvm_unreachable("Invalid vector element type for padd optimization.");
9994   }
9995
9996   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
9997   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ?
ISD::ANY_EXTEND : ISD::TRUNCATE; 9998 return DAG.getNode(ExtOp, dl, VT, tmp); 9999 } 10000 10001 static SDValue findMUL_LOHI(SDValue V) { 10002 if (V->getOpcode() == ISD::UMUL_LOHI || 10003 V->getOpcode() == ISD::SMUL_LOHI) 10004 return V; 10005 return SDValue(); 10006 } 10007 10008 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 10009 TargetLowering::DAGCombinerInfo &DCI, 10010 const ARMSubtarget *Subtarget) { 10011 if (Subtarget->isThumb()) { 10012 if (!Subtarget->hasDSP()) 10013 return SDValue(); 10014 } else if (!Subtarget->hasV5TEOps()) 10015 return SDValue(); 10016 10017 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 10018 // accumulates the product into a 64-bit value. The 16-bit values will 10019 // be sign extended somehow or SRA'd into 32-bit values 10020 // (addc (adde (mul 16bit, 16bit), lo), hi) 10021 SDValue Mul = AddcNode->getOperand(0); 10022 SDValue Lo = AddcNode->getOperand(1); 10023 if (Mul.getOpcode() != ISD::MUL) { 10024 Lo = AddcNode->getOperand(0); 10025 Mul = AddcNode->getOperand(1); 10026 if (Mul.getOpcode() != ISD::MUL) 10027 return SDValue(); 10028 } 10029 10030 SDValue SRA = AddeNode->getOperand(0); 10031 SDValue Hi = AddeNode->getOperand(1); 10032 if (SRA.getOpcode() != ISD::SRA) { 10033 SRA = AddeNode->getOperand(1); 10034 Hi = AddeNode->getOperand(0); 10035 if (SRA.getOpcode() != ISD::SRA) 10036 return SDValue(); 10037 } 10038 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 10039 if (Const->getZExtValue() != 31) 10040 return SDValue(); 10041 } else 10042 return SDValue(); 10043 10044 if (SRA.getOperand(0) != Mul) 10045 return SDValue(); 10046 10047 SelectionDAG &DAG = DCI.DAG; 10048 SDLoc dl(AddcNode); 10049 unsigned Opcode = 0; 10050 SDValue Op0; 10051 SDValue Op1; 10052 10053 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 10054 Opcode = ARMISD::SMLALBB; 10055 Op0 = Mul.getOperand(0); 10056 Op1 = Mul.getOperand(1); 10057 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 10058 Opcode = ARMISD::SMLALBT; 10059 Op0 = Mul.getOperand(0); 10060 Op1 = Mul.getOperand(1).getOperand(0); 10061 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 10062 Opcode = ARMISD::SMLALTB; 10063 Op0 = Mul.getOperand(0).getOperand(0); 10064 Op1 = Mul.getOperand(1); 10065 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 10066 Opcode = ARMISD::SMLALTT; 10067 Op0 = Mul->getOperand(0).getOperand(0); 10068 Op1 = Mul->getOperand(1).getOperand(0); 10069 } 10070 10071 if (!Op0 || !Op1) 10072 return SDValue(); 10073 10074 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 10075 Op0, Op1, Lo, Hi); 10076 // Replace the ADDs' nodes uses by the MLA node's values. 10077 SDValue HiMLALResult(SMLAL.getNode(), 1); 10078 SDValue LoMLALResult(SMLAL.getNode(), 0); 10079 10080 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 10081 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 10082 10083 // Return original node to notify the driver to stop replacing. 10084 SDValue resNode(AddcNode, 0); 10085 return resNode; 10086 } 10087 10088 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 10089 TargetLowering::DAGCombinerInfo &DCI, 10090 const ARMSubtarget *Subtarget) { 10091 // Look for multiply add opportunities. 
10092 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 10093 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 10094 // a glue link from the first add to the second add. 10095 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 10096 // a S/UMLAL instruction. 10097 // UMUL_LOHI 10098 // / :lo \ :hi 10099 // V \ [no multiline comment] 10100 // loAdd -> ADDC | 10101 // \ :carry / 10102 // V V 10103 // ADDE <- hiAdd 10104 // 10105 // In the special case where only the higher part of a signed result is used 10106 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 10107 // a constant with the exact value of 0x80000000, we recognize we are dealing 10108 // with a "rounded multiply and add" (or subtract) and transform it into 10109 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 10110 10111 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 10112 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 10113 "Expect an ADDE or SUBE"); 10114 10115 assert(AddeSubeNode->getNumOperands() == 3 && 10116 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 10117 "ADDE node has the wrong inputs"); 10118 10119 // Check that we are chained to the right ADDC or SUBC node. 10120 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 10121 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 10122 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 10123 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 10124 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 10125 return SDValue(); 10126 10127 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 10128 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 10129 10130 // Check if the two operands are from the same mul_lohi node. 10131 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 10132 return SDValue(); 10133 10134 assert(AddcSubcNode->getNumValues() == 2 && 10135 AddcSubcNode->getValueType(0) == MVT::i32 && 10136 "Expect ADDC with two result values. First: i32"); 10137 10138 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 10139 // maybe a SMLAL which multiplies two 16-bit values. 10140 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 10141 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 10142 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 10143 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 10144 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 10145 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 10146 10147 // Check for the triangle shape. 10148 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 10149 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 10150 10151 // Make sure that the ADDE/SUBE operands are not coming from the same node. 10152 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 10153 return SDValue(); 10154 10155 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 10156 bool IsLeftOperandMUL = false; 10157 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 10158 if (MULOp == SDValue()) 10159 MULOp = findMUL_LOHI(AddeSubeOp1); 10160 else 10161 IsLeftOperandMUL = true; 10162 if (MULOp == SDValue()) 10163 return SDValue(); 10164 10165 // Figure out the right opcode. 10166 unsigned Opc = MULOp->getOpcode(); 10167 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 10168 10169 // Figure out the high and low input values to the MLAL node. 
10170   SDValue *HiAddSub = nullptr;
10171   SDValue *LoMul = nullptr;
10172   SDValue *LowAddSub = nullptr;
10173
10174   // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
10175   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
10176     return SDValue();
10177
10178   if (IsLeftOperandMUL)
10179     HiAddSub = &AddeSubeOp1;
10180   else
10181     HiAddSub = &AddeSubeOp0;
10182
10183   // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
10184   // whose low result is fed to the ADDC/SUBC we are checking.
10185
10186   if (AddcSubcOp0 == MULOp.getValue(0)) {
10187     LoMul = &AddcSubcOp0;
10188     LowAddSub = &AddcSubcOp1;
10189   }
10190   if (AddcSubcOp1 == MULOp.getValue(0)) {
10191     LoMul = &AddcSubcOp1;
10192     LowAddSub = &AddcSubcOp0;
10193   }
10194
10195   if (!LoMul)
10196     return SDValue();
10197
10198   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC,
10199   // the replacement below will create a cycle.
10200   if (AddcSubcNode == HiAddSub->getNode() ||
10201       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
10202     return SDValue();
10203
10204   // Create the merged node.
10205   SelectionDAG &DAG = DCI.DAG;
10206
10207   // Start building operand list.
10208   SmallVector<SDValue, 8> Ops;
10209   Ops.push_back(LoMul->getOperand(0));
10210   Ops.push_back(LoMul->getOperand(1));
10211
10212   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
10213   // the case, we must be doing signed multiplication and only use the higher
10214   // part of the result of the MLAL; furthermore, the LowAddSub must be a
10215   // constant addition or subtraction with the value 0x80000000.
10216   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
10217       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
10218       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
10219       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
10220           0x80000000) {
10221     Ops.push_back(*HiAddSub);
10222     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
10223       FinalOpc = ARMISD::SMMLSR;
10224     } else {
10225       FinalOpc = ARMISD::SMMLAR;
10226     }
10227     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
10228     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
10229
10230     return SDValue(AddeSubeNode, 0);
10231   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
10232     // SMMLS is generated during instruction selection and the rest of this
10233     // function cannot handle the case where AddcSubcNode is a SUBC.
10234     return SDValue();
10235
10236   // Finish building the operand list for {U/S}MLAL.
10237   Ops.push_back(*LowAddSub);
10238   Ops.push_back(*HiAddSub);
10239
10240   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
10241                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
10242
10243   // Replace the ADD nodes' uses with the MLAL node's values.
10244   SDValue HiMLALResult(MLALNode.getNode(), 1);
10245   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
10246
10247   SDValue LoMLALResult(MLALNode.getNode(), 0);
10248   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
10249
10250   // Return original node to notify the driver to stop replacing.
10251   return SDValue(AddeSubeNode, 0);
10252 }
10253
10254 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
10255                                       TargetLowering::DAGCombinerInfo &DCI,
10256                                       const ARMSubtarget *Subtarget) {
10257   // UMAAL is similar to UMLAL except that it adds two unsigned values.
10258 // While trying to combine for the other MLAL nodes, first search for the 10259 // chance to use UMAAL. Check if Addc uses a node which has already 10260 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde 10261 // as the addend, and it's handled in PerformUMLALCombine. 10262 10263 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 10264 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 10265 10266 // Check that we have a glued ADDC node. 10267 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 10268 if (AddcNode->getOpcode() != ARMISD::ADDC) 10269 return SDValue(); 10270 10271 // Find the converted UMAAL or quit if it doesn't exist. 10272 SDNode *UmlalNode = nullptr; 10273 SDValue AddHi; 10274 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 10275 UmlalNode = AddcNode->getOperand(0).getNode(); 10276 AddHi = AddcNode->getOperand(1); 10277 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 10278 UmlalNode = AddcNode->getOperand(1).getNode(); 10279 AddHi = AddcNode->getOperand(0); 10280 } else { 10281 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 10282 } 10283 10284 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 10285 // the ADDC as well as Zero. 10286 if (!isNullConstant(UmlalNode->getOperand(3))) 10287 return SDValue(); 10288 10289 if ((isNullConstant(AddeNode->getOperand(0)) && 10290 AddeNode->getOperand(1).getNode() == UmlalNode) || 10291 (AddeNode->getOperand(0).getNode() == UmlalNode && 10292 isNullConstant(AddeNode->getOperand(1)))) { 10293 SelectionDAG &DAG = DCI.DAG; 10294 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 10295 UmlalNode->getOperand(2), AddHi }; 10296 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 10297 DAG.getVTList(MVT::i32, MVT::i32), Ops); 10298 10299 // Replace the ADDs' nodes uses by the UMAAL node's values. 10300 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 10301 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 10302 10303 // Return original node to notify the driver to stop replacing. 10304 return SDValue(AddeNode, 0); 10305 } 10306 return SDValue(); 10307 } 10308 10309 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 10310 const ARMSubtarget *Subtarget) { 10311 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 10312 return SDValue(); 10313 10314 // Check that we have a pair of ADDC and ADDE as operands. 10315 // Both addends of the ADDE must be zero. 
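  // That is, the pattern matched below is (operand names for exposition
  // only):
  //   UMLAL(a, b, ADDC(x, y), ADDE(0, 0, ADDC.carry))
  // ADDE:ADDC together reconstruct the 64-bit sum x + y, so the whole thing
  // folds to UMAAL(a, b, x, y).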
10316 SDNode* AddcNode = N->getOperand(2).getNode(); 10317 SDNode* AddeNode = N->getOperand(3).getNode(); 10318 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 10319 (AddeNode->getOpcode() == ARMISD::ADDE) && 10320 isNullConstant(AddeNode->getOperand(0)) && 10321 isNullConstant(AddeNode->getOperand(1)) && 10322 (AddeNode->getOperand(2).getNode() == AddcNode)) 10323 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 10324 DAG.getVTList(MVT::i32, MVT::i32), 10325 {N->getOperand(0), N->getOperand(1), 10326 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 10327 else 10328 return SDValue(); 10329 } 10330 10331 static SDValue PerformAddcSubcCombine(SDNode *N, 10332 TargetLowering::DAGCombinerInfo &DCI, 10333 const ARMSubtarget *Subtarget) { 10334 SelectionDAG &DAG(DCI.DAG); 10335 10336 if (N->getOpcode() == ARMISD::SUBC) { 10337 // (SUBC (ADDE 0, 0, C), 1) -> C 10338 SDValue LHS = N->getOperand(0); 10339 SDValue RHS = N->getOperand(1); 10340 if (LHS->getOpcode() == ARMISD::ADDE && 10341 isNullConstant(LHS->getOperand(0)) && 10342 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 10343 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 10344 } 10345 } 10346 10347 if (Subtarget->isThumb1Only()) { 10348 SDValue RHS = N->getOperand(1); 10349 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10350 int32_t imm = C->getSExtValue(); 10351 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 10352 SDLoc DL(N); 10353 RHS = DAG.getConstant(-imm, DL, MVT::i32); 10354 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 10355 : ARMISD::ADDC; 10356 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 10357 } 10358 } 10359 } 10360 return SDValue(); 10361 } 10362 10363 static SDValue PerformAddeSubeCombine(SDNode *N, 10364 TargetLowering::DAGCombinerInfo &DCI, 10365 const ARMSubtarget *Subtarget) { 10366 if (Subtarget->isThumb1Only()) { 10367 SelectionDAG &DAG = DCI.DAG; 10368 SDValue RHS = N->getOperand(1); 10369 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10370 int64_t imm = C->getSExtValue(); 10371 if (imm < 0) { 10372 SDLoc DL(N); 10373 10374 // The with-carry-in form matches bitwise not instead of the negation. 10375 // Effectively, the inverse interpretation of the carry flag already 10376 // accounts for part of the negation. 10377 RHS = DAG.getConstant(~imm, DL, MVT::i32); 10378 10379 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 10380 : ARMISD::ADDE; 10381 return DAG.getNode(Opcode, DL, N->getVTList(), 10382 N->getOperand(0), RHS, N->getOperand(2)); 10383 } 10384 } 10385 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 10386 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 10387 } 10388 return SDValue(); 10389 } 10390 10391 /// PerformADDECombine - Target-specific dag combine transform from 10392 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 10393 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 10394 static SDValue PerformADDECombine(SDNode *N, 10395 TargetLowering::DAGCombinerInfo &DCI, 10396 const ARMSubtarget *Subtarget) { 10397 // Only ARM and Thumb2 support UMLAL/SMLAL. 10398 if (Subtarget->isThumb1Only()) 10399 return PerformAddeSubeCombine(N, DCI, Subtarget); 10400 10401 // Only perform the checks after legalize when the pattern is available. 
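  // The shape being looked for is, roughly (a sketch; AddCombineTo64bitMLAL and
  // AddCombineTo64bitUMAAL perform the precise checking):
  //   (ADDE (mul_lohi a, b):1, hi, (ADDC (mul_lohi a, b):0, lo):1)
  //     --> (UMLAL/SMLAL a, b, lo, hi)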
10402 if (DCI.isBeforeLegalize()) return SDValue(); 10403 10404 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 10405 } 10406 10407 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 10408 /// operands N0 and N1. This is a helper for PerformADDCombine that is 10409 /// called with the default operands, and if that fails, with commuted 10410 /// operands. 10411 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 10412 TargetLowering::DAGCombinerInfo &DCI, 10413 const ARMSubtarget *Subtarget){ 10414 // Attempt to create vpadd for this add. 10415 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 10416 return Result; 10417 10418 // Attempt to create vpaddl for this add. 10419 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 10420 return Result; 10421 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 10422 Subtarget)) 10423 return Result; 10424 10425 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 10426 if (N0.getNode()->hasOneUse()) 10427 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 10428 return Result; 10429 return SDValue(); 10430 } 10431 10432 static SDValue PerformSHLSimplify(SDNode *N, 10433 TargetLowering::DAGCombinerInfo &DCI, 10434 const ARMSubtarget *ST) { 10435 // Allow the generic combiner to identify potential bswaps. 10436 if (DCI.isBeforeLegalize()) 10437 return SDValue(); 10438 10439 // DAG combiner will fold: 10440 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 10441 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 10442 // Other code patterns that can be also be modified have the following form: 10443 // b + ((a << 1) | 510) 10444 // b + ((a << 1) & 510) 10445 // b + ((a << 1) ^ 510) 10446 // b + ((a << 1) + 510) 10447 10448 // Many instructions can perform the shift for free, but it requires both 10449 // the operands to be registers. If c1 << c2 is too large, a mov immediate 10450 // instruction will needed. So, unfold back to the original pattern if: 10451 // - if c1 and c2 are small enough that they don't require mov imms. 10452 // - the user(s) of the node can perform an shl 10453 10454 // No shifted operands for 16-bit instructions. 10455 if (ST->isThumb() && ST->isThumb1Only()) 10456 return SDValue(); 10457 10458 // Check that all the users could perform the shl themselves. 10459 for (auto U : N->uses()) { 10460 switch(U->getOpcode()) { 10461 default: 10462 return SDValue(); 10463 case ISD::SUB: 10464 case ISD::ADD: 10465 case ISD::AND: 10466 case ISD::OR: 10467 case ISD::XOR: 10468 case ISD::SETCC: 10469 case ARMISD::CMP: 10470 // Check that the user isn't already using a constant because there 10471 // aren't any instructions that support an immediate operand and a 10472 // shifted operand. 10473 if (isa<ConstantSDNode>(U->getOperand(0)) || 10474 isa<ConstantSDNode>(U->getOperand(1))) 10475 return SDValue(); 10476 10477 // Check that it's not already using a shift. 
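      // (The flexible second operand of these instructions is either an
      // immediate or a shifted register, never both, so a user that already
      // uses one of those cannot also absorb the shl produced below.)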
10478 if (U->getOperand(0).getOpcode() == ISD::SHL || 10479 U->getOperand(1).getOpcode() == ISD::SHL) 10480 return SDValue(); 10481 break; 10482 } 10483 } 10484 10485 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 10486 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 10487 return SDValue(); 10488 10489 if (N->getOperand(0).getOpcode() != ISD::SHL) 10490 return SDValue(); 10491 10492 SDValue SHL = N->getOperand(0); 10493 10494 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10495 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 10496 if (!C1ShlC2 || !C2) 10497 return SDValue(); 10498 10499 APInt C2Int = C2->getAPIntValue(); 10500 APInt C1Int = C1ShlC2->getAPIntValue(); 10501 10502 // Check that performing a lshr will not lose any information. 10503 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 10504 C2Int.getBitWidth() - C2->getZExtValue()); 10505 if ((C1Int & Mask) != C1Int) 10506 return SDValue(); 10507 10508 // Shift the first constant. 10509 C1Int.lshrInPlace(C2Int); 10510 10511 // The immediates are encoded as an 8-bit value that can be rotated. 10512 auto LargeImm = [](const APInt &Imm) { 10513 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 10514 return Imm.getBitWidth() - Zeros > 8; 10515 }; 10516 10517 if (LargeImm(C1Int) || LargeImm(C2Int)) 10518 return SDValue(); 10519 10520 SelectionDAG &DAG = DCI.DAG; 10521 SDLoc dl(N); 10522 SDValue X = SHL.getOperand(0); 10523 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 10524 DAG.getConstant(C1Int, dl, MVT::i32)); 10525 // Shift left to compensate for the lshr of C1Int. 10526 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 10527 10528 DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); SHL.dump(); 10529 N->dump()); 10530 DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 10531 10532 DAG.ReplaceAllUsesWith(SDValue(N, 0), Res); 10533 return SDValue(N, 0); 10534 } 10535 10536 10537 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 10538 /// 10539 static SDValue PerformADDCombine(SDNode *N, 10540 TargetLowering::DAGCombinerInfo &DCI, 10541 const ARMSubtarget *Subtarget) { 10542 SDValue N0 = N->getOperand(0); 10543 SDValue N1 = N->getOperand(1); 10544 10545 // Only works one way, because it needs an immediate operand. 10546 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 10547 return Result; 10548 10549 // First try with the default operand order. 10550 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 10551 return Result; 10552 10553 // If that didn't work, try again with the operands commuted. 10554 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 10555 } 10556 10557 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 10558 /// 10559 static SDValue PerformSUBCombine(SDNode *N, 10560 TargetLowering::DAGCombinerInfo &DCI) { 10561 SDValue N0 = N->getOperand(0); 10562 SDValue N1 = N->getOperand(1); 10563 10564 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 10565 if (N1.getNode()->hasOneUse()) 10566 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 10567 return Result; 10568 10569 return SDValue(); 10570 } 10571 10572 /// PerformVMULCombine 10573 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 10574 /// special multiplier accumulator forwarding. 
10575 /// vmul d3, d0, d2 10576 /// vmla d3, d1, d2 10577 /// is faster than 10578 /// vadd d3, d0, d1 10579 /// vmul d3, d3, d2 10580 // However, for (A + B) * (A + B), 10581 // vadd d2, d0, d1 10582 // vmul d3, d0, d2 10583 // vmla d3, d1, d2 10584 // is slower than 10585 // vadd d2, d0, d1 10586 // vmul d3, d2, d2 10587 static SDValue PerformVMULCombine(SDNode *N, 10588 TargetLowering::DAGCombinerInfo &DCI, 10589 const ARMSubtarget *Subtarget) { 10590 if (!Subtarget->hasVMLxForwarding()) 10591 return SDValue(); 10592 10593 SelectionDAG &DAG = DCI.DAG; 10594 SDValue N0 = N->getOperand(0); 10595 SDValue N1 = N->getOperand(1); 10596 unsigned Opcode = N0.getOpcode(); 10597 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10598 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 10599 Opcode = N1.getOpcode(); 10600 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10601 Opcode != ISD::FADD && Opcode != ISD::FSUB) 10602 return SDValue(); 10603 std::swap(N0, N1); 10604 } 10605 10606 if (N0 == N1) 10607 return SDValue(); 10608 10609 EVT VT = N->getValueType(0); 10610 SDLoc DL(N); 10611 SDValue N00 = N0->getOperand(0); 10612 SDValue N01 = N0->getOperand(1); 10613 return DAG.getNode(Opcode, DL, VT, 10614 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 10615 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 10616 } 10617 10618 static SDValue PerformMULCombine(SDNode *N, 10619 TargetLowering::DAGCombinerInfo &DCI, 10620 const ARMSubtarget *Subtarget) { 10621 SelectionDAG &DAG = DCI.DAG; 10622 10623 if (Subtarget->isThumb1Only()) 10624 return SDValue(); 10625 10626 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10627 return SDValue(); 10628 10629 EVT VT = N->getValueType(0); 10630 if (VT.is64BitVector() || VT.is128BitVector()) 10631 return PerformVMULCombine(N, DCI, Subtarget); 10632 if (VT != MVT::i32) 10633 return SDValue(); 10634 10635 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10636 if (!C) 10637 return SDValue(); 10638 10639 int64_t MulAmt = C->getSExtValue(); 10640 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 10641 10642 ShiftAmt = ShiftAmt & (32 - 1); 10643 SDValue V = N->getOperand(0); 10644 SDLoc DL(N); 10645 10646 SDValue Res; 10647 MulAmt >>= ShiftAmt; 10648 10649 if (MulAmt >= 0) { 10650 if (isPowerOf2_32(MulAmt - 1)) { 10651 // (mul x, 2^N + 1) => (add (shl x, N), x) 10652 Res = DAG.getNode(ISD::ADD, DL, VT, 10653 V, 10654 DAG.getNode(ISD::SHL, DL, VT, 10655 V, 10656 DAG.getConstant(Log2_32(MulAmt - 1), DL, 10657 MVT::i32))); 10658 } else if (isPowerOf2_32(MulAmt + 1)) { 10659 // (mul x, 2^N - 1) => (sub (shl x, N), x) 10660 Res = DAG.getNode(ISD::SUB, DL, VT, 10661 DAG.getNode(ISD::SHL, DL, VT, 10662 V, 10663 DAG.getConstant(Log2_32(MulAmt + 1), DL, 10664 MVT::i32)), 10665 V); 10666 } else 10667 return SDValue(); 10668 } else { 10669 uint64_t MulAmtAbs = -MulAmt; 10670 if (isPowerOf2_32(MulAmtAbs + 1)) { 10671 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 10672 Res = DAG.getNode(ISD::SUB, DL, VT, 10673 V, 10674 DAG.getNode(ISD::SHL, DL, VT, 10675 V, 10676 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 10677 MVT::i32))); 10678 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 10679 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 10680 Res = DAG.getNode(ISD::ADD, DL, VT, 10681 V, 10682 DAG.getNode(ISD::SHL, DL, VT, 10683 V, 10684 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 10685 MVT::i32))); 10686 Res = DAG.getNode(ISD::SUB, DL, VT, 10687 DAG.getConstant(0, DL, MVT::i32), Res); 10688 } else 10689 return SDValue(); 10690 } 10691 10692 if (ShiftAmt != 0) 10693 Res = 
DAG.getNode(ISD::SHL, DL, VT,
10694                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
10695
10696   // Do not add new nodes to DAG combiner worklist.
10697   DCI.CombineTo(N, Res, false);
10698   return SDValue();
10699 }
10700
10701 static SDValue PerformANDCombine(SDNode *N,
10702                                  TargetLowering::DAGCombinerInfo &DCI,
10703                                  const ARMSubtarget *Subtarget) {
10704   // Attempt to use immediate-form VBIC
10705   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
10706   SDLoc dl(N);
10707   EVT VT = N->getValueType(0);
10708   SelectionDAG &DAG = DCI.DAG;
10709
10710   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
10711     return SDValue();
10712
10713   APInt SplatBits, SplatUndef;
10714   unsigned SplatBitSize;
10715   bool HasAnyUndefs;
10716   if (BVN &&
10717       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
10718     if (SplatBitSize <= 64) {
10719       EVT VbicVT;
10720       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
10721                                       SplatUndef.getZExtValue(), SplatBitSize,
10722                                       DAG, dl, VbicVT, VT.is128BitVector(),
10723                                       OtherModImm);
10724       if (Val.getNode()) {
10725         SDValue Input =
10726           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
10727         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
10728         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
10729       }
10730     }
10731   }
10732
10733   if (!Subtarget->isThumb1Only()) {
10734     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
10735     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
10736       return Result;
10737
10738     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
10739       return Result;
10740   }
10741
10742   return SDValue();
10743 }
10744
10745 // Try combining OR nodes to SMULWB, SMULWT.
10746 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
10747                                          TargetLowering::DAGCombinerInfo &DCI,
10748                                          const ARMSubtarget *Subtarget) {
10749   if (!Subtarget->hasV6Ops() ||
10750       (Subtarget->isThumb() &&
10751        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
10752     return SDValue();
10753
10754   SDValue SRL = OR->getOperand(0);
10755   SDValue SHL = OR->getOperand(1);
10756
10757   if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
10758     SRL = OR->getOperand(1);
10759     SHL = OR->getOperand(0);
10760   }
10761   if (!isSRL16(SRL) || !isSHL16(SHL))
10762     return SDValue();
10763
10764   // The first operands to the shifts need to be the two results from the
10765   // same smul_lohi node.
10766   if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
10767       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
10768     return SDValue();
10769
10770   SDNode *SMULLOHI = SRL.getOperand(0).getNode();
10771   if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
10772       SHL.getOperand(0) != SDValue(SMULLOHI, 1))
10773     return SDValue();
10774
10775   // Now we have:
10776   // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
10777   // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
10778   // For SMULWB the 16-bit value will be sign-extended somehow.
10779   // For SMULWT only the SRA is required.
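  // In other words (illustration): or (srl lo, 16), (shl hi, 16) reassembles
  // bits [47:16] of the 64-bit product, which is exactly what smulwb/smulwt
  // return when one multiplicand is a 16-bit quantity.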
10780 // Check both sides of SMUL_LOHI 10781 SDValue OpS16 = SMULLOHI->getOperand(0); 10782 SDValue OpS32 = SMULLOHI->getOperand(1); 10783 10784 SelectionDAG &DAG = DCI.DAG; 10785 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { 10786 OpS16 = OpS32; 10787 OpS32 = SMULLOHI->getOperand(0); 10788 } 10789 10790 SDLoc dl(OR); 10791 unsigned Opcode = 0; 10792 if (isS16(OpS16, DAG)) 10793 Opcode = ARMISD::SMULWB; 10794 else if (isSRA16(OpS16)) { 10795 Opcode = ARMISD::SMULWT; 10796 OpS16 = OpS16->getOperand(0); 10797 } 10798 else 10799 return SDValue(); 10800 10801 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); 10802 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); 10803 return SDValue(OR, 0); 10804 } 10805 10806 static SDValue PerformORCombineToBFI(SDNode *N, 10807 TargetLowering::DAGCombinerInfo &DCI, 10808 const ARMSubtarget *Subtarget) { 10809 // BFI is only available on V6T2+ 10810 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 10811 return SDValue(); 10812 10813 EVT VT = N->getValueType(0); 10814 SDValue N0 = N->getOperand(0); 10815 SDValue N1 = N->getOperand(1); 10816 SelectionDAG &DAG = DCI.DAG; 10817 SDLoc DL(N); 10818 // 1) or (and A, mask), val => ARMbfi A, val, mask 10819 // iff (val & mask) == val 10820 // 10821 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 10822 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 10823 // && mask == ~mask2 10824 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 10825 // && ~mask == mask2 10826 // (i.e., copy a bitfield value into another bitfield of the same width) 10827 10828 if (VT != MVT::i32) 10829 return SDValue(); 10830 10831 SDValue N00 = N0.getOperand(0); 10832 10833 // The value and the mask need to be constants so we can verify this is 10834 // actually a bitfield set. If the mask is 0xffff, we can do better 10835 // via a movt instruction, so don't use BFI in that case. 10836 SDValue MaskOp = N0.getOperand(1); 10837 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 10838 if (!MaskC) 10839 return SDValue(); 10840 unsigned Mask = MaskC->getZExtValue(); 10841 if (Mask == 0xffff) 10842 return SDValue(); 10843 SDValue Res; 10844 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 10845 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 10846 if (N1C) { 10847 unsigned Val = N1C->getZExtValue(); 10848 if ((Val & ~Mask) != Val) 10849 return SDValue(); 10850 10851 if (ARM::isBitFieldInvertedMask(Mask)) { 10852 Val >>= countTrailingZeros(~Mask); 10853 10854 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 10855 DAG.getConstant(Val, DL, MVT::i32), 10856 DAG.getConstant(Mask, DL, MVT::i32)); 10857 10858 DCI.CombineTo(N, Res, false); 10859 // Return value from the original node to inform the combiner than N is 10860 // now dead. 10861 return SDValue(N, 0); 10862 } 10863 } else if (N1.getOpcode() == ISD::AND) { 10864 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 10865 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 10866 if (!N11C) 10867 return SDValue(); 10868 unsigned Mask2 = N11C->getZExtValue(); 10869 10870 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 10871 // as is to match. 10872 if (ARM::isBitFieldInvertedMask(Mask) && 10873 (Mask == ~Mask2)) { 10874 // The pack halfword instruction works better for masks that fit it, 10875 // so use that when it's available. 
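      // (The pack halfword forms are PKHBT/PKHTB, which is why the plain
      // 0xffff / 0xffff0000 masks are left for them to handle.)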
10876 if (Subtarget->hasDSP() && 10877 (Mask == 0xffff || Mask == 0xffff0000)) 10878 return SDValue(); 10879 // 2a 10880 unsigned amt = countTrailingZeros(Mask2); 10881 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 10882 DAG.getConstant(amt, DL, MVT::i32)); 10883 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 10884 DAG.getConstant(Mask, DL, MVT::i32)); 10885 DCI.CombineTo(N, Res, false); 10886 // Return value from the original node to inform the combiner than N is 10887 // now dead. 10888 return SDValue(N, 0); 10889 } else if (ARM::isBitFieldInvertedMask(~Mask) && 10890 (~Mask == Mask2)) { 10891 // The pack halfword instruction works better for masks that fit it, 10892 // so use that when it's available. 10893 if (Subtarget->hasDSP() && 10894 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 10895 return SDValue(); 10896 // 2b 10897 unsigned lsb = countTrailingZeros(Mask); 10898 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 10899 DAG.getConstant(lsb, DL, MVT::i32)); 10900 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 10901 DAG.getConstant(Mask2, DL, MVT::i32)); 10902 DCI.CombineTo(N, Res, false); 10903 // Return value from the original node to inform the combiner than N is 10904 // now dead. 10905 return SDValue(N, 0); 10906 } 10907 } 10908 10909 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 10910 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 10911 ARM::isBitFieldInvertedMask(~Mask)) { 10912 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 10913 // where lsb(mask) == #shamt and masked bits of B are known zero. 10914 SDValue ShAmt = N00.getOperand(1); 10915 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 10916 unsigned LSB = countTrailingZeros(Mask); 10917 if (ShAmtC != LSB) 10918 return SDValue(); 10919 10920 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 10921 DAG.getConstant(~Mask, DL, MVT::i32)); 10922 10923 DCI.CombineTo(N, Res, false); 10924 // Return value from the original node to inform the combiner than N is 10925 // now dead. 
10926 return SDValue(N, 0); 10927 } 10928 10929 return SDValue(); 10930 } 10931 10932 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 10933 static SDValue PerformORCombine(SDNode *N, 10934 TargetLowering::DAGCombinerInfo &DCI, 10935 const ARMSubtarget *Subtarget) { 10936 // Attempt to use immediate-form VORR 10937 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 10938 SDLoc dl(N); 10939 EVT VT = N->getValueType(0); 10940 SelectionDAG &DAG = DCI.DAG; 10941 10942 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10943 return SDValue(); 10944 10945 APInt SplatBits, SplatUndef; 10946 unsigned SplatBitSize; 10947 bool HasAnyUndefs; 10948 if (BVN && Subtarget->hasNEON() && 10949 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 10950 if (SplatBitSize <= 64) { 10951 EVT VorrVT; 10952 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 10953 SplatUndef.getZExtValue(), SplatBitSize, 10954 DAG, dl, VorrVT, VT.is128BitVector(), 10955 OtherModImm); 10956 if (Val.getNode()) { 10957 SDValue Input = 10958 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 10959 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 10960 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 10961 } 10962 } 10963 } 10964 10965 if (!Subtarget->isThumb1Only()) { 10966 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 10967 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 10968 return Result; 10969 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 10970 return Result; 10971 } 10972 10973 SDValue N0 = N->getOperand(0); 10974 SDValue N1 = N->getOperand(1); 10975 10976 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 10977 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 10978 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 10979 10980 // The code below optimizes (or (and X, Y), Z). 10981 // The AND operand needs to have a single user to make these optimizations 10982 // profitable. 10983 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 10984 return SDValue(); 10985 10986 APInt SplatUndef; 10987 unsigned SplatBitSize; 10988 bool HasAnyUndefs; 10989 10990 APInt SplatBits0, SplatBits1; 10991 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 10992 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 10993 // Ensure that the second operand of both ands are constants 10994 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 10995 HasAnyUndefs) && !HasAnyUndefs) { 10996 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 10997 HasAnyUndefs) && !HasAnyUndefs) { 10998 // Ensure that the bit width of the constants are the same and that 10999 // the splat arguments are logical inverses as per the pattern we 11000 // are trying to simplify. 11001 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 11002 SplatBits0 == ~SplatBits1) { 11003 // Canonicalize the vector type to make instruction selection 11004 // simpler. 11005 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 11006 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 11007 N0->getOperand(1), 11008 N0->getOperand(0), 11009 N1->getOperand(0)); 11010 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 11011 } 11012 } 11013 } 11014 } 11015 11016 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 11017 // reasonable. 
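  // For instance (illustrative): (or (and A, 0xffff00ff), (and B, 0x0000ff00))
  // becomes (ARMbfi A, (srl B, 8), 0xffff00ff), i.e. bits [15:8] of B are
  // copied into bits [15:8] of A.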
11018 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 11019 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 11020 return Res; 11021 } 11022 11023 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11024 return Result; 11025 11026 return SDValue(); 11027 } 11028 11029 static SDValue PerformXORCombine(SDNode *N, 11030 TargetLowering::DAGCombinerInfo &DCI, 11031 const ARMSubtarget *Subtarget) { 11032 EVT VT = N->getValueType(0); 11033 SelectionDAG &DAG = DCI.DAG; 11034 11035 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11036 return SDValue(); 11037 11038 if (!Subtarget->isThumb1Only()) { 11039 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 11040 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 11041 return Result; 11042 11043 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11044 return Result; 11045 } 11046 11047 return SDValue(); 11048 } 11049 11050 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 11051 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 11052 // their position in "to" (Rd). 11053 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 11054 assert(N->getOpcode() == ARMISD::BFI); 11055 11056 SDValue From = N->getOperand(1); 11057 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 11058 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 11059 11060 // If the Base came from a SHR #C, we can deduce that it is really testing bit 11061 // #C in the base of the SHR. 11062 if (From->getOpcode() == ISD::SRL && 11063 isa<ConstantSDNode>(From->getOperand(1))) { 11064 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 11065 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 11066 FromMask <<= Shift.getLimitedValue(31); 11067 From = From->getOperand(0); 11068 } 11069 11070 return From; 11071 } 11072 11073 // If A and B contain one contiguous set of bits, does A | B == A . B? 11074 // 11075 // Neither A nor B must be zero. 11076 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 11077 unsigned LastActiveBitInA = A.countTrailingZeros(); 11078 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 11079 return LastActiveBitInA - 1 == FirstActiveBitInB; 11080 } 11081 11082 static SDValue FindBFIToCombineWith(SDNode *N) { 11083 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 11084 // if one exists. 11085 APInt ToMask, FromMask; 11086 SDValue From = ParseBFI(N, ToMask, FromMask); 11087 SDValue To = N->getOperand(0); 11088 11089 // Now check for a compatible BFI to merge with. We can pass through BFIs that 11090 // aren't compatible, but not if they set the same bit in their destination as 11091 // we do (or that of any BFI we're going to combine with). 11092 SDValue V = To; 11093 APInt CombinedToMask = ToMask; 11094 while (V.getOpcode() == ARMISD::BFI) { 11095 APInt NewToMask, NewFromMask; 11096 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 11097 if (NewFrom != From) { 11098 // This BFI has a different base. Keep going. 11099 CombinedToMask |= NewToMask; 11100 V = V.getOperand(0); 11101 continue; 11102 } 11103 11104 // Do the written bits conflict with any we've seen so far? 11105 if ((NewToMask & CombinedToMask).getBoolValue()) 11106 // Conflicting bits - bail out because going further is unsafe. 
11107 return SDValue(); 11108 11109 // Are the new bits contiguous when combined with the old bits? 11110 if (BitsProperlyConcatenate(ToMask, NewToMask) && 11111 BitsProperlyConcatenate(FromMask, NewFromMask)) 11112 return V; 11113 if (BitsProperlyConcatenate(NewToMask, ToMask) && 11114 BitsProperlyConcatenate(NewFromMask, FromMask)) 11115 return V; 11116 11117 // We've seen a write to some bits, so track it. 11118 CombinedToMask |= NewToMask; 11119 // Keep going... 11120 V = V.getOperand(0); 11121 } 11122 11123 return SDValue(); 11124 } 11125 11126 static SDValue PerformBFICombine(SDNode *N, 11127 TargetLowering::DAGCombinerInfo &DCI) { 11128 SDValue N1 = N->getOperand(1); 11129 if (N1.getOpcode() == ISD::AND) { 11130 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 11131 // the bits being cleared by the AND are not demanded by the BFI. 11132 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 11133 if (!N11C) 11134 return SDValue(); 11135 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 11136 unsigned LSB = countTrailingZeros(~InvMask); 11137 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 11138 assert(Width < 11139 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 11140 "undefined behavior"); 11141 unsigned Mask = (1u << Width) - 1; 11142 unsigned Mask2 = N11C->getZExtValue(); 11143 if ((Mask & (~Mask2)) == 0) 11144 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 11145 N->getOperand(0), N1.getOperand(0), 11146 N->getOperand(2)); 11147 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 11148 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 11149 // Keep track of any consecutive bits set that all come from the same base 11150 // value. We can combine these together into a single BFI. 11151 SDValue CombineBFI = FindBFIToCombineWith(N); 11152 if (CombineBFI == SDValue()) 11153 return SDValue(); 11154 11155 // We've found a BFI. 11156 APInt ToMask1, FromMask1; 11157 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 11158 11159 APInt ToMask2, FromMask2; 11160 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 11161 assert(From1 == From2); 11162 (void)From2; 11163 11164 // First, unlink CombineBFI. 11165 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 11166 // Then create a new BFI, combining the two together. 11167 APInt NewFromMask = FromMask1 | FromMask2; 11168 APInt NewToMask = ToMask1 | ToMask2; 11169 11170 EVT VT = N->getValueType(0); 11171 SDLoc dl(N); 11172 11173 if (NewFromMask[0] == 0) 11174 From1 = DCI.DAG.getNode( 11175 ISD::SRL, dl, VT, From1, 11176 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 11177 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 11178 DCI.DAG.getConstant(~NewToMask, dl, VT)); 11179 } 11180 return SDValue(); 11181 } 11182 11183 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 11184 /// ARMISD::VMOVRRD. 
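/// For example, (vmovrrd (vmovdrr x, y)) simply becomes x, y; a VMOVRRD of an
/// f64 frame-index load is likewise split into two i32 loads (see the code
/// below).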
11185 static SDValue PerformVMOVRRDCombine(SDNode *N, 11186 TargetLowering::DAGCombinerInfo &DCI, 11187 const ARMSubtarget *Subtarget) { 11188 // vmovrrd(vmovdrr x, y) -> x,y 11189 SDValue InDouble = N->getOperand(0); 11190 if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) 11191 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 11192 11193 // vmovrrd(load f64) -> (load i32), (load i32) 11194 SDNode *InNode = InDouble.getNode(); 11195 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 11196 InNode->getValueType(0) == MVT::f64 && 11197 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 11198 !cast<LoadSDNode>(InNode)->isVolatile()) { 11199 // TODO: Should this be done for non-FrameIndex operands? 11200 LoadSDNode *LD = cast<LoadSDNode>(InNode); 11201 11202 SelectionDAG &DAG = DCI.DAG; 11203 SDLoc DL(LD); 11204 SDValue BasePtr = LD->getBasePtr(); 11205 SDValue NewLD1 = 11206 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 11207 LD->getAlignment(), LD->getMemOperand()->getFlags()); 11208 11209 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 11210 DAG.getConstant(4, DL, MVT::i32)); 11211 SDValue NewLD2 = DAG.getLoad( 11212 MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), 11213 std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); 11214 11215 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 11216 if (DCI.DAG.getDataLayout().isBigEndian()) 11217 std::swap (NewLD1, NewLD2); 11218 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 11219 return Result; 11220 } 11221 11222 return SDValue(); 11223 } 11224 11225 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 11226 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 11227 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 11228 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 11229 SDValue Op0 = N->getOperand(0); 11230 SDValue Op1 = N->getOperand(1); 11231 if (Op0.getOpcode() == ISD::BITCAST) 11232 Op0 = Op0.getOperand(0); 11233 if (Op1.getOpcode() == ISD::BITCAST) 11234 Op1 = Op1.getOperand(0); 11235 if (Op0.getOpcode() == ARMISD::VMOVRRD && 11236 Op0.getNode() == Op1.getNode() && 11237 Op0.getResNo() == 0 && Op1.getResNo() == 1) 11238 return DAG.getNode(ISD::BITCAST, SDLoc(N), 11239 N->getValueType(0), Op0.getOperand(0)); 11240 return SDValue(); 11241 } 11242 11243 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 11244 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 11245 /// i64 vector to have f64 elements, since the value can then be loaded 11246 /// directly into a VFP register. 11247 static bool hasNormalLoadOperand(SDNode *N) { 11248 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 11249 for (unsigned i = 0; i < NumElts; ++i) { 11250 SDNode *Elt = N->getOperand(i).getNode(); 11251 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 11252 return true; 11253 } 11254 return false; 11255 } 11256 11257 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 11258 /// ISD::BUILD_VECTOR. 11259 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 11260 TargetLowering::DAGCombinerInfo &DCI, 11261 const ARMSubtarget *Subtarget) { 11262 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 11263 // VMOVRRD is introduced when legalizing i64 types. 
It forces the i64 value 11264 // into a pair of GPRs, which is fine when the value is used as a scalar, 11265 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 11266 SelectionDAG &DAG = DCI.DAG; 11267 if (N->getNumOperands() == 2) 11268 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 11269 return RV; 11270 11271 // Load i64 elements as f64 values so that type legalization does not split 11272 // them up into i32 values. 11273 EVT VT = N->getValueType(0); 11274 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 11275 return SDValue(); 11276 SDLoc dl(N); 11277 SmallVector<SDValue, 8> Ops; 11278 unsigned NumElts = VT.getVectorNumElements(); 11279 for (unsigned i = 0; i < NumElts; ++i) { 11280 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 11281 Ops.push_back(V); 11282 // Make the DAGCombiner fold the bitcast. 11283 DCI.AddToWorklist(V.getNode()); 11284 } 11285 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 11286 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 11287 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 11288 } 11289 11290 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 11291 static SDValue 11292 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 11293 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 11294 // At that time, we may have inserted bitcasts from integer to float. 11295 // If these bitcasts have survived DAGCombine, change the lowering of this 11296 // BUILD_VECTOR in something more vector friendly, i.e., that does not 11297 // force to use floating point types. 11298 11299 // Make sure we can change the type of the vector. 11300 // This is possible iff: 11301 // 1. The vector is only used in a bitcast to a integer type. I.e., 11302 // 1.1. Vector is used only once. 11303 // 1.2. Use is a bit convert to an integer type. 11304 // 2. The size of its operands are 32-bits (64-bits are not legal). 11305 EVT VT = N->getValueType(0); 11306 EVT EltVT = VT.getVectorElementType(); 11307 11308 // Check 1.1. and 2. 11309 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 11310 return SDValue(); 11311 11312 // By construction, the input type must be float. 11313 assert(EltVT == MVT::f32 && "Unexpected type!"); 11314 11315 // Check 1.2. 11316 SDNode *Use = *N->use_begin(); 11317 if (Use->getOpcode() != ISD::BITCAST || 11318 Use->getValueType(0).isFloatingPoint()) 11319 return SDValue(); 11320 11321 // Check profitability. 11322 // Model is, if more than half of the relevant operands are bitcast from 11323 // i32, turn the build_vector into a sequence of insert_vector_elt. 11324 // Relevant operands are everything that is not statically 11325 // (i.e., at compile time) bitcasted. 11326 unsigned NumOfBitCastedElts = 0; 11327 unsigned NumElts = VT.getVectorNumElements(); 11328 unsigned NumOfRelevantElts = NumElts; 11329 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 11330 SDValue Elt = N->getOperand(Idx); 11331 if (Elt->getOpcode() == ISD::BITCAST) { 11332 // Assume only bit cast to i32 will go away. 11333 if (Elt->getOperand(0).getValueType() == MVT::i32) 11334 ++NumOfBitCastedElts; 11335 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 11336 // Constants are statically casted, thus do not count them as 11337 // relevant operands. 11338 --NumOfRelevantElts; 11339 } 11340 11341 // Check if more than half of the elements require a non-free bitcast. 
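  // (E.g. with four relevant operands of which three are bitcast from i32,
  // 3 > 4/2 and the rewrite below proceeds; with only two such operands the
  // node is left alone.)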
11342 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 11343 return SDValue(); 11344 11345 SelectionDAG &DAG = DCI.DAG; 11346 // Create the new vector type. 11347 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 11348 // Check if the type is legal. 11349 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11350 if (!TLI.isTypeLegal(VecVT)) 11351 return SDValue(); 11352 11353 // Combine: 11354 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 11355 // => BITCAST INSERT_VECTOR_ELT 11356 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 11357 // (BITCAST EN), N. 11358 SDValue Vec = DAG.getUNDEF(VecVT); 11359 SDLoc dl(N); 11360 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 11361 SDValue V = N->getOperand(Idx); 11362 if (V.isUndef()) 11363 continue; 11364 if (V.getOpcode() == ISD::BITCAST && 11365 V->getOperand(0).getValueType() == MVT::i32) 11366 // Fold obvious case. 11367 V = V.getOperand(0); 11368 else { 11369 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 11370 // Make the DAGCombiner fold the bitcasts. 11371 DCI.AddToWorklist(V.getNode()); 11372 } 11373 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 11374 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 11375 } 11376 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 11377 // Make the DAGCombiner fold the bitcasts. 11378 DCI.AddToWorklist(Vec.getNode()); 11379 return Vec; 11380 } 11381 11382 /// PerformInsertEltCombine - Target-specific dag combine xforms for 11383 /// ISD::INSERT_VECTOR_ELT. 11384 static SDValue PerformInsertEltCombine(SDNode *N, 11385 TargetLowering::DAGCombinerInfo &DCI) { 11386 // Bitcast an i64 load inserted into a vector to f64. 11387 // Otherwise, the i64 value will be legalized to a pair of i32 values. 11388 EVT VT = N->getValueType(0); 11389 SDNode *Elt = N->getOperand(1).getNode(); 11390 if (VT.getVectorElementType() != MVT::i64 || 11391 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 11392 return SDValue(); 11393 11394 SelectionDAG &DAG = DCI.DAG; 11395 SDLoc dl(N); 11396 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 11397 VT.getVectorNumElements()); 11398 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 11399 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 11400 // Make the DAGCombiner fold the bitcasts. 11401 DCI.AddToWorklist(Vec.getNode()); 11402 DCI.AddToWorklist(V.getNode()); 11403 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 11404 Vec, V, N->getOperand(2)); 11405 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 11406 } 11407 11408 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 11409 /// ISD::VECTOR_SHUFFLE. 11410 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 11411 // The LLVM shufflevector instruction does not require the shuffle mask 11412 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 11413 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 11414 // operands do not match the mask length, they are extended by concatenating 11415 // them with undef vectors. That is probably the right thing for other 11416 // targets, but for NEON it is better to concatenate two double-register 11417 // size vector operands into a single quad-register size vector. 
Do that 11418 // transformation here: 11419 // shuffle(concat(v1, undef), concat(v2, undef)) -> 11420 // shuffle(concat(v1, v2), undef) 11421 SDValue Op0 = N->getOperand(0); 11422 SDValue Op1 = N->getOperand(1); 11423 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 11424 Op1.getOpcode() != ISD::CONCAT_VECTORS || 11425 Op0.getNumOperands() != 2 || 11426 Op1.getNumOperands() != 2) 11427 return SDValue(); 11428 SDValue Concat0Op1 = Op0.getOperand(1); 11429 SDValue Concat1Op1 = Op1.getOperand(1); 11430 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 11431 return SDValue(); 11432 // Skip the transformation if any of the types are illegal. 11433 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11434 EVT VT = N->getValueType(0); 11435 if (!TLI.isTypeLegal(VT) || 11436 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 11437 !TLI.isTypeLegal(Concat1Op1.getValueType())) 11438 return SDValue(); 11439 11440 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 11441 Op0.getOperand(0), Op1.getOperand(0)); 11442 // Translate the shuffle mask. 11443 SmallVector<int, 16> NewMask; 11444 unsigned NumElts = VT.getVectorNumElements(); 11445 unsigned HalfElts = NumElts/2; 11446 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 11447 for (unsigned n = 0; n < NumElts; ++n) { 11448 int MaskElt = SVN->getMaskElt(n); 11449 int NewElt = -1; 11450 if (MaskElt < (int)HalfElts) 11451 NewElt = MaskElt; 11452 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 11453 NewElt = HalfElts + MaskElt - NumElts; 11454 NewMask.push_back(NewElt); 11455 } 11456 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 11457 DAG.getUNDEF(VT), NewMask); 11458 } 11459 11460 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 11461 /// NEON load/store intrinsics, and generic vector load/stores, to merge 11462 /// base address updates. 11463 /// For generic load/stores, the memory type is assumed to be a vector. 11464 /// The caller is assumed to have checked legality. 11465 static SDValue CombineBaseUpdate(SDNode *N, 11466 TargetLowering::DAGCombinerInfo &DCI) { 11467 SelectionDAG &DAG = DCI.DAG; 11468 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 11469 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 11470 const bool isStore = N->getOpcode() == ISD::STORE; 11471 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 11472 SDValue Addr = N->getOperand(AddrOpIdx); 11473 MemSDNode *MemN = cast<MemSDNode>(N); 11474 SDLoc dl(N); 11475 11476 // Search for a use of the address operand that is an increment. 11477 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 11478 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 11479 SDNode *User = *UI; 11480 if (User->getOpcode() != ISD::ADD || 11481 UI.getUse().getResNo() != Addr.getResNo()) 11482 continue; 11483 11484 // Check that the add is independent of the load/store. Otherwise, folding 11485 // it would create a cycle. 11486 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 11487 continue; 11488 11489 // Find the new opcode for the updating load/store. 
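    // (The end goal, for illustration: a vld1.32 {d0}, [r0] whose address is
    // also fed to "add r0, r0, #8" becomes the post-incremented form
    // "vld1.32 {d0}, [r0]!", i.e. an ARMISD::VLD1_UPD node.)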
11490 bool isLoadOp = true; 11491 bool isLaneOp = false; 11492 unsigned NewOpc = 0; 11493 unsigned NumVecs = 0; 11494 if (isIntrinsic) { 11495 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 11496 switch (IntNo) { 11497 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 11498 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 11499 NumVecs = 1; break; 11500 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 11501 NumVecs = 2; break; 11502 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 11503 NumVecs = 3; break; 11504 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 11505 NumVecs = 4; break; 11506 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 11507 NumVecs = 2; isLaneOp = true; break; 11508 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 11509 NumVecs = 3; isLaneOp = true; break; 11510 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 11511 NumVecs = 4; isLaneOp = true; break; 11512 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 11513 NumVecs = 1; isLoadOp = false; break; 11514 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 11515 NumVecs = 2; isLoadOp = false; break; 11516 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 11517 NumVecs = 3; isLoadOp = false; break; 11518 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 11519 NumVecs = 4; isLoadOp = false; break; 11520 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 11521 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 11522 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 11523 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 11524 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 11525 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 11526 } 11527 } else { 11528 isLaneOp = true; 11529 switch (N->getOpcode()) { 11530 default: llvm_unreachable("unexpected opcode for Neon base update"); 11531 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 11532 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 11533 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 11534 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 11535 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 11536 NumVecs = 1; isLaneOp = false; break; 11537 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 11538 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 11539 } 11540 } 11541 11542 // Find the size of memory referenced by the load/store. 11543 EVT VecTy; 11544 if (isLoadOp) { 11545 VecTy = N->getValueType(0); 11546 } else if (isIntrinsic) { 11547 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 11548 } else { 11549 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 11550 VecTy = N->getOperand(1).getValueType(); 11551 } 11552 11553 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 11554 if (isLaneOp) 11555 NumBytes /= VecTy.getVectorNumElements(); 11556 11557 // If the increment is a constant, it must match the memory ref size. 11558 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 11559 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 11560 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 11561 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 11562 // separate instructions that make it harder to use a non-constant update. 
11563 continue; 11564 } 11565 11566 // OK, we found an ADD we can fold into the base update. 11567 // Now, create a _UPD node, taking care of not breaking alignment. 11568 11569 EVT AlignedVecTy = VecTy; 11570 unsigned Alignment = MemN->getAlignment(); 11571 11572 // If this is a less-than-standard-aligned load/store, change the type to 11573 // match the standard alignment. 11574 // The alignment is overlooked when selecting _UPD variants; and it's 11575 // easier to introduce bitcasts here than fix that. 11576 // There are 3 ways to get to this base-update combine: 11577 // - intrinsics: they are assumed to be properly aligned (to the standard 11578 // alignment of the memory type), so we don't need to do anything. 11579 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 11580 // intrinsics, so, likewise, there's nothing to do. 11581 // - generic load/store instructions: the alignment is specified as an 11582 // explicit operand, rather than implicitly as the standard alignment 11583 // of the memory type (like the intrisics). We need to change the 11584 // memory type to match the explicit alignment. That way, we don't 11585 // generate non-standard-aligned ARMISD::VLDx nodes. 11586 if (isa<LSBaseSDNode>(N)) { 11587 if (Alignment == 0) 11588 Alignment = 1; 11589 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 11590 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 11591 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 11592 assert(!isLaneOp && "Unexpected generic load/store lane."); 11593 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 11594 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 11595 } 11596 // Don't set an explicit alignment on regular load/stores that we want 11597 // to transform to VLD/VST 1_UPD nodes. 11598 // This matches the behavior of regular load/stores, which only get an 11599 // explicit alignment if the MMO alignment is larger than the standard 11600 // alignment of the memory type. 11601 // Intrinsics, however, always get an explicit alignment, set to the 11602 // alignment of the MMO. 11603 Alignment = 1; 11604 } 11605 11606 // Create the new updating load/store node. 11607 // First, create an SDVTList for the new updating node's results. 11608 EVT Tys[6]; 11609 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 11610 unsigned n; 11611 for (n = 0; n < NumResultVecs; ++n) 11612 Tys[n] = AlignedVecTy; 11613 Tys[n++] = MVT::i32; 11614 Tys[n] = MVT::Other; 11615 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 11616 11617 // Then, gather the new node's operands. 11618 SmallVector<SDValue, 8> Ops; 11619 Ops.push_back(N->getOperand(0)); // incoming chain 11620 Ops.push_back(N->getOperand(AddrOpIdx)); 11621 Ops.push_back(Inc); 11622 11623 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 11624 // Try to match the intrinsic's signature 11625 Ops.push_back(StN->getValue()); 11626 } else { 11627 // Loads (and of course intrinsics) match the intrinsics' signature, 11628 // so just add all but the alignment operand. 11629 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) 11630 Ops.push_back(N->getOperand(i)); 11631 } 11632 11633 // For all node types, the alignment operand is always the last one. 11634 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 11635 11636 // If this is a non-standard-aligned STORE, the penultimate operand is the 11637 // stored value. Bitcast it to the aligned type. 
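    // (E.g. a v2i32 store with only 2-byte alignment was given AlignedVecTy =
    // v4i16 above, so the stored value is rebuilt as v4i16 here.)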
11638 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 11639 SDValue &StVal = Ops[Ops.size()-2]; 11640 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 11641 } 11642 11643 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 11644 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 11645 MemN->getMemOperand()); 11646 11647 // Update the uses. 11648 SmallVector<SDValue, 5> NewResults; 11649 for (unsigned i = 0; i < NumResultVecs; ++i) 11650 NewResults.push_back(SDValue(UpdN.getNode(), i)); 11651 11652 // If this is an non-standard-aligned LOAD, the first result is the loaded 11653 // value. Bitcast it to the expected result type. 11654 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { 11655 SDValue &LdVal = NewResults[0]; 11656 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); 11657 } 11658 11659 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain 11660 DCI.CombineTo(N, NewResults); 11661 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 11662 11663 break; 11664 } 11665 return SDValue(); 11666 } 11667 11668 static SDValue PerformVLDCombine(SDNode *N, 11669 TargetLowering::DAGCombinerInfo &DCI) { 11670 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11671 return SDValue(); 11672 11673 return CombineBaseUpdate(N, DCI); 11674 } 11675 11676 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a 11677 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic 11678 /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and 11679 /// return true. 11680 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 11681 SelectionDAG &DAG = DCI.DAG; 11682 EVT VT = N->getValueType(0); 11683 // vldN-dup instructions only support 64-bit vectors for N > 1. 11684 if (!VT.is64BitVector()) 11685 return false; 11686 11687 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 11688 SDNode *VLD = N->getOperand(0).getNode(); 11689 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 11690 return false; 11691 unsigned NumVecs = 0; 11692 unsigned NewOpc = 0; 11693 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 11694 if (IntNo == Intrinsic::arm_neon_vld2lane) { 11695 NumVecs = 2; 11696 NewOpc = ARMISD::VLD2DUP; 11697 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 11698 NumVecs = 3; 11699 NewOpc = ARMISD::VLD3DUP; 11700 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 11701 NumVecs = 4; 11702 NewOpc = ARMISD::VLD4DUP; 11703 } else { 11704 return false; 11705 } 11706 11707 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 11708 // numbers match the load. 11709 unsigned VLDLaneNo = 11710 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue(); 11711 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 11712 UI != UE; ++UI) { 11713 // Ignore uses of the chain result. 11714 if (UI.getUse().getResNo() == NumVecs) 11715 continue; 11716 SDNode *User = *UI; 11717 if (User->getOpcode() != ARMISD::VDUPLANE || 11718 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 11719 return false; 11720 } 11721 11722 // Create the vldN-dup node. 
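  // (For illustration: a vld2lane whose two vector results each feed a
  // VDUPLANE of the matching lane becomes a single VLD2DUP, i.e. an all-lanes
  // load such as "vld2.16 {d0[], d1[]}, [r0]".)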
11723 EVT Tys[5]; 11724 unsigned n; 11725 for (n = 0; n < NumVecs; ++n) 11726 Tys[n] = VT; 11727 Tys[n] = MVT::Other; 11728 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 11729 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 11730 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 11731 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 11732 Ops, VLDMemInt->getMemoryVT(), 11733 VLDMemInt->getMemOperand()); 11734 11735 // Update the uses. 11736 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 11737 UI != UE; ++UI) { 11738 unsigned ResNo = UI.getUse().getResNo(); 11739 // Ignore uses of the chain result. 11740 if (ResNo == NumVecs) 11741 continue; 11742 SDNode *User = *UI; 11743 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 11744 } 11745 11746 // Now the vldN-lane intrinsic is dead except for its chain result. 11747 // Update uses of the chain. 11748 std::vector<SDValue> VLDDupResults; 11749 for (unsigned n = 0; n < NumVecs; ++n) 11750 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 11751 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 11752 DCI.CombineTo(VLD, VLDDupResults); 11753 11754 return true; 11755 } 11756 11757 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 11758 /// ARMISD::VDUPLANE. 11759 static SDValue PerformVDUPLANECombine(SDNode *N, 11760 TargetLowering::DAGCombinerInfo &DCI) { 11761 SDValue Op = N->getOperand(0); 11762 11763 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 11764 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 11765 if (CombineVLDDUP(N, DCI)) 11766 return SDValue(N, 0); 11767 11768 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 11769 // redundant. Ignore bit_converts for now; element sizes are checked below. 11770 while (Op.getOpcode() == ISD::BITCAST) 11771 Op = Op.getOperand(0); 11772 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 11773 return SDValue(); 11774 11775 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 11776 unsigned EltSize = Op.getScalarValueSizeInBits(); 11777 // The canonical VMOV for a zero vector uses a 32-bit element size. 11778 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11779 unsigned EltBits; 11780 if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) 11781 EltSize = 8; 11782 EVT VT = N->getValueType(0); 11783 if (EltSize > VT.getScalarSizeInBits()) 11784 return SDValue(); 11785 11786 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 11787 } 11788 11789 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 11790 static SDValue PerformVDUPCombine(SDNode *N, 11791 TargetLowering::DAGCombinerInfo &DCI) { 11792 SelectionDAG &DAG = DCI.DAG; 11793 SDValue Op = N->getOperand(0); 11794 11795 // Match VDUP(LOAD) -> VLD1DUP. 11796 // We match this pattern here rather than waiting for isel because the 11797 // transform is only legal for unindexed loads. 
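  // (E.g. (v4i16 (ARMISD::VDUP (load i16))) becomes a VLD1DUP node, i.e.
  // "vld1.16 {d0[]}, [r0]", provided the loaded value has no other users.)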
11798 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 11799 if (LD && Op.hasOneUse() && LD->isUnindexed() && 11800 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 11801 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 11802 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 11803 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 11804 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 11805 Ops, LD->getMemoryVT(), 11806 LD->getMemOperand()); 11807 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 11808 return VLDDup; 11809 } 11810 11811 return SDValue(); 11812 } 11813 11814 static SDValue PerformLOADCombine(SDNode *N, 11815 TargetLowering::DAGCombinerInfo &DCI) { 11816 EVT VT = N->getValueType(0); 11817 11818 // If this is a legal vector load, try to combine it into a VLD1_UPD. 11819 if (ISD::isNormalLoad(N) && VT.isVector() && 11820 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11821 return CombineBaseUpdate(N, DCI); 11822 11823 return SDValue(); 11824 } 11825 11826 /// PerformSTORECombine - Target-specific dag combine xforms for 11827 /// ISD::STORE. 11828 static SDValue PerformSTORECombine(SDNode *N, 11829 TargetLowering::DAGCombinerInfo &DCI) { 11830 StoreSDNode *St = cast<StoreSDNode>(N); 11831 if (St->isVolatile()) 11832 return SDValue(); 11833 11834 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 11835 // pack all of the elements in one place. Next, store to memory in fewer 11836 // chunks. 11837 SDValue StVal = St->getValue(); 11838 EVT VT = StVal.getValueType(); 11839 if (St->isTruncatingStore() && VT.isVector()) { 11840 SelectionDAG &DAG = DCI.DAG; 11841 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11842 EVT StVT = St->getMemoryVT(); 11843 unsigned NumElems = VT.getVectorNumElements(); 11844 assert(StVT != VT && "Cannot truncate to the same type"); 11845 unsigned FromEltSz = VT.getScalarSizeInBits(); 11846 unsigned ToEltSz = StVT.getScalarSizeInBits(); 11847 11848 // From, To sizes and ElemCount must be pow of two 11849 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); 11850 11851 // We are going to use the original vector elt for storing. 11852 // Accumulated smaller vector elements must be a multiple of the store size. 11853 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); 11854 11855 unsigned SizeRatio = FromEltSz / ToEltSz; 11856 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 11857 11858 // Create a type on which we perform the shuffle. 11859 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 11860 NumElems*SizeRatio); 11861 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 11862 11863 SDLoc DL(St); 11864 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 11865 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 11866 for (unsigned i = 0; i < NumElems; ++i) 11867 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() 11868 ? (i + 1) * SizeRatio - 1 11869 : i * SizeRatio; 11870 11871 // Can't shuffle using an illegal type. 11872 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 11873 11874 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, 11875 DAG.getUNDEF(WideVec.getValueType()), 11876 ShuffleVec); 11877 // At this point all of the data is stored at the bottom of the 11878 // register. We now need to save it to mem. 
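// The useful data now occupies only the low NumElems * ToEltSz bits of the
// shuffled register, so it can be written out with one or more ordinary
// integer stores of the widest legal type (chosen below).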
11879 11880 // Find the largest store unit 11881 MVT StoreType = MVT::i8; 11882 for (MVT Tp : MVT::integer_valuetypes()) { 11883 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 11884 StoreType = Tp; 11885 } 11886 // Didn't find a legal store type. 11887 if (!TLI.isTypeLegal(StoreType)) 11888 return SDValue(); 11889 11890 // Bitcast the original vector into a vector of store-size units 11891 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 11892 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 11893 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 11894 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 11895 SmallVector<SDValue, 8> Chains; 11896 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 11897 TLI.getPointerTy(DAG.getDataLayout())); 11898 SDValue BasePtr = St->getBasePtr(); 11899 11900 // Perform one or more big stores into memory. 11901 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); 11902 for (unsigned I = 0; I < E; I++) { 11903 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 11904 StoreType, ShuffWide, 11905 DAG.getIntPtrConstant(I, DL)); 11906 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, 11907 St->getPointerInfo(), St->getAlignment(), 11908 St->getMemOperand()->getFlags()); 11909 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, 11910 Increment); 11911 Chains.push_back(Ch); 11912 } 11913 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 11914 } 11915 11916 if (!ISD::isNormalStore(St)) 11917 return SDValue(); 11918 11919 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 11920 // ARM stores of arguments in the same cache line. 11921 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 11922 StVal.getNode()->hasOneUse()) { 11923 SelectionDAG &DAG = DCI.DAG; 11924 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 11925 SDLoc DL(St); 11926 SDValue BasePtr = St->getBasePtr(); 11927 SDValue NewST1 = DAG.getStore( 11928 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 11929 BasePtr, St->getPointerInfo(), St->getAlignment(), 11930 St->getMemOperand()->getFlags()); 11931 11932 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 11933 DAG.getConstant(4, DL, MVT::i32)); 11934 return DAG.getStore(NewST1.getValue(0), DL, 11935 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 11936 OffsetPtr, St->getPointerInfo(), 11937 std::min(4U, St->getAlignment() / 2), 11938 St->getMemOperand()->getFlags()); 11939 } 11940 11941 if (StVal.getValueType() == MVT::i64 && 11942 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 11943 11944 // Bitcast an i64 store extracted from a vector to f64. 11945 // Otherwise, the i64 value will be legalized to a pair of i32 values. 11946 SelectionDAG &DAG = DCI.DAG; 11947 SDLoc dl(StVal); 11948 SDValue IntVec = StVal.getOperand(0); 11949 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 11950 IntVec.getValueType().getVectorNumElements()); 11951 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 11952 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 11953 Vec, StVal.getOperand(1)); 11954 dl = SDLoc(N); 11955 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 11956 // Make the DAGCombiner fold the bitcasts. 
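// Adding the intermediate nodes to the worklist lets the generic DAGCombiner
// revisit them and fold the back-to-back bitcasts away.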
11957 DCI.AddToWorklist(Vec.getNode());
11958 DCI.AddToWorklist(ExtElt.getNode());
11959 DCI.AddToWorklist(V.getNode());
11960 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
11961 St->getPointerInfo(), St->getAlignment(),
11962 St->getMemOperand()->getFlags(), St->getAAInfo());
11963 }
11964
11965 // If this is a legal vector store, try to combine it into a VST1_UPD.
11966 if (ISD::isNormalStore(N) && VT.isVector() &&
11967 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
11968 return CombineBaseUpdate(N, DCI);
11969
11970 return SDValue();
11971 }
11972
11973 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
11974 /// can replace combinations of VMUL and VCVT (floating-point to integer)
11975 /// when the VMUL has a constant operand that is a power of 2.
11976 ///
11977 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
11978 /// vmul.f32 d16, d17, d16
11979 /// vcvt.s32.f32 d16, d16
11980 /// becomes:
11981 /// vcvt.s32.f32 d16, d16, #3
11982 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
11983 const ARMSubtarget *Subtarget) {
11984 if (!Subtarget->hasNEON())
11985 return SDValue();
11986
11987 SDValue Op = N->getOperand(0);
11988 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
11989 Op.getOpcode() != ISD::FMUL)
11990 return SDValue();
11991
11992 SDValue ConstVec = Op->getOperand(1);
11993 if (!isa<BuildVectorSDNode>(ConstVec))
11994 return SDValue();
11995
11996 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
11997 uint32_t FloatBits = FloatTy.getSizeInBits();
11998 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
11999 uint32_t IntBits = IntTy.getSizeInBits();
12000 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12001 if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
12002 // These instructions only exist converting from f32 to i32. We can handle
12003 // smaller integers by generating an extra truncate, but larger ones would
12004 // be lossy. We also can't handle more than 4 lanes, since these instructions
12005 // only support v2i32/v4i32 types.
12006 return SDValue();
12007 }
12008
12009 BitVector UndefElements;
12010 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12011 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
12012 if (C == -1 || C == 0 || C > 32)
12013 return SDValue();
12014
12015 SDLoc dl(N);
12016 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
12017 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
12018 Intrinsic::arm_neon_vcvtfp2fxu;
12019 SDValue FixConv = DAG.getNode(
12020 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
12021 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
12022 DAG.getConstant(C, dl, MVT::i32));
12023
12024 if (IntBits < FloatBits)
12025 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
12026
12027 return FixConv;
12028 }
12029
12030 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
12031 /// can replace combinations of VCVT (integer to floating-point) and VDIV
12032 /// when the VDIV has a constant operand that is a power of 2.
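/// (Dividing by 2^n is folded into the convert as n fractional bits,
/// mirroring the multiply case handled by PerformVCVTCombine above.)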
12033 ///
12034 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
12035 /// vcvt.f32.s32 d16, d16
12036 /// vdiv.f32 d16, d17, d16
12037 /// becomes:
12038 /// vcvt.f32.s32 d16, d16, #3
12039 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
12040 const ARMSubtarget *Subtarget) {
12041 if (!Subtarget->hasNEON())
12042 return SDValue();
12043
12044 SDValue Op = N->getOperand(0);
12045 unsigned OpOpcode = Op.getNode()->getOpcode();
12046 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
12047 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
12048 return SDValue();
12049
12050 SDValue ConstVec = N->getOperand(1);
12051 if (!isa<BuildVectorSDNode>(ConstVec))
12052 return SDValue();
12053
12054 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12055 uint32_t FloatBits = FloatTy.getSizeInBits();
12056 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12057 uint32_t IntBits = IntTy.getSizeInBits();
12058 unsigned NumLanes = Op.getValueType().getVectorNumElements();
12059 if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
12060 // These instructions only exist converting from i32 to f32. We can handle
12061 // smaller integers by generating an extra extend, but larger ones would
12062 // be lossy. We also can't handle more than 4 lanes, since these instructions
12063 // only support v2i32/v4i32 types.
12064 return SDValue();
12065 }
12066
12067 BitVector UndefElements;
12068 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12069 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
12070 if (C == -1 || C == 0 || C > 32)
12071 return SDValue();
12072
12073 SDLoc dl(N);
12074 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
12075 SDValue ConvInput = Op.getOperand(0);
12076 if (IntBits < FloatBits)
12077 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
12078 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
12079 ConvInput);
12080
12081 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
12082 Intrinsic::arm_neon_vcvtfxu2fp;
12083 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
12084 Op.getValueType(),
12085 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
12086 ConvInput, DAG.getConstant(C, dl, MVT::i32));
12087 }
12088
12089 /// getVShiftImm - Check if this is a valid build_vector for the immediate
12090 /// operand of a vector shift operation, where all the elements of the
12091 /// build_vector must have the same constant integer value.
12092 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
12093 // Ignore bit_converts.
12094 while (Op.getOpcode() == ISD::BITCAST)
12095 Op = Op.getOperand(0);
12096 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
12097 APInt SplatBits, SplatUndef;
12098 unsigned SplatBitSize;
12099 bool HasAnyUndefs;
12100 if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
12101 HasAnyUndefs, ElementBits) ||
12102 SplatBitSize > ElementBits)
12103 return false;
12104 Cnt = SplatBits.getSExtValue();
12105 return true;
12106 }
12107
12108 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
12109 /// operand of a vector shift left operation. That value must be in the range:
12110 /// 0 <= Value < ElementBits for a left shift; or
12111 /// 0 <= Value <= ElementBits for a long left shift.
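/// For example, with v8i16 operands a plain immediate left shift must use a
/// count in [0,15], while a long (VSHLL-style) shift may also use exactly 16.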
12112 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 12113 assert(VT.isVector() && "vector shift count is not a vector type"); 12114 int64_t ElementBits = VT.getScalarSizeInBits(); 12115 if (! getVShiftImm(Op, ElementBits, Cnt)) 12116 return false; 12117 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); 12118 } 12119 12120 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 12121 /// operand of a vector shift right operation. For a shift opcode, the value 12122 /// is positive, but for an intrinsic the value count must be negative. The 12123 /// absolute value must be in the range: 12124 /// 1 <= |Value| <= ElementBits for a right shift; or 12125 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 12126 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 12127 int64_t &Cnt) { 12128 assert(VT.isVector() && "vector shift count is not a vector type"); 12129 int64_t ElementBits = VT.getScalarSizeInBits(); 12130 if (! getVShiftImm(Op, ElementBits, Cnt)) 12131 return false; 12132 if (!isIntrinsic) 12133 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); 12134 if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { 12135 Cnt = -Cnt; 12136 return true; 12137 } 12138 return false; 12139 } 12140 12141 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 12142 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 12143 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 12144 switch (IntNo) { 12145 default: 12146 // Don't do anything for most intrinsics. 12147 break; 12148 12149 // Vector shifts: check for immediate versions and lower them. 12150 // Note: This is done during DAG combining instead of DAG legalizing because 12151 // the build_vectors for 64-bit vector element shift counts are generally 12152 // not legal, and it is hard to see their values after they get legalized to 12153 // loads from a constant pool. 12154 case Intrinsic::arm_neon_vshifts: 12155 case Intrinsic::arm_neon_vshiftu: 12156 case Intrinsic::arm_neon_vrshifts: 12157 case Intrinsic::arm_neon_vrshiftu: 12158 case Intrinsic::arm_neon_vrshiftn: 12159 case Intrinsic::arm_neon_vqshifts: 12160 case Intrinsic::arm_neon_vqshiftu: 12161 case Intrinsic::arm_neon_vqshiftsu: 12162 case Intrinsic::arm_neon_vqshiftns: 12163 case Intrinsic::arm_neon_vqshiftnu: 12164 case Intrinsic::arm_neon_vqshiftnsu: 12165 case Intrinsic::arm_neon_vqrshiftns: 12166 case Intrinsic::arm_neon_vqrshiftnu: 12167 case Intrinsic::arm_neon_vqrshiftnsu: { 12168 EVT VT = N->getOperand(1).getValueType(); 12169 int64_t Cnt; 12170 unsigned VShiftOpc = 0; 12171 12172 switch (IntNo) { 12173 case Intrinsic::arm_neon_vshifts: 12174 case Intrinsic::arm_neon_vshiftu: 12175 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 12176 VShiftOpc = ARMISD::VSHL; 12177 break; 12178 } 12179 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 12180 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
12181 ARMISD::VSHRs : ARMISD::VSHRu); 12182 break; 12183 } 12184 return SDValue(); 12185 12186 case Intrinsic::arm_neon_vrshifts: 12187 case Intrinsic::arm_neon_vrshiftu: 12188 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 12189 break; 12190 return SDValue(); 12191 12192 case Intrinsic::arm_neon_vqshifts: 12193 case Intrinsic::arm_neon_vqshiftu: 12194 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 12195 break; 12196 return SDValue(); 12197 12198 case Intrinsic::arm_neon_vqshiftsu: 12199 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 12200 break; 12201 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 12202 12203 case Intrinsic::arm_neon_vrshiftn: 12204 case Intrinsic::arm_neon_vqshiftns: 12205 case Intrinsic::arm_neon_vqshiftnu: 12206 case Intrinsic::arm_neon_vqshiftnsu: 12207 case Intrinsic::arm_neon_vqrshiftns: 12208 case Intrinsic::arm_neon_vqrshiftnu: 12209 case Intrinsic::arm_neon_vqrshiftnsu: 12210 // Narrowing shifts require an immediate right shift. 12211 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 12212 break; 12213 llvm_unreachable("invalid shift count for narrowing vector shift " 12214 "intrinsic"); 12215 12216 default: 12217 llvm_unreachable("unhandled vector shift"); 12218 } 12219 12220 switch (IntNo) { 12221 case Intrinsic::arm_neon_vshifts: 12222 case Intrinsic::arm_neon_vshiftu: 12223 // Opcode already set above. 12224 break; 12225 case Intrinsic::arm_neon_vrshifts: 12226 VShiftOpc = ARMISD::VRSHRs; break; 12227 case Intrinsic::arm_neon_vrshiftu: 12228 VShiftOpc = ARMISD::VRSHRu; break; 12229 case Intrinsic::arm_neon_vrshiftn: 12230 VShiftOpc = ARMISD::VRSHRN; break; 12231 case Intrinsic::arm_neon_vqshifts: 12232 VShiftOpc = ARMISD::VQSHLs; break; 12233 case Intrinsic::arm_neon_vqshiftu: 12234 VShiftOpc = ARMISD::VQSHLu; break; 12235 case Intrinsic::arm_neon_vqshiftsu: 12236 VShiftOpc = ARMISD::VQSHLsu; break; 12237 case Intrinsic::arm_neon_vqshiftns: 12238 VShiftOpc = ARMISD::VQSHRNs; break; 12239 case Intrinsic::arm_neon_vqshiftnu: 12240 VShiftOpc = ARMISD::VQSHRNu; break; 12241 case Intrinsic::arm_neon_vqshiftnsu: 12242 VShiftOpc = ARMISD::VQSHRNsu; break; 12243 case Intrinsic::arm_neon_vqrshiftns: 12244 VShiftOpc = ARMISD::VQRSHRNs; break; 12245 case Intrinsic::arm_neon_vqrshiftnu: 12246 VShiftOpc = ARMISD::VQRSHRNu; break; 12247 case Intrinsic::arm_neon_vqrshiftnsu: 12248 VShiftOpc = ARMISD::VQRSHRNsu; break; 12249 } 12250 12251 SDLoc dl(N); 12252 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 12253 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 12254 } 12255 12256 case Intrinsic::arm_neon_vshiftins: { 12257 EVT VT = N->getOperand(1).getValueType(); 12258 int64_t Cnt; 12259 unsigned VShiftOpc = 0; 12260 12261 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 12262 VShiftOpc = ARMISD::VSLI; 12263 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 12264 VShiftOpc = ARMISD::VSRI; 12265 else { 12266 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 12267 } 12268 12269 SDLoc dl(N); 12270 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 12271 N->getOperand(1), N->getOperand(2), 12272 DAG.getConstant(Cnt, dl, MVT::i32)); 12273 } 12274 12275 case Intrinsic::arm_neon_vqrshifts: 12276 case Intrinsic::arm_neon_vqrshiftu: 12277 // No immediate versions of these to check for. 12278 break; 12279 } 12280 12281 return SDValue(); 12282 } 12283 12284 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 12285 /// lowers them. 
As with the vector shift intrinsics, this is done during DAG 12286 /// combining instead of DAG legalizing because the build_vectors for 64-bit 12287 /// vector element shift counts are generally not legal, and it is hard to see 12288 /// their values after they get legalized to loads from a constant pool. 12289 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, 12290 const ARMSubtarget *ST) { 12291 EVT VT = N->getValueType(0); 12292 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 12293 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 12294 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 12295 SDValue N1 = N->getOperand(1); 12296 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 12297 SDValue N0 = N->getOperand(0); 12298 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 12299 DAG.MaskedValueIsZero(N0.getOperand(0), 12300 APInt::getHighBitsSet(32, 16))) 12301 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 12302 } 12303 } 12304 12305 // Nothing to be done for scalar shifts. 12306 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12307 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 12308 return SDValue(); 12309 12310 assert(ST->hasNEON() && "unexpected vector shift"); 12311 int64_t Cnt; 12312 12313 switch (N->getOpcode()) { 12314 default: llvm_unreachable("unexpected shift opcode"); 12315 12316 case ISD::SHL: 12317 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 12318 SDLoc dl(N); 12319 return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), 12320 DAG.getConstant(Cnt, dl, MVT::i32)); 12321 } 12322 break; 12323 12324 case ISD::SRA: 12325 case ISD::SRL: 12326 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 12327 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? 12328 ARMISD::VSHRs : ARMISD::VSHRu); 12329 SDLoc dl(N); 12330 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 12331 DAG.getConstant(Cnt, dl, MVT::i32)); 12332 } 12333 } 12334 return SDValue(); 12335 } 12336 12337 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 12338 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 12339 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 12340 const ARMSubtarget *ST) { 12341 SDValue N0 = N->getOperand(0); 12342 12343 // Check for sign- and zero-extensions of vector extract operations of 8- 12344 // and 16-bit vector elements. NEON supports these directly. They are 12345 // handled during DAG combining because type legalization will promote them 12346 // to 32-bit types and it is messy to recognize the operations after that. 
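// For example, (sext (extract_vector_elt v8i16 V, lane)) can be selected as a
// single sign-extending get-lane (roughly "vmov.s16 rN, dM[lane]") instead of
// an extract followed by a separate sxth.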
12347 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 12348 SDValue Vec = N0.getOperand(0); 12349 SDValue Lane = N0.getOperand(1); 12350 EVT VT = N->getValueType(0); 12351 EVT EltVT = N0.getValueType(); 12352 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12353 12354 if (VT == MVT::i32 && 12355 (EltVT == MVT::i8 || EltVT == MVT::i16) && 12356 TLI.isTypeLegal(Vec.getValueType()) && 12357 isa<ConstantSDNode>(Lane)) { 12358 12359 unsigned Opc = 0; 12360 switch (N->getOpcode()) { 12361 default: llvm_unreachable("unexpected opcode"); 12362 case ISD::SIGN_EXTEND: 12363 Opc = ARMISD::VGETLANEs; 12364 break; 12365 case ISD::ZERO_EXTEND: 12366 case ISD::ANY_EXTEND: 12367 Opc = ARMISD::VGETLANEu; 12368 break; 12369 } 12370 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 12371 } 12372 } 12373 12374 return SDValue(); 12375 } 12376 12377 static const APInt *isPowerOf2Constant(SDValue V) { 12378 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 12379 if (!C) 12380 return nullptr; 12381 const APInt *CV = &C->getAPIntValue(); 12382 return CV->isPowerOf2() ? CV : nullptr; 12383 } 12384 12385 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 12386 // If we have a CMOV, OR and AND combination such as: 12387 // if (x & CN) 12388 // y |= CM; 12389 // 12390 // And: 12391 // * CN is a single bit; 12392 // * All bits covered by CM are known zero in y 12393 // 12394 // Then we can convert this into a sequence of BFI instructions. This will 12395 // always be a win if CM is a single bit, will always be no worse than the 12396 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 12397 // three bits (due to the extra IT instruction). 12398 12399 SDValue Op0 = CMOV->getOperand(0); 12400 SDValue Op1 = CMOV->getOperand(1); 12401 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 12402 auto CC = CCNode->getAPIntValue().getLimitedValue(); 12403 SDValue CmpZ = CMOV->getOperand(4); 12404 12405 // The compare must be against zero. 12406 if (!isNullConstant(CmpZ->getOperand(1))) 12407 return SDValue(); 12408 12409 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 12410 SDValue And = CmpZ->getOperand(0); 12411 if (And->getOpcode() != ISD::AND) 12412 return SDValue(); 12413 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 12414 if (!AndC) 12415 return SDValue(); 12416 SDValue X = And->getOperand(0); 12417 12418 if (CC == ARMCC::EQ) { 12419 // We're performing an "equal to zero" compare. Swap the operands so we 12420 // canonicalize on a "not equal to zero" compare. 12421 std::swap(Op0, Op1); 12422 } else { 12423 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 12424 } 12425 12426 if (Op1->getOpcode() != ISD::OR) 12427 return SDValue(); 12428 12429 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 12430 if (!OrC) 12431 return SDValue(); 12432 SDValue Y = Op1->getOperand(0); 12433 12434 if (Op0 != Y) 12435 return SDValue(); 12436 12437 // Now, is it profitable to continue? 12438 APInt OrCI = OrC->getAPIntValue(); 12439 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 12440 if (OrCI.countPopulation() > Heuristic) 12441 return SDValue(); 12442 12443 // Lastly, can we determine that the bits defined by OrCI 12444 // are zero in Y? 12445 KnownBits Known; 12446 DAG.computeKnownBits(Y, Known); 12447 if ((OrCI & Known.Zero) != OrCI) 12448 return SDValue(); 12449 12450 // OK, we can do the combine. 
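// The tested bit of X is shifted down to bit 0, then copied into each set bit
// position of the OR constant with a BFI; note that BFI's mask operand is the
// inverse of the bits being written.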
12451 SDValue V = Y; 12452 SDLoc dl(X); 12453 EVT VT = X.getValueType(); 12454 unsigned BitInX = AndC->logBase2(); 12455 12456 if (BitInX != 0) { 12457 // We must shift X first. 12458 X = DAG.getNode(ISD::SRL, dl, VT, X, 12459 DAG.getConstant(BitInX, dl, VT)); 12460 } 12461 12462 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 12463 BitInY < NumActiveBits; ++BitInY) { 12464 if (OrCI[BitInY] == 0) 12465 continue; 12466 APInt Mask(VT.getSizeInBits(), 0); 12467 Mask.setBit(BitInY); 12468 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 12469 // Confusingly, the operand is an *inverted* mask. 12470 DAG.getConstant(~Mask, dl, VT)); 12471 } 12472 12473 return V; 12474 } 12475 12476 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 12477 SDValue 12478 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 12479 SDValue Cmp = N->getOperand(4); 12480 if (Cmp.getOpcode() != ARMISD::CMPZ) 12481 // Only looking at NE cases. 12482 return SDValue(); 12483 12484 EVT VT = N->getValueType(0); 12485 SDLoc dl(N); 12486 SDValue LHS = Cmp.getOperand(0); 12487 SDValue RHS = Cmp.getOperand(1); 12488 SDValue Chain = N->getOperand(0); 12489 SDValue BB = N->getOperand(1); 12490 SDValue ARMcc = N->getOperand(2); 12491 ARMCC::CondCodes CC = 12492 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 12493 12494 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 12495 // -> (brcond Chain BB CC CPSR Cmp) 12496 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 12497 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 12498 LHS->getOperand(0)->hasOneUse()) { 12499 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 12500 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 12501 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 12502 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 12503 if ((LHS00C && LHS00C->getZExtValue() == 0) && 12504 (LHS01C && LHS01C->getZExtValue() == 1) && 12505 (LHS1C && LHS1C->getZExtValue() == 1) && 12506 (RHSC && RHSC->getZExtValue() == 0)) { 12507 return DAG.getNode( 12508 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 12509 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 12510 } 12511 } 12512 12513 return SDValue(); 12514 } 12515 12516 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 12517 SDValue 12518 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 12519 SDValue Cmp = N->getOperand(4); 12520 if (Cmp.getOpcode() != ARMISD::CMPZ) 12521 // Only looking at EQ and NE cases. 12522 return SDValue(); 12523 12524 EVT VT = N->getValueType(0); 12525 SDLoc dl(N); 12526 SDValue LHS = Cmp.getOperand(0); 12527 SDValue RHS = Cmp.getOperand(1); 12528 SDValue FalseVal = N->getOperand(0); 12529 SDValue TrueVal = N->getOperand(1); 12530 SDValue ARMcc = N->getOperand(2); 12531 ARMCC::CondCodes CC = 12532 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 12533 12534 // BFI is only available on V6T2+. 
12535 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 12536 SDValue R = PerformCMOVToBFICombine(N, DAG); 12537 if (R) 12538 return R; 12539 } 12540 12541 // Simplify 12542 // mov r1, r0 12543 // cmp r1, x 12544 // mov r0, y 12545 // moveq r0, x 12546 // to 12547 // cmp r0, x 12548 // movne r0, y 12549 // 12550 // mov r1, r0 12551 // cmp r1, x 12552 // mov r0, x 12553 // movne r0, y 12554 // to 12555 // cmp r0, x 12556 // movne r0, y 12557 /// FIXME: Turn this into a target neutral optimization? 12558 SDValue Res; 12559 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 12560 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 12561 N->getOperand(3), Cmp); 12562 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 12563 SDValue ARMcc; 12564 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 12565 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 12566 N->getOperand(3), NewCmp); 12567 } 12568 12569 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 12570 // -> (cmov F T CC CPSR Cmp) 12571 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 12572 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 12573 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 12574 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 12575 if ((LHS0C && LHS0C->getZExtValue() == 0) && 12576 (LHS1C && LHS1C->getZExtValue() == 1) && 12577 (RHSC && RHSC->getZExtValue() == 0)) { 12578 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 12579 LHS->getOperand(2), LHS->getOperand(3), 12580 LHS->getOperand(4)); 12581 } 12582 } 12583 12584 if (!VT.isInteger()) 12585 return SDValue(); 12586 12587 // Materialize a boolean comparison for integers so we can avoid branching. 12588 if (isNullConstant(FalseVal)) { 12589 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 12590 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 12591 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 12592 // right 5 bits will make that 32 be 1, otherwise it will be 0. 12593 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 12594 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12595 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 12596 DAG.getConstant(5, dl, MVT::i32)); 12597 } else { 12598 // CMOV 0, 1, ==, (CMPZ x, y) -> 12599 // (ADDCARRY (SUB x, y), t:0, t:1) 12600 // where t = (SUBCARRY 0, (SUB x, y), 0) 12601 // 12602 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 12603 // x != y. In other words, a carry C == 1 when x == y, C == 0 12604 // otherwise. 12605 // The final ADDCARRY computes 12606 // x - y + (0 - (x - y)) + C == C 12607 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12608 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 12609 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 12610 // ISD::SUBCARRY returns a borrow but we want the carry here 12611 // actually. 12612 SDValue Carry = 12613 DAG.getNode(ISD::SUB, dl, MVT::i32, 12614 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); 12615 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); 12616 } 12617 } else if (CC == ARMCC::NE && LHS != RHS && 12618 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { 12619 // This seems pointless but will allow us to combine it further below. 
12620 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) 12621 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12622 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, 12623 N->getOperand(3), Cmp); 12624 } 12625 } else if (isNullConstant(TrueVal)) { 12626 if (CC == ARMCC::EQ && LHS != RHS && 12627 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { 12628 // This seems pointless but will allow us to combine it further below 12629 // Note that we change == for != as this is the dual for the case above. 12630 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) 12631 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12632 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, 12633 DAG.getConstant(ARMCC::NE, dl, MVT::i32), 12634 N->getOperand(3), Cmp); 12635 } 12636 } 12637 12638 // On Thumb1, the DAG above may be further combined if z is a power of 2 12639 // (z == 2 ^ K). 12640 // CMOV (SUB x, y), z, !=, (CMPZ x, y) -> 12641 // merge t3, t4 12642 // where t1 = (SUBCARRY (SUB x, y), z, 0) 12643 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) 12644 // t3 = if K != 0 then (SHL t2:0, K) else t2:0 12645 // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] 12646 const APInt *TrueConst; 12647 if (Subtarget->isThumb1Only() && CC == ARMCC::NE && 12648 (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) && 12649 (FalseVal.getOperand(1) == RHS) && 12650 (TrueConst = isPowerOf2Constant(TrueVal))) { 12651 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 12652 unsigned ShiftAmount = TrueConst->logBase2(); 12653 if (ShiftAmount) 12654 TrueVal = DAG.getConstant(1, dl, VT); 12655 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); 12656 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); 12657 // Make it a carry, not a borrow. 12658 SDValue Carry = DAG.getNode( 12659 ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1)); 12660 Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry); 12661 12662 if (ShiftAmount) 12663 Res = DAG.getNode(ISD::SHL, dl, VT, Res, 12664 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 12665 } 12666 12667 if (Res.getNode()) { 12668 KnownBits Known; 12669 DAG.computeKnownBits(SDValue(N,0), Known); 12670 // Capture demanded bits information that would be otherwise lost. 
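// If the combined value is known to fit in 1, 8 or 16 bits, wrap it in an
// AssertZext so that fact survives into later combines and selection.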
12671 if (Known.Zero == 0xfffffffe) 12672 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12673 DAG.getValueType(MVT::i1)); 12674 else if (Known.Zero == 0xffffff00) 12675 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12676 DAG.getValueType(MVT::i8)); 12677 else if (Known.Zero == 0xffff0000) 12678 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12679 DAG.getValueType(MVT::i16)); 12680 } 12681 12682 return Res; 12683 } 12684 12685 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 12686 DAGCombinerInfo &DCI) const { 12687 switch (N->getOpcode()) { 12688 default: break; 12689 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 12690 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 12691 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 12692 case ISD::SUB: return PerformSUBCombine(N, DCI); 12693 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 12694 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 12695 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 12696 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 12697 case ARMISD::ADDC: 12698 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 12699 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 12700 case ARMISD::BFI: return PerformBFICombine(N, DCI); 12701 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 12702 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 12703 case ISD::STORE: return PerformSTORECombine(N, DCI); 12704 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 12705 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 12706 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 12707 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 12708 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); 12709 case ISD::FP_TO_SINT: 12710 case ISD::FP_TO_UINT: 12711 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 12712 case ISD::FDIV: 12713 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 12714 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 12715 case ISD::SHL: 12716 case ISD::SRA: 12717 case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); 12718 case ISD::SIGN_EXTEND: 12719 case ISD::ZERO_EXTEND: 12720 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 12721 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 12722 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 12723 case ISD::LOAD: return PerformLOADCombine(N, DCI); 12724 case ARMISD::VLD1DUP: 12725 case ARMISD::VLD2DUP: 12726 case ARMISD::VLD3DUP: 12727 case ARMISD::VLD4DUP: 12728 return PerformVLDCombine(N, DCI); 12729 case ARMISD::BUILD_VECTOR: 12730 return PerformARMBUILD_VECTORCombine(N, DCI); 12731 case ARMISD::SMULWB: { 12732 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 12733 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 12734 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 12735 return SDValue(); 12736 break; 12737 } 12738 case ARMISD::SMULWT: { 12739 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 12740 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 12741 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 12742 return SDValue(); 12743 break; 12744 } 12745 case ARMISD::SMLALBB: { 12746 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 12747 APInt DemandedMask = 
APInt::getLowBitsSet(BitWidth, 16);
12748 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
12749 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
12750 return SDValue();
12751 break;
12752 }
12753 case ARMISD::SMLALBT: {
12754 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
12755 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
12756 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
12757 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
12758 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
12759 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
12760 return SDValue();
12761 break;
12762 }
12763 case ARMISD::SMLALTB: {
12764 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
12765 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
12766 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
12767 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
12768 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
12769 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
12770 return SDValue();
12771 break;
12772 }
12773 case ARMISD::SMLALTT: {
12774 unsigned BitWidth = N->getValueType(0).getSizeInBits();
12775 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
12776 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
12777 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
12778 return SDValue();
12779 break;
12780 }
12781 case ISD::INTRINSIC_VOID:
12782 case ISD::INTRINSIC_W_CHAIN:
12783 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
12784 case Intrinsic::arm_neon_vld1:
12785 case Intrinsic::arm_neon_vld2:
12786 case Intrinsic::arm_neon_vld3:
12787 case Intrinsic::arm_neon_vld4:
12788 case Intrinsic::arm_neon_vld2lane:
12789 case Intrinsic::arm_neon_vld3lane:
12790 case Intrinsic::arm_neon_vld4lane:
12791 case Intrinsic::arm_neon_vst1:
12792 case Intrinsic::arm_neon_vst2:
12793 case Intrinsic::arm_neon_vst3:
12794 case Intrinsic::arm_neon_vst4:
12795 case Intrinsic::arm_neon_vst2lane:
12796 case Intrinsic::arm_neon_vst3lane:
12797 case Intrinsic::arm_neon_vst4lane:
12798 return PerformVLDCombine(N, DCI);
12799 default: break;
12800 }
12801 break;
12802 }
12803 return SDValue();
12804 }
12805
12806 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
12807 EVT VT) const {
12808 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
12809 }
12810
12811 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
12812 unsigned,
12813 unsigned,
12814 bool *Fast) const {
12815 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
12816 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
12817
12818 switch (VT.getSimpleVT().SimpleTy) {
12819 default:
12820 return false;
12821 case MVT::i8:
12822 case MVT::i16:
12823 case MVT::i32: {
12824 // Unaligned access can use (for example) LDRB, LDRH, LDR.
12825 if (AllowsUnaligned) {
12826 if (Fast)
12827 *Fast = Subtarget->hasV7Ops();
12828 return true;
12829 }
12830 return false;
12831 }
12832 case MVT::f64:
12833 case MVT::v2f64: {
12834 // For any little-endian targets with NEON, we can support unaligned ld/st
12835 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
12836 // A big-endian target may also explicitly support unaligned accesses 12837 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 12838 if (Fast) 12839 *Fast = true; 12840 return true; 12841 } 12842 return false; 12843 } 12844 } 12845 } 12846 12847 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, 12848 unsigned AlignCheck) { 12849 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && 12850 (DstAlign == 0 || DstAlign % AlignCheck == 0)); 12851 } 12852 12853 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, 12854 unsigned DstAlign, unsigned SrcAlign, 12855 bool IsMemset, bool ZeroMemset, 12856 bool MemcpyStrSrc, 12857 MachineFunction &MF) const { 12858 const Function &F = MF.getFunction(); 12859 12860 // See if we can use NEON instructions for this... 12861 if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && 12862 !F.hasFnAttribute(Attribute::NoImplicitFloat)) { 12863 bool Fast; 12864 if (Size >= 16 && 12865 (memOpAlign(SrcAlign, DstAlign, 16) || 12866 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) { 12867 return MVT::v2f64; 12868 } else if (Size >= 8 && 12869 (memOpAlign(SrcAlign, DstAlign, 8) || 12870 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) && 12871 Fast))) { 12872 return MVT::f64; 12873 } 12874 } 12875 12876 // Let the target-independent logic figure it out. 12877 return MVT::Other; 12878 } 12879 12880 // 64-bit integers are split into their high and low parts and held in two 12881 // different registers, so the trunc is free since the low register can just 12882 // be used. 12883 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 12884 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 12885 return false; 12886 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 12887 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 12888 return (SrcBits == 64 && DestBits == 32); 12889 } 12890 12891 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 12892 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 12893 !DstVT.isInteger()) 12894 return false; 12895 unsigned SrcBits = SrcVT.getSizeInBits(); 12896 unsigned DestBits = DstVT.getSizeInBits(); 12897 return (SrcBits == 64 && DestBits == 32); 12898 } 12899 12900 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 12901 if (Val.getOpcode() != ISD::LOAD) 12902 return false; 12903 12904 EVT VT1 = Val.getValueType(); 12905 if (!VT1.isSimple() || !VT1.isInteger() || 12906 !VT2.isSimple() || !VT2.isInteger()) 12907 return false; 12908 12909 switch (VT1.getSimpleVT().SimpleTy) { 12910 default: break; 12911 case MVT::i1: 12912 case MVT::i8: 12913 case MVT::i16: 12914 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. 12915 return true; 12916 } 12917 12918 return false; 12919 } 12920 12921 bool ARMTargetLowering::isFNegFree(EVT VT) const { 12922 if (!VT.isSimple()) 12923 return false; 12924 12925 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 12926 // negate values directly (fneg is free). So, we don't want to let the DAG 12927 // combiner rewrite fneg into xors and some other instructions. For f16 and 12928 // FullFP16 argument passing, some bitcast nodes may be introduced, 12929 // triggering this DAG combine rewrite, so we are avoiding that with this. 
12930 switch (VT.getSimpleVT().SimpleTy) { 12931 default: break; 12932 case MVT::f16: 12933 return Subtarget->hasFullFP16(); 12934 } 12935 12936 return false; 12937 } 12938 12939 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 12940 EVT VT = ExtVal.getValueType(); 12941 12942 if (!isTypeLegal(VT)) 12943 return false; 12944 12945 // Don't create a loadext if we can fold the extension into a wide/long 12946 // instruction. 12947 // If there's more than one user instruction, the loadext is desirable no 12948 // matter what. There can be two uses by the same instruction. 12949 if (ExtVal->use_empty() || 12950 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 12951 return true; 12952 12953 SDNode *U = *ExtVal->use_begin(); 12954 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 12955 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 12956 return false; 12957 12958 return true; 12959 } 12960 12961 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 12962 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 12963 return false; 12964 12965 if (!isTypeLegal(EVT::getEVT(Ty1))) 12966 return false; 12967 12968 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 12969 12970 // Assuming the caller doesn't have a zeroext or signext return parameter, 12971 // truncation all the way down to i1 is valid. 12972 return true; 12973 } 12974 12975 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 12976 const AddrMode &AM, Type *Ty, 12977 unsigned AS) const { 12978 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 12979 if (Subtarget->hasFPAO()) 12980 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 12981 return 0; 12982 } 12983 return -1; 12984 } 12985 12986 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 12987 if (V < 0) 12988 return false; 12989 12990 unsigned Scale = 1; 12991 switch (VT.getSimpleVT().SimpleTy) { 12992 default: return false; 12993 case MVT::i1: 12994 case MVT::i8: 12995 // Scale == 1; 12996 break; 12997 case MVT::i16: 12998 // Scale == 2; 12999 Scale = 2; 13000 break; 13001 case MVT::i32: 13002 // Scale == 4; 13003 Scale = 4; 13004 break; 13005 } 13006 13007 if ((V & (Scale - 1)) != 0) 13008 return false; 13009 V /= Scale; 13010 return V == (V & ((1LL << 5) - 1)); 13011 } 13012 13013 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 13014 const ARMSubtarget *Subtarget) { 13015 bool isNeg = false; 13016 if (V < 0) { 13017 isNeg = true; 13018 V = - V; 13019 } 13020 13021 switch (VT.getSimpleVT().SimpleTy) { 13022 default: return false; 13023 case MVT::i1: 13024 case MVT::i8: 13025 case MVT::i16: 13026 case MVT::i32: 13027 // + imm12 or - imm8 13028 if (isNeg) 13029 return V == (V & ((1LL << 8) - 1)); 13030 return V == (V & ((1LL << 12) - 1)); 13031 case MVT::f32: 13032 case MVT::f64: 13033 // Same as ARM mode. FIXME: NEON? 13034 if (!Subtarget->hasVFP2()) 13035 return false; 13036 if ((V & 3) != 0) 13037 return false; 13038 V >>= 2; 13039 return V == (V & ((1LL << 8) - 1)); 13040 } 13041 } 13042 13043 /// isLegalAddressImmediate - Return true if the integer value can be used 13044 /// as the offset of the target addressing mode for load / store of the 13045 /// given type. 
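/// For example, in ARM mode i8/i32 accesses accept a +/-imm12 offset, i16
/// accepts +/-imm8, and VFP f32/f64 accesses accept a word-aligned offset of
/// up to +/-imm8*4; Thumb1 and Thumb2 are handled by the helpers above.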
13046 static bool isLegalAddressImmediate(int64_t V, EVT VT, 13047 const ARMSubtarget *Subtarget) { 13048 if (V == 0) 13049 return true; 13050 13051 if (!VT.isSimple()) 13052 return false; 13053 13054 if (Subtarget->isThumb1Only()) 13055 return isLegalT1AddressImmediate(V, VT); 13056 else if (Subtarget->isThumb2()) 13057 return isLegalT2AddressImmediate(V, VT, Subtarget); 13058 13059 // ARM mode. 13060 if (V < 0) 13061 V = - V; 13062 switch (VT.getSimpleVT().SimpleTy) { 13063 default: return false; 13064 case MVT::i1: 13065 case MVT::i8: 13066 case MVT::i32: 13067 // +- imm12 13068 return V == (V & ((1LL << 12) - 1)); 13069 case MVT::i16: 13070 // +- imm8 13071 return V == (V & ((1LL << 8) - 1)); 13072 case MVT::f32: 13073 case MVT::f64: 13074 if (!Subtarget->hasVFP2()) // FIXME: NEON? 13075 return false; 13076 if ((V & 3) != 0) 13077 return false; 13078 V >>= 2; 13079 return V == (V & ((1LL << 8) - 1)); 13080 } 13081 } 13082 13083 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 13084 EVT VT) const { 13085 int Scale = AM.Scale; 13086 if (Scale < 0) 13087 return false; 13088 13089 switch (VT.getSimpleVT().SimpleTy) { 13090 default: return false; 13091 case MVT::i1: 13092 case MVT::i8: 13093 case MVT::i16: 13094 case MVT::i32: 13095 if (Scale == 1) 13096 return true; 13097 // r + r << imm 13098 Scale = Scale & ~1; 13099 return Scale == 2 || Scale == 4 || Scale == 8; 13100 case MVT::i64: 13101 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 13102 // version in Thumb mode. 13103 // r + r 13104 if (Scale == 1) 13105 return true; 13106 // r * 2 (this can be lowered to r + r). 13107 if (!AM.HasBaseReg && Scale == 2) 13108 return true; 13109 return false; 13110 case MVT::isVoid: 13111 // Note, we allow "void" uses (basically, uses that aren't loads or 13112 // stores), because arm allows folding a scale into many arithmetic 13113 // operations. This should be made more precise and revisited later. 13114 13115 // Allow r << imm, but the imm has to be a multiple of two. 13116 if (Scale & 1) return false; 13117 return isPowerOf2_32(Scale); 13118 } 13119 } 13120 13121 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 13122 EVT VT) const { 13123 const int Scale = AM.Scale; 13124 13125 // Negative scales are not supported in Thumb1. 13126 if (Scale < 0) 13127 return false; 13128 13129 // Thumb1 addressing modes do not support register scaling excepting the 13130 // following cases: 13131 // 1. Scale == 1 means no scaling. 13132 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 13133 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 13134 } 13135 13136 /// isLegalAddressingMode - Return true if the addressing mode represented 13137 /// by AM is legal for this target, for a load/store of the specified type. 13138 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 13139 const AddrMode &AM, Type *Ty, 13140 unsigned AS, Instruction *I) const { 13141 EVT VT = getValueType(DL, Ty, true); 13142 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 13143 return false; 13144 13145 // Can never fold addr of global into load/store. 13146 if (AM.BaseGV) 13147 return false; 13148 13149 switch (AM.Scale) { 13150 case 0: // no scale reg, must be "r+i" or "r", or "i". 13151 break; 13152 default: 13153 // ARM doesn't support any R+R*scale+imm addr modes. 
13154 if (AM.BaseOffs) 13155 return false; 13156 13157 if (!VT.isSimple()) 13158 return false; 13159 13160 if (Subtarget->isThumb1Only()) 13161 return isLegalT1ScaledAddressingMode(AM, VT); 13162 13163 if (Subtarget->isThumb2()) 13164 return isLegalT2ScaledAddressingMode(AM, VT); 13165 13166 int Scale = AM.Scale; 13167 switch (VT.getSimpleVT().SimpleTy) { 13168 default: return false; 13169 case MVT::i1: 13170 case MVT::i8: 13171 case MVT::i32: 13172 if (Scale < 0) Scale = -Scale; 13173 if (Scale == 1) 13174 return true; 13175 // r + r << imm 13176 return isPowerOf2_32(Scale & ~1); 13177 case MVT::i16: 13178 case MVT::i64: 13179 // r +/- r 13180 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 13181 return true; 13182 // r * 2 (this can be lowered to r + r). 13183 if (!AM.HasBaseReg && Scale == 2) 13184 return true; 13185 return false; 13186 13187 case MVT::isVoid: 13188 // Note, we allow "void" uses (basically, uses that aren't loads or 13189 // stores), because arm allows folding a scale into many arithmetic 13190 // operations. This should be made more precise and revisited later. 13191 13192 // Allow r << imm, but the imm has to be a multiple of two. 13193 if (Scale & 1) return false; 13194 return isPowerOf2_32(Scale); 13195 } 13196 } 13197 return true; 13198 } 13199 13200 /// isLegalICmpImmediate - Return true if the specified immediate is legal 13201 /// icmp immediate, that is the target has icmp instructions which can compare 13202 /// a register against the immediate without having to materialize the 13203 /// immediate into a register. 13204 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 13205 // Thumb2 and ARM modes can use cmn for negative immediates. 13206 if (!Subtarget->isThumb()) 13207 return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; 13208 if (Subtarget->isThumb2()) 13209 return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; 13210 // Thumb1 doesn't have cmn, and only 8-bit immediates. 13211 return Imm >= 0 && Imm <= 255; 13212 } 13213 13214 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 13215 /// *or sub* immediate, that is the target has add or sub instructions which can 13216 /// add a register with the immediate without having to materialize the 13217 /// immediate into a register. 13218 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 13219 // Same encoding for add/sub, just flip the sign. 13220 int64_t AbsImm = std::abs(Imm); 13221 if (!Subtarget->isThumb()) 13222 return ARM_AM::getSOImmVal(AbsImm) != -1; 13223 if (Subtarget->isThumb2()) 13224 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 13225 // Thumb1 only has 8-bit unsigned immediate. 
13226 return AbsImm >= 0 && AbsImm <= 255; 13227 } 13228 13229 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 13230 bool isSEXTLoad, SDValue &Base, 13231 SDValue &Offset, bool &isInc, 13232 SelectionDAG &DAG) { 13233 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 13234 return false; 13235 13236 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 13237 // AddressingMode 3 13238 Base = Ptr->getOperand(0); 13239 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13240 int RHSC = (int)RHS->getZExtValue(); 13241 if (RHSC < 0 && RHSC > -256) { 13242 assert(Ptr->getOpcode() == ISD::ADD); 13243 isInc = false; 13244 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13245 return true; 13246 } 13247 } 13248 isInc = (Ptr->getOpcode() == ISD::ADD); 13249 Offset = Ptr->getOperand(1); 13250 return true; 13251 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 13252 // AddressingMode 2 13253 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13254 int RHSC = (int)RHS->getZExtValue(); 13255 if (RHSC < 0 && RHSC > -0x1000) { 13256 assert(Ptr->getOpcode() == ISD::ADD); 13257 isInc = false; 13258 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13259 Base = Ptr->getOperand(0); 13260 return true; 13261 } 13262 } 13263 13264 if (Ptr->getOpcode() == ISD::ADD) { 13265 isInc = true; 13266 ARM_AM::ShiftOpc ShOpcVal= 13267 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 13268 if (ShOpcVal != ARM_AM::no_shift) { 13269 Base = Ptr->getOperand(1); 13270 Offset = Ptr->getOperand(0); 13271 } else { 13272 Base = Ptr->getOperand(0); 13273 Offset = Ptr->getOperand(1); 13274 } 13275 return true; 13276 } 13277 13278 isInc = (Ptr->getOpcode() == ISD::ADD); 13279 Base = Ptr->getOperand(0); 13280 Offset = Ptr->getOperand(1); 13281 return true; 13282 } 13283 13284 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 13285 return false; 13286 } 13287 13288 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 13289 bool isSEXTLoad, SDValue &Base, 13290 SDValue &Offset, bool &isInc, 13291 SelectionDAG &DAG) { 13292 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 13293 return false; 13294 13295 Base = Ptr->getOperand(0); 13296 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13297 int RHSC = (int)RHS->getZExtValue(); 13298 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 13299 assert(Ptr->getOpcode() == ISD::ADD); 13300 isInc = false; 13301 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13302 return true; 13303 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 13304 isInc = Ptr->getOpcode() == ISD::ADD; 13305 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13306 return true; 13307 } 13308 } 13309 13310 return false; 13311 } 13312 13313 /// getPreIndexedAddressParts - returns true by value, base pointer and 13314 /// offset pointer and addressing mode by reference if the node's address 13315 /// can be legally represented as pre-indexed load / store address. 
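/// For example, a load whose address is (add r1, #4) may become a pre-indexed
/// load such as "ldr r0, [r1, #4]!", which also writes the updated address
/// back into r1.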
13316 bool 13317 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 13318 SDValue &Offset, 13319 ISD::MemIndexedMode &AM, 13320 SelectionDAG &DAG) const { 13321 if (Subtarget->isThumb1Only()) 13322 return false; 13323 13324 EVT VT; 13325 SDValue Ptr; 13326 bool isSEXTLoad = false; 13327 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 13328 Ptr = LD->getBasePtr(); 13329 VT = LD->getMemoryVT(); 13330 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 13331 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 13332 Ptr = ST->getBasePtr(); 13333 VT = ST->getMemoryVT(); 13334 } else 13335 return false; 13336 13337 bool isInc; 13338 bool isLegal = false; 13339 if (Subtarget->isThumb2()) 13340 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13341 Offset, isInc, DAG); 13342 else 13343 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13344 Offset, isInc, DAG); 13345 if (!isLegal) 13346 return false; 13347 13348 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 13349 return true; 13350 } 13351 13352 /// getPostIndexedAddressParts - returns true by value, base pointer and 13353 /// offset pointer and addressing mode by reference if this node can be 13354 /// combined with a load / store to form a post-indexed load / store. 13355 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 13356 SDValue &Base, 13357 SDValue &Offset, 13358 ISD::MemIndexedMode &AM, 13359 SelectionDAG &DAG) const { 13360 EVT VT; 13361 SDValue Ptr; 13362 bool isSEXTLoad = false, isNonExt; 13363 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 13364 VT = LD->getMemoryVT(); 13365 Ptr = LD->getBasePtr(); 13366 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 13367 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 13368 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 13369 VT = ST->getMemoryVT(); 13370 Ptr = ST->getBasePtr(); 13371 isNonExt = !ST->isTruncatingStore(); 13372 } else 13373 return false; 13374 13375 if (Subtarget->isThumb1Only()) { 13376 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 13377 // must be non-extending/truncating, i32, with an offset of 4. 13378 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 13379 if (Op->getOpcode() != ISD::ADD || !isNonExt) 13380 return false; 13381 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 13382 if (!RHS || RHS->getZExtValue() != 4) 13383 return false; 13384 13385 Offset = Op->getOperand(1); 13386 Base = Op->getOperand(0); 13387 AM = ISD::POST_INC; 13388 return true; 13389 } 13390 13391 bool isInc; 13392 bool isLegal = false; 13393 if (Subtarget->isThumb2()) 13394 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 13395 isInc, DAG); 13396 else 13397 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 13398 isInc, DAG); 13399 if (!isLegal) 13400 return false; 13401 13402 if (Ptr != Base) { 13403 // Swap base ptr and offset to catch more post-index load / store when 13404 // it's legal. In Thumb2 mode, offset must be an immediate. 13405 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 13406 !Subtarget->isThumb2()) 13407 std::swap(Base, Offset); 13408 13409 // Post-indexed load / store update the base pointer. 13410 if (Ptr != Base) 13411 return false; 13412 } 13413 13414 AM = isInc ? 
  ISD::POST_INC : ISD::POST_DEC;
  return true;
}

void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
    if (Known.isUnknown())
      return;

    KnownBits KnownRHS;
    DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  }
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
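  // Only the single-statement form "rev $0, $1" with an "=l,l" constraint
  // string on an i32 value is recognised below; that asm is equivalent to a
  // byte swap, so it is replaced with the bswap intrinsic and left to normal
  // instruction selection.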
13491 if (!Subtarget->hasV6Ops()) 13492 return false; 13493 13494 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 13495 std::string AsmStr = IA->getAsmString(); 13496 SmallVector<StringRef, 4> AsmPieces; 13497 SplitString(AsmStr, AsmPieces, ";\n"); 13498 13499 switch (AsmPieces.size()) { 13500 default: return false; 13501 case 1: 13502 AsmStr = AsmPieces[0]; 13503 AsmPieces.clear(); 13504 SplitString(AsmStr, AsmPieces, " \t,"); 13505 13506 // rev $0, $1 13507 if (AsmPieces.size() == 3 && 13508 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 13509 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 13510 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13511 if (Ty && Ty->getBitWidth() == 32) 13512 return IntrinsicLowering::LowerToByteSwap(CI); 13513 } 13514 break; 13515 } 13516 13517 return false; 13518 } 13519 13520 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 13521 // At this point, we have to lower this constraint to something else, so we 13522 // lower it to an "r" or "w". However, by doing this we will force the result 13523 // to be in register, while the X constraint is much more permissive. 13524 // 13525 // Although we are correct (we are free to emit anything, without 13526 // constraints), we might break use cases that would expect us to be more 13527 // efficient and emit something else. 13528 if (!Subtarget->hasVFP2()) 13529 return "r"; 13530 if (ConstraintVT.isFloatingPoint()) 13531 return "w"; 13532 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 13533 (ConstraintVT.getSizeInBits() == 64 || 13534 ConstraintVT.getSizeInBits() == 128)) 13535 return "w"; 13536 13537 return "r"; 13538 } 13539 13540 /// getConstraintType - Given a constraint letter, return the type of 13541 /// constraint it is for this target. 13542 ARMTargetLowering::ConstraintType 13543 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 13544 if (Constraint.size() == 1) { 13545 switch (Constraint[0]) { 13546 default: break; 13547 case 'l': return C_RegisterClass; 13548 case 'w': return C_RegisterClass; 13549 case 'h': return C_RegisterClass; 13550 case 'x': return C_RegisterClass; 13551 case 't': return C_RegisterClass; 13552 case 'j': return C_Other; // Constant for movw. 13553 // An address with a single base register. Due to the way we 13554 // currently handle addresses it is the same as an 'r' memory constraint. 13555 case 'Q': return C_Memory; 13556 } 13557 } else if (Constraint.size() == 2) { 13558 switch (Constraint[0]) { 13559 default: break; 13560 // All 'U+' constraints are addresses. 13561 case 'U': return C_Memory; 13562 } 13563 } 13564 return TargetLowering::getConstraintType(Constraint); 13565 } 13566 13567 /// Examine constraint type and operand type and determine a weight value. 13568 /// This object must already have been set up with the operand type 13569 /// and the current alternative constraint selected. 13570 TargetLowering::ConstraintWeight 13571 ARMTargetLowering::getSingleConstraintMatchWeight( 13572 AsmOperandInfo &info, const char *constraint) const { 13573 ConstraintWeight weight = CW_Invalid; 13574 Value *CallOperandVal = info.CallOperandVal; 13575 // If we don't have a value, we can't do a match, 13576 // but allow it at the lowest weight. 13577 if (!CallOperandVal) 13578 return CW_Default; 13579 Type *type = CallOperandVal->getType(); 13580 // Look at the constraint type. 
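  // ('l' only names the low registers in Thumb mode, which is why it is scored
  // as a specific register class there and as a plain register otherwise.)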
13581 switch (*constraint) { 13582 default: 13583 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 13584 break; 13585 case 'l': 13586 if (type->isIntegerTy()) { 13587 if (Subtarget->isThumb()) 13588 weight = CW_SpecificReg; 13589 else 13590 weight = CW_Register; 13591 } 13592 break; 13593 case 'w': 13594 if (type->isFloatingPointTy()) 13595 weight = CW_Register; 13596 break; 13597 } 13598 return weight; 13599 } 13600 13601 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 13602 13603 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 13604 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 13605 if (Constraint.size() == 1) { 13606 // GCC ARM Constraint Letters 13607 switch (Constraint[0]) { 13608 case 'l': // Low regs or general regs. 13609 if (Subtarget->isThumb()) 13610 return RCPair(0U, &ARM::tGPRRegClass); 13611 return RCPair(0U, &ARM::GPRRegClass); 13612 case 'h': // High regs or no regs. 13613 if (Subtarget->isThumb()) 13614 return RCPair(0U, &ARM::hGPRRegClass); 13615 break; 13616 case 'r': 13617 if (Subtarget->isThumb1Only()) 13618 return RCPair(0U, &ARM::tGPRRegClass); 13619 return RCPair(0U, &ARM::GPRRegClass); 13620 case 'w': 13621 if (VT == MVT::Other) 13622 break; 13623 if (VT == MVT::f32) 13624 return RCPair(0U, &ARM::SPRRegClass); 13625 if (VT.getSizeInBits() == 64) 13626 return RCPair(0U, &ARM::DPRRegClass); 13627 if (VT.getSizeInBits() == 128) 13628 return RCPair(0U, &ARM::QPRRegClass); 13629 break; 13630 case 'x': 13631 if (VT == MVT::Other) 13632 break; 13633 if (VT == MVT::f32) 13634 return RCPair(0U, &ARM::SPR_8RegClass); 13635 if (VT.getSizeInBits() == 64) 13636 return RCPair(0U, &ARM::DPR_8RegClass); 13637 if (VT.getSizeInBits() == 128) 13638 return RCPair(0U, &ARM::QPR_8RegClass); 13639 break; 13640 case 't': 13641 if (VT == MVT::Other) 13642 break; 13643 if (VT == MVT::f32 || VT == MVT::i32) 13644 return RCPair(0U, &ARM::SPRRegClass); 13645 if (VT.getSizeInBits() == 64) 13646 return RCPair(0U, &ARM::DPR_VFP2RegClass); 13647 if (VT.getSizeInBits() == 128) 13648 return RCPair(0U, &ARM::QPR_VFP2RegClass); 13649 break; 13650 } 13651 } 13652 if (StringRef("{cc}").equals_lower(Constraint)) 13653 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 13654 13655 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 13656 } 13657 13658 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 13659 /// vector. If it is invalid, don't add anything to Ops. 13660 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 13661 std::string &Constraint, 13662 std::vector<SDValue>&Ops, 13663 SelectionDAG &DAG) const { 13664 SDValue Result; 13665 13666 // Currently only support length 1 constraints. 13667 if (Constraint.length() != 1) return; 13668 13669 char ConstraintLetter = Constraint[0]; 13670 switch (ConstraintLetter) { 13671 default: break; 13672 case 'j': 13673 case 'I': case 'J': case 'K': case 'L': 13674 case 'M': case 'N': case 'O': 13675 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 13676 if (!C) 13677 return; 13678 13679 int64_t CVal64 = C->getSExtValue(); 13680 int CVal = (int) CVal64; 13681 // None of these constraints allow values larger than 32 bits. Check 13682 // that the value fits in an int. 13683 if (CVal != CVal64) 13684 return; 13685 13686 switch (ConstraintLetter) { 13687 case 'j': 13688 // Constant suitable for movw, must be between 0 and 13689 // 65535. 
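        // MOVW only exists from ARMv6T2 / Thumb2 onwards (e.g. "movw r0,
        // #65535"), hence the hasV6T2Ops() check below; older cores cannot
        // satisfy this constraint.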
13690 if (Subtarget->hasV6T2Ops()) 13691 if (CVal >= 0 && CVal <= 65535) 13692 break; 13693 return; 13694 case 'I': 13695 if (Subtarget->isThumb1Only()) { 13696 // This must be a constant between 0 and 255, for ADD 13697 // immediates. 13698 if (CVal >= 0 && CVal <= 255) 13699 break; 13700 } else if (Subtarget->isThumb2()) { 13701 // A constant that can be used as an immediate value in a 13702 // data-processing instruction. 13703 if (ARM_AM::getT2SOImmVal(CVal) != -1) 13704 break; 13705 } else { 13706 // A constant that can be used as an immediate value in a 13707 // data-processing instruction. 13708 if (ARM_AM::getSOImmVal(CVal) != -1) 13709 break; 13710 } 13711 return; 13712 13713 case 'J': 13714 if (Subtarget->isThumb1Only()) { 13715 // This must be a constant between -255 and -1, for negated ADD 13716 // immediates. This can be used in GCC with an "n" modifier that 13717 // prints the negated value, for use with SUB instructions. It is 13718 // not useful otherwise but is implemented for compatibility. 13719 if (CVal >= -255 && CVal <= -1) 13720 break; 13721 } else { 13722 // This must be a constant between -4095 and 4095. It is not clear 13723 // what this constraint is intended for. Implemented for 13724 // compatibility with GCC. 13725 if (CVal >= -4095 && CVal <= 4095) 13726 break; 13727 } 13728 return; 13729 13730 case 'K': 13731 if (Subtarget->isThumb1Only()) { 13732 // A 32-bit value where only one byte has a nonzero value. Exclude 13733 // zero to match GCC. This constraint is used by GCC internally for 13734 // constants that can be loaded with a move/shift combination. 13735 // It is not useful otherwise but is implemented for compatibility. 13736 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 13737 break; 13738 } else if (Subtarget->isThumb2()) { 13739 // A constant whose bitwise inverse can be used as an immediate 13740 // value in a data-processing instruction. This can be used in GCC 13741 // with a "B" modifier that prints the inverted value, for use with 13742 // BIC and MVN instructions. It is not useful otherwise but is 13743 // implemented for compatibility. 13744 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 13745 break; 13746 } else { 13747 // A constant whose bitwise inverse can be used as an immediate 13748 // value in a data-processing instruction. This can be used in GCC 13749 // with a "B" modifier that prints the inverted value, for use with 13750 // BIC and MVN instructions. It is not useful otherwise but is 13751 // implemented for compatibility. 13752 if (ARM_AM::getSOImmVal(~CVal) != -1) 13753 break; 13754 } 13755 return; 13756 13757 case 'L': 13758 if (Subtarget->isThumb1Only()) { 13759 // This must be a constant between -7 and 7, 13760 // for 3-operand ADD/SUB immediate instructions. 13761 if (CVal >= -7 && CVal < 7) 13762 break; 13763 } else if (Subtarget->isThumb2()) { 13764 // A constant whose negation can be used as an immediate value in a 13765 // data-processing instruction. This can be used in GCC with an "n" 13766 // modifier that prints the negated value, for use with SUB 13767 // instructions. It is not useful otherwise but is implemented for 13768 // compatibility. 13769 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 13770 break; 13771 } else { 13772 // A constant whose negation can be used as an immediate value in a 13773 // data-processing instruction. This can be used in GCC with an "n" 13774 // modifier that prints the negated value, for use with SUB 13775 // instructions. 
It is not useful otherwise but is implemented for 13776 // compatibility. 13777 if (ARM_AM::getSOImmVal(-CVal) != -1) 13778 break; 13779 } 13780 return; 13781 13782 case 'M': 13783 if (Subtarget->isThumb1Only()) { 13784 // This must be a multiple of 4 between 0 and 1020, for 13785 // ADD sp + immediate. 13786 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 13787 break; 13788 } else { 13789 // A power of two or a constant between 0 and 32. This is used in 13790 // GCC for the shift amount on shifted register operands, but it is 13791 // useful in general for any shift amounts. 13792 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 13793 break; 13794 } 13795 return; 13796 13797 case 'N': 13798 if (Subtarget->isThumb()) { // FIXME thumb2 13799 // This must be a constant between 0 and 31, for shift amounts. 13800 if (CVal >= 0 && CVal <= 31) 13801 break; 13802 } 13803 return; 13804 13805 case 'O': 13806 if (Subtarget->isThumb()) { // FIXME thumb2 13807 // This must be a multiple of 4 between -508 and 508, for 13808 // ADD/SUB sp = sp + immediate. 13809 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 13810 break; 13811 } 13812 return; 13813 } 13814 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 13815 break; 13816 } 13817 13818 if (Result.getNode()) { 13819 Ops.push_back(Result); 13820 return; 13821 } 13822 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 13823 } 13824 13825 static RTLIB::Libcall getDivRemLibcall( 13826 const SDNode *N, MVT::SimpleValueType SVT) { 13827 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 13828 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 13829 "Unhandled Opcode in getDivRemLibcall"); 13830 bool isSigned = N->getOpcode() == ISD::SDIVREM || 13831 N->getOpcode() == ISD::SREM; 13832 RTLIB::Libcall LC; 13833 switch (SVT) { 13834 default: llvm_unreachable("Unexpected request for libcall!"); 13835 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 13836 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 13837 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 13838 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 13839 } 13840 return LC; 13841 } 13842 13843 static TargetLowering::ArgListTy getDivRemArgList( 13844 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 13845 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 13846 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 13847 "Unhandled Opcode in getDivRemArgList"); 13848 bool isSigned = N->getOpcode() == ISD::SDIVREM || 13849 N->getOpcode() == ISD::SREM; 13850 TargetLowering::ArgListTy Args; 13851 TargetLowering::ArgListEntry Entry; 13852 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 13853 EVT ArgVT = N->getOperand(i).getValueType(); 13854 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 13855 Entry.Node = N->getOperand(i); 13856 Entry.Ty = ArgTy; 13857 Entry.IsSExt = isSigned; 13858 Entry.IsZExt = !isSigned; 13859 Args.push_back(Entry); 13860 } 13861 if (Subtarget->isTargetWindows() && Args.size() >= 2) 13862 std::swap(Args[0], Args[1]); 13863 return Args; 13864 } 13865 13866 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 13867 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 13868 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 13869 Subtarget->isTargetWindows()) && 13870 "Register-based DivRem lowering only"); 13871 unsigned Opcode = Op->getOpcode(); 13872 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 13873 "Invalid opcode for Div/Rem lowering"); 13874 bool isSigned = (Opcode == ISD::SDIVREM); 13875 EVT VT = Op->getValueType(0); 13876 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 13877 SDLoc dl(Op); 13878 13879 // If the target has hardware divide, use divide + multiply + subtract: 13880 // div = a / b 13881 // rem = a - b * div 13882 // return {div, rem} 13883 // This should be lowered into UDIV/SDIV + MLS later on. 13884 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 13885 : Subtarget->hasDivideInARMMode(); 13886 if (hasDivide && Op->getValueType(0).isSimple() && 13887 Op->getSimpleValueType(0) == MVT::i32) { 13888 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 13889 const SDValue Dividend = Op->getOperand(0); 13890 const SDValue Divisor = Op->getOperand(1); 13891 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 13892 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 13893 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 13894 13895 SDValue Values[2] = {Div, Rem}; 13896 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 13897 } 13898 13899 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 13900 VT.getSimpleVT().SimpleTy); 13901 SDValue InChain = DAG.getEntryNode(); 13902 13903 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 13904 DAG.getContext(), 13905 Subtarget); 13906 13907 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 13908 getPointerTy(DAG.getDataLayout())); 13909 13910 Type *RetTy = StructType::get(Ty, Ty); 13911 13912 if (Subtarget->isTargetWindows()) 13913 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 13914 13915 TargetLowering::CallLoweringInfo CLI(DAG); 13916 CLI.setDebugLoc(dl).setChain(InChain) 13917 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 13918 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 13919 13920 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 13921 return CallInfo.first; 13922 } 13923 13924 // Lowers REM using divmod helpers 13925 // see RTABI section 4.2/4.3 13926 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 13927 // Build return types (div and rem) 13928 std::vector<Type*> RetTyParams; 13929 Type *RetTyElement; 13930 13931 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 13932 default: llvm_unreachable("Unexpected request for libcall!"); 13933 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 13934 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 13935 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 13936 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 13937 } 13938 13939 RetTyParams.push_back(RetTyElement); 13940 RetTyParams.push_back(RetTyElement); 13941 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 13942 Type *RetTy = StructType::get(*DAG.getContext(), ret); 13943 13944 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
13945 SimpleTy); 13946 SDValue InChain = DAG.getEntryNode(); 13947 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 13948 Subtarget); 13949 bool isSigned = N->getOpcode() == ISD::SREM; 13950 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 13951 getPointerTy(DAG.getDataLayout())); 13952 13953 if (Subtarget->isTargetWindows()) 13954 InChain = WinDBZCheckDenominator(DAG, N, InChain); 13955 13956 // Lower call 13957 CallLoweringInfo CLI(DAG); 13958 CLI.setChain(InChain) 13959 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 13960 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 13961 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 13962 13963 // Return second (rem) result operand (first contains div) 13964 SDNode *ResNode = CallResult.first.getNode(); 13965 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 13966 return ResNode->getOperand(1); 13967 } 13968 13969 SDValue 13970 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 13971 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 13972 SDLoc DL(Op); 13973 13974 // Get the inputs. 13975 SDValue Chain = Op.getOperand(0); 13976 SDValue Size = Op.getOperand(1); 13977 13978 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 13979 "no-stack-arg-probe")) { 13980 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 13981 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 13982 Chain = SP.getValue(1); 13983 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 13984 if (Align) 13985 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 13986 DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); 13987 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 13988 SDValue Ops[2] = { SP, Chain }; 13989 return DAG.getMergeValues(Ops, DL); 13990 } 13991 13992 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 13993 DAG.getConstant(2, DL, MVT::i32)); 13994 13995 SDValue Flag; 13996 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 13997 Flag = Chain.getValue(1); 13998 13999 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 14000 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 14001 14002 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 14003 Chain = NewSP.getValue(1); 14004 14005 SDValue Ops[2] = { NewSP, Chain }; 14006 return DAG.getMergeValues(Ops, DL); 14007 } 14008 14009 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 14010 assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && 14011 "Unexpected type for custom-lowering FP_EXTEND"); 14012 14013 RTLIB::Libcall LC; 14014 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 14015 14016 SDValue SrcVal = Op.getOperand(0); 14017 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 14018 SDLoc(Op)).first; 14019 } 14020 14021 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 14022 assert(Op.getOperand(0).getValueType() == MVT::f64 && 14023 Subtarget->isFPOnlySP() && 14024 "Unexpected type for custom-lowering FP_ROUND"); 14025 14026 RTLIB::Libcall LC; 14027 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 14028 14029 SDValue SrcVal = Op.getOperand(0); 14030 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 14031 SDLoc(Op)).first; 14032 } 14033 14034 bool 14035 
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The ARM target isn't yet aware of offsets.
  return false;
}

bool ARM::isBitFieldInvertedMask(unsigned v) {
  if (v == 0xffffffff)
    return false;

  // there can be 1's on either or both "outsides", all the "inside"
  // bits must be 0's
  return isShiftedMask_32(~v);
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  if (!Subtarget->hasVFP3())
    return false;
  if (VT == MVT::f16 && Subtarget->hasFullFP16())
    return ARM_AM::getFP16Imm(Imm) != -1;
  if (VT == MVT::f32)
    return ARM_AM::getFP32Imm(Imm) != -1;
  if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
    return ARM_AM::getFP64Imm(Imm) != -1;
  return false;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
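    // (e.g. a vst3 of three <4 x i32> operands is recorded as a v6i64, i.e.
    // 384-bit, access.)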
14101 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14102 unsigned NumElts = 0; 14103 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 14104 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 14105 if (!ArgTy->isVectorTy()) 14106 break; 14107 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 14108 } 14109 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 14110 Info.ptrVal = I.getArgOperand(0); 14111 Info.offset = 0; 14112 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 14113 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 14114 // volatile stores with NEON intrinsics not supported 14115 Info.flags = MachineMemOperand::MOStore; 14116 return true; 14117 } 14118 case Intrinsic::arm_ldaex: 14119 case Intrinsic::arm_ldrex: { 14120 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14121 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 14122 Info.opc = ISD::INTRINSIC_W_CHAIN; 14123 Info.memVT = MVT::getVT(PtrTy->getElementType()); 14124 Info.ptrVal = I.getArgOperand(0); 14125 Info.offset = 0; 14126 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 14127 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 14128 return true; 14129 } 14130 case Intrinsic::arm_stlex: 14131 case Intrinsic::arm_strex: { 14132 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14133 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 14134 Info.opc = ISD::INTRINSIC_W_CHAIN; 14135 Info.memVT = MVT::getVT(PtrTy->getElementType()); 14136 Info.ptrVal = I.getArgOperand(1); 14137 Info.offset = 0; 14138 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 14139 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 14140 return true; 14141 } 14142 case Intrinsic::arm_stlexd: 14143 case Intrinsic::arm_strexd: 14144 Info.opc = ISD::INTRINSIC_W_CHAIN; 14145 Info.memVT = MVT::i64; 14146 Info.ptrVal = I.getArgOperand(2); 14147 Info.offset = 0; 14148 Info.align = 8; 14149 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 14150 return true; 14151 14152 case Intrinsic::arm_ldaexd: 14153 case Intrinsic::arm_ldrexd: 14154 Info.opc = ISD::INTRINSIC_W_CHAIN; 14155 Info.memVT = MVT::i64; 14156 Info.ptrVal = I.getArgOperand(0); 14157 Info.offset = 0; 14158 Info.align = 8; 14159 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 14160 return true; 14161 14162 default: 14163 break; 14164 } 14165 14166 return false; 14167 } 14168 14169 /// \brief Returns true if it is beneficial to convert a load of a constant 14170 /// to just the constant itself. 14171 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 14172 Type *Ty) const { 14173 assert(Ty->isIntegerTy()); 14174 14175 unsigned Bits = Ty->getPrimitiveSizeInBits(); 14176 if (Bits == 0 || Bits > 32) 14177 return false; 14178 return true; 14179 } 14180 14181 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 14182 unsigned Index) const { 14183 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 14184 return false; 14185 14186 return (Index == 0 || Index == ResVT.getVectorNumElements()); 14187 } 14188 14189 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 14190 ARM_MB::MemBOpt Domain) const { 14191 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14192 14193 // First, if the target has no DMB, see what fallback we can use. 
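  // (On ARMv6 in ARM mode the fallback is the CP15 barrier
  // "mcr p15, 0, <Rt>, c7, c10, 5", which is what the operands built below
  // encode; subtargets without even that are expected to use atomic libcalls
  // and never reach this point.)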
14194 if (!Subtarget->hasDataBarrier()) { 14195 // Some ARMv6 cpus can support data barriers with an mcr instruction. 14196 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 14197 // here. 14198 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 14199 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 14200 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 14201 Builder.getInt32(0), Builder.getInt32(7), 14202 Builder.getInt32(10), Builder.getInt32(5)}; 14203 return Builder.CreateCall(MCR, args); 14204 } else { 14205 // Instead of using barriers, atomic accesses on these subtargets use 14206 // libcalls. 14207 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 14208 } 14209 } else { 14210 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 14211 // Only a full system barrier exists in the M-class architectures. 14212 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 14213 Constant *CDomain = Builder.getInt32(Domain); 14214 return Builder.CreateCall(DMB, CDomain); 14215 } 14216 } 14217 14218 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 14219 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 14220 Instruction *Inst, 14221 AtomicOrdering Ord) const { 14222 switch (Ord) { 14223 case AtomicOrdering::NotAtomic: 14224 case AtomicOrdering::Unordered: 14225 llvm_unreachable("Invalid fence: unordered/non-atomic"); 14226 case AtomicOrdering::Monotonic: 14227 case AtomicOrdering::Acquire: 14228 return nullptr; // Nothing to do 14229 case AtomicOrdering::SequentiallyConsistent: 14230 if (!Inst->hasAtomicStore()) 14231 return nullptr; // Nothing to do 14232 LLVM_FALLTHROUGH; 14233 case AtomicOrdering::Release: 14234 case AtomicOrdering::AcquireRelease: 14235 if (Subtarget->preferISHSTBarriers()) 14236 return makeDMB(Builder, ARM_MB::ISHST); 14237 // FIXME: add a comment with a link to documentation justifying this. 14238 else 14239 return makeDMB(Builder, ARM_MB::ISH); 14240 } 14241 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 14242 } 14243 14244 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 14245 Instruction *Inst, 14246 AtomicOrdering Ord) const { 14247 switch (Ord) { 14248 case AtomicOrdering::NotAtomic: 14249 case AtomicOrdering::Unordered: 14250 llvm_unreachable("Invalid fence: unordered/not-atomic"); 14251 case AtomicOrdering::Monotonic: 14252 case AtomicOrdering::Release: 14253 return nullptr; // Nothing to do 14254 case AtomicOrdering::Acquire: 14255 case AtomicOrdering::AcquireRelease: 14256 case AtomicOrdering::SequentiallyConsistent: 14257 return makeDMB(Builder, ARM_MB::ISH); 14258 } 14259 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 14260 } 14261 14262 // Loads and stores less than 64-bits are already atomic; ones above that 14263 // are doomed anyway, so defer to the default libcall and blame the OS when 14264 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 14265 // anything for those. 14266 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 14267 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 14268 return (Size == 64) && !Subtarget->isMClass(); 14269 } 14270 14271 // Loads and stores less than 64-bits are already atomic; ones above that 14272 // are doomed anyway, so defer to the default libcall and blame the OS when 14273 // things go wrong. 
// Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  bool hasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
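  // (The intent is that the extract can then be folded into a single lane
  // store such as VST1, rather than moving the element into a core register
  // and storing it from there.)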
14340 if (BitWidth == 64 || BitWidth == 128) { 14341 Cost = 0; 14342 return true; 14343 } 14344 return false; 14345 } 14346 14347 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 14348 return Subtarget->hasV6T2Ops(); 14349 } 14350 14351 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 14352 return Subtarget->hasV6T2Ops(); 14353 } 14354 14355 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 14356 AtomicOrdering Ord) const { 14357 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14358 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 14359 bool IsAcquire = isAcquireOrStronger(Ord); 14360 14361 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 14362 // intrinsic must return {i32, i32} and we have to recombine them into a 14363 // single i64 here. 14364 if (ValTy->getPrimitiveSizeInBits() == 64) { 14365 Intrinsic::ID Int = 14366 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 14367 Function *Ldrex = Intrinsic::getDeclaration(M, Int); 14368 14369 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 14370 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 14371 14372 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 14373 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 14374 if (!Subtarget->isLittle()) 14375 std::swap (Lo, Hi); 14376 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 14377 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 14378 return Builder.CreateOr( 14379 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); 14380 } 14381 14382 Type *Tys[] = { Addr->getType() }; 14383 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 14384 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 14385 14386 return Builder.CreateTruncOrBitCast( 14387 Builder.CreateCall(Ldrex, Addr), 14388 cast<PointerType>(Addr->getType())->getElementType()); 14389 } 14390 14391 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 14392 IRBuilder<> &Builder) const { 14393 if (!Subtarget->hasV7Ops()) 14394 return; 14395 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14396 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 14397 } 14398 14399 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 14400 Value *Addr, 14401 AtomicOrdering Ord) const { 14402 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14403 bool IsRelease = isReleaseOrStronger(Ord); 14404 14405 // Since the intrinsics must have legal type, the i64 intrinsics take two 14406 // parameters: "i32, i32". We must marshal Val into the appropriate form 14407 // before the call. 14408 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 14409 Intrinsic::ID Int = 14410 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 14411 Function *Strex = Intrinsic::getDeclaration(M, Int); 14412 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 14413 14414 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 14415 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 14416 if (!Subtarget->isLittle()) 14417 std::swap(Lo, Hi); 14418 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 14419 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 14420 } 14421 14422 Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; 14423 Type *Tys[] = { Addr->getType() }; 14424 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 14425 14426 return Builder.CreateCall( 14427 Strex, {Builder.CreateZExtOrBitCast( 14428 Val, Strex->getFunctionType()->getParamType(0)), 14429 Addr}); 14430 } 14431 14432 /// A helper function for determining the number of interleaved accesses we 14433 /// will generate when lowering accesses of the given type. 14434 unsigned 14435 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 14436 const DataLayout &DL) const { 14437 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 14438 } 14439 14440 bool ARMTargetLowering::isLegalInterleavedAccessType( 14441 VectorType *VecTy, const DataLayout &DL) const { 14442 14443 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 14444 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 14445 14446 // Ensure the vector doesn't have f16 elements. Even though we could do an 14447 // i16 vldN, we can't hold the f16 vectors and will end up converting via 14448 // f32. 14449 if (VecTy->getElementType()->isHalfTy()) 14450 return false; 14451 14452 // Ensure the number of vector elements is greater than 1. 14453 if (VecTy->getNumElements() < 2) 14454 return false; 14455 14456 // Ensure the element type is legal. 14457 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 14458 return false; 14459 14460 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 14461 // 128 will be split into multiple interleaved accesses. 14462 return VecSize == 64 || VecSize % 128 == 0; 14463 } 14464 14465 /// \brief Lower an interleaved load into a vldN intrinsic. 14466 /// 14467 /// E.g. Lower an interleaved load (Factor = 2): 14468 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 14469 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 14470 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 14471 /// 14472 /// Into: 14473 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 14474 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 14475 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 14476 bool ARMTargetLowering::lowerInterleavedLoad( 14477 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 14478 ArrayRef<unsigned> Indices, unsigned Factor) const { 14479 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 14480 "Invalid interleave factor"); 14481 assert(!Shuffles.empty() && "Empty shufflevector input"); 14482 assert(Shuffles.size() == Indices.size() && 14483 "Unmatched number of shufflevectors and indices"); 14484 14485 VectorType *VecTy = Shuffles[0]->getType(); 14486 Type *EltTy = VecTy->getVectorElementType(); 14487 14488 const DataLayout &DL = LI->getModule()->getDataLayout(); 14489 14490 // Skip if we do not have NEON and skip illegal vector types. We can 14491 // "legalize" wide vector types into multiple interleaved accesses as long as 14492 // the vector types are divisible by 128. 14493 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) 14494 return false; 14495 14496 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 14497 14498 // A pointer vector can not be the return type of the ldN intrinsics. Need to 14499 // load integer vectors first and then convert to pointer vectors. 
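  // (For example, shufflevector results of type <4 x i32*> are loaded here as
  // <4 x i32> and converted back with inttoptr once the vldN results are
  // extracted below.)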
14500 if (EltTy->isPointerTy()) 14501 VecTy = 14502 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 14503 14504 IRBuilder<> Builder(LI); 14505 14506 // The base address of the load. 14507 Value *BaseAddr = LI->getPointerOperand(); 14508 14509 if (NumLoads > 1) { 14510 // If we're going to generate more than one load, reset the sub-vector type 14511 // to something legal. 14512 VecTy = VectorType::get(VecTy->getVectorElementType(), 14513 VecTy->getVectorNumElements() / NumLoads); 14514 14515 // We will compute the pointer operand of each load from the original base 14516 // address using GEPs. Cast the base address to a pointer to the scalar 14517 // element type. 14518 BaseAddr = Builder.CreateBitCast( 14519 BaseAddr, VecTy->getVectorElementType()->getPointerTo( 14520 LI->getPointerAddressSpace())); 14521 } 14522 14523 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); 14524 14525 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 14526 Type *Tys[] = {VecTy, Int8Ptr}; 14527 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 14528 Intrinsic::arm_neon_vld3, 14529 Intrinsic::arm_neon_vld4}; 14530 Function *VldnFunc = 14531 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 14532 14533 // Holds sub-vectors extracted from the load intrinsic return values. The 14534 // sub-vectors are associated with the shufflevector instructions they will 14535 // replace. 14536 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 14537 14538 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 14539 // If we're generating more than one load, compute the base address of 14540 // subsequent loads as an offset from the previous. 14541 if (LoadCount > 0) 14542 BaseAddr = Builder.CreateConstGEP1_32( 14543 BaseAddr, VecTy->getVectorNumElements() * Factor); 14544 14545 SmallVector<Value *, 2> Ops; 14546 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 14547 Ops.push_back(Builder.getInt32(LI->getAlignment())); 14548 14549 CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); 14550 14551 // Replace uses of each shufflevector with the corresponding vector loaded 14552 // by ldN. 14553 for (unsigned i = 0; i < Shuffles.size(); i++) { 14554 ShuffleVectorInst *SV = Shuffles[i]; 14555 unsigned Index = Indices[i]; 14556 14557 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 14558 14559 // Convert the integer vector to pointer vector if the element is pointer. 14560 if (EltTy->isPointerTy()) 14561 SubVec = Builder.CreateIntToPtr( 14562 SubVec, VectorType::get(SV->getType()->getVectorElementType(), 14563 VecTy->getVectorNumElements())); 14564 14565 SubVecs[SV].push_back(SubVec); 14566 } 14567 } 14568 14569 // Replace uses of the shufflevector instructions with the sub-vectors 14570 // returned by the load intrinsic. If a shufflevector instruction is 14571 // associated with more than one sub-vector, those sub-vectors will be 14572 // concatenated into a single wide vector. 14573 for (ShuffleVectorInst *SVI : Shuffles) { 14574 auto &SubVec = SubVecs[SVI]; 14575 auto *WideVec = 14576 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 14577 SVI->replaceAllUsesWith(WideVec); 14578 } 14579 14580 return true; 14581 } 14582 14583 /// \brief Lower an interleaved store into a vstN intrinsic. 14584 /// 14585 /// E.g. 
Lower an interleaved store (Factor = 3): 14586 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 14587 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 14588 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 14589 /// 14590 /// Into: 14591 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 14592 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 14593 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 14594 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 14595 /// 14596 /// Note that the new shufflevectors will be removed and we'll only generate one 14597 /// vst3 instruction in CodeGen. 14598 /// 14599 /// Example for a more general valid mask (Factor 3). Lower: 14600 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 14601 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 14602 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 14603 /// 14604 /// Into: 14605 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 14606 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 14607 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 14608 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 14609 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 14610 ShuffleVectorInst *SVI, 14611 unsigned Factor) const { 14612 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 14613 "Invalid interleave factor"); 14614 14615 VectorType *VecTy = SVI->getType(); 14616 assert(VecTy->getVectorNumElements() % Factor == 0 && 14617 "Invalid interleaved store"); 14618 14619 unsigned LaneLen = VecTy->getVectorNumElements() / Factor; 14620 Type *EltTy = VecTy->getVectorElementType(); 14621 VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); 14622 14623 const DataLayout &DL = SI->getModule()->getDataLayout(); 14624 14625 // Skip if we do not have NEON and skip illegal vector types. We can 14626 // "legalize" wide vector types into multiple interleaved accesses as long as 14627 // the vector types are divisible by 128. 14628 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) 14629 return false; 14630 14631 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 14632 14633 Value *Op0 = SVI->getOperand(0); 14634 Value *Op1 = SVI->getOperand(1); 14635 IRBuilder<> Builder(SI); 14636 14637 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 14638 // vectors to integer vectors. 14639 if (EltTy->isPointerTy()) { 14640 Type *IntTy = DL.getIntPtrType(EltTy); 14641 14642 // Convert to the corresponding integer vector. 14643 Type *IntVecTy = 14644 VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); 14645 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 14646 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 14647 14648 SubVecTy = VectorType::get(IntTy, LaneLen); 14649 } 14650 14651 // The base address of the store. 14652 Value *BaseAddr = SI->getPointerOperand(); 14653 14654 if (NumStores > 1) { 14655 // If we're going to generate more than one store, reset the lane length 14656 // and sub-vector type to something legal. 14657 LaneLen /= NumStores; 14658 SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); 14659 14660 // We will compute the pointer operand of each store from the original base 14661 // address using GEPs. Cast the base address to a pointer to the scalar 14662 // element type. 
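    // (For example, a factor-2 store whose shufflevector result is <16 x i32>
    // has SubVecTy <8 x i32>; that is 256 bits, so it is emitted as two vst2
    // calls on <4 x i32> halves, each at its own GEP'd base address.)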
14663 BaseAddr = Builder.CreateBitCast( 14664 BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( 14665 SI->getPointerAddressSpace())); 14666 } 14667 14668 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); 14669 14670 auto Mask = SVI->getShuffleMask(); 14671 14672 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 14673 Type *Tys[] = {Int8Ptr, SubVecTy}; 14674 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 14675 Intrinsic::arm_neon_vst3, 14676 Intrinsic::arm_neon_vst4}; 14677 14678 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 14679 // If we generating more than one store, we compute the base address of 14680 // subsequent stores as an offset from the previous. 14681 if (StoreCount > 0) 14682 BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); 14683 14684 SmallVector<Value *, 6> Ops; 14685 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 14686 14687 Function *VstNFunc = 14688 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 14689 14690 // Split the shufflevector operands into sub vectors for the new vstN call. 14691 for (unsigned i = 0; i < Factor; i++) { 14692 unsigned IdxI = StoreCount * LaneLen * Factor + i; 14693 if (Mask[IdxI] >= 0) { 14694 Ops.push_back(Builder.CreateShuffleVector( 14695 Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); 14696 } else { 14697 unsigned StartMask = 0; 14698 for (unsigned j = 1; j < LaneLen; j++) { 14699 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 14700 if (Mask[IdxJ * Factor + IdxI] >= 0) { 14701 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 14702 break; 14703 } 14704 } 14705 // Note: If all elements in a chunk are undefs, StartMask=0! 14706 // Note: Filling undef gaps with random elements is ok, since 14707 // those elements were being written anyway (with undefs). 
14708 // In the case of all undefs we're defaulting to using elems from 0 14709 // Note: StartMask cannot be negative, it's checked in 14710 // isReInterleaveMask 14711 Ops.push_back(Builder.CreateShuffleVector( 14712 Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); 14713 } 14714 } 14715 14716 Ops.push_back(Builder.getInt32(SI->getAlignment())); 14717 Builder.CreateCall(VstNFunc, Ops); 14718 } 14719 return true; 14720 } 14721 14722 enum HABaseType { 14723 HA_UNKNOWN = 0, 14724 HA_FLOAT, 14725 HA_DOUBLE, 14726 HA_VECT64, 14727 HA_VECT128 14728 }; 14729 14730 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 14731 uint64_t &Members) { 14732 if (auto *ST = dyn_cast<StructType>(Ty)) { 14733 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 14734 uint64_t SubMembers = 0; 14735 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 14736 return false; 14737 Members += SubMembers; 14738 } 14739 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 14740 uint64_t SubMembers = 0; 14741 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 14742 return false; 14743 Members += SubMembers * AT->getNumElements(); 14744 } else if (Ty->isFloatTy()) { 14745 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 14746 return false; 14747 Members = 1; 14748 Base = HA_FLOAT; 14749 } else if (Ty->isDoubleTy()) { 14750 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 14751 return false; 14752 Members = 1; 14753 Base = HA_DOUBLE; 14754 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 14755 Members = 1; 14756 switch (Base) { 14757 case HA_FLOAT: 14758 case HA_DOUBLE: 14759 return false; 14760 case HA_VECT64: 14761 return VT->getBitWidth() == 64; 14762 case HA_VECT128: 14763 return VT->getBitWidth() == 128; 14764 case HA_UNKNOWN: 14765 switch (VT->getBitWidth()) { 14766 case 64: 14767 Base = HA_VECT64; 14768 return true; 14769 case 128: 14770 Base = HA_VECT128; 14771 return true; 14772 default: 14773 return false; 14774 } 14775 } 14776 } 14777 14778 return (Members > 0 && Members <= 4); 14779 } 14780 14781 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 14782 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 14783 /// passing according to AAPCS rules. 14784 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 14785 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 14786 if (getEffectiveCallingConv(CallConv, isVarArg) != 14787 CallingConv::ARM_AAPCS_VFP) 14788 return false; 14789 14790 HABaseType Base = HA_UNKNOWN; 14791 uint64_t Members = 0; 14792 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 14793 DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 14794 14795 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 14796 return IsHA || IsIntArray; 14797 } 14798 14799 unsigned ARMTargetLowering::getExceptionPointerRegister( 14800 const Constant *PersonalityFn) const { 14801 // Platforms which do not use SjLj EH may return values in these registers 14802 // via the personality function. 14803 return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; 14804 } 14805 14806 unsigned ARMTargetLowering::getExceptionSelectorRegister( 14807 const Constant *PersonalityFn) const { 14808 // Platforms which do not use SjLj EH may return values in these registers 14809 // via the personality function. 14810 return Subtarget->useSjLjEH() ? 
  ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}