//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
"llvm/Support/BranchProbability.h" 92 #include "llvm/Support/Casting.h" 93 #include "llvm/Support/CodeGen.h" 94 #include "llvm/Support/CommandLine.h" 95 #include "llvm/Support/Compiler.h" 96 #include "llvm/Support/Debug.h" 97 #include "llvm/Support/ErrorHandling.h" 98 #include "llvm/Support/KnownBits.h" 99 #include "llvm/Support/MachineValueType.h" 100 #include "llvm/Support/MathExtras.h" 101 #include "llvm/Support/raw_ostream.h" 102 #include "llvm/Target/TargetMachine.h" 103 #include "llvm/Target/TargetOptions.h" 104 #include <algorithm> 105 #include <cassert> 106 #include <cstdint> 107 #include <cstdlib> 108 #include <iterator> 109 #include <limits> 110 #include <string> 111 #include <tuple> 112 #include <utility> 113 #include <vector> 114 115 using namespace llvm; 116 using namespace llvm::PatternMatch; 117 118 #define DEBUG_TYPE "arm-isel" 119 120 STATISTIC(NumTailCalls, "Number of tail calls"); 121 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 122 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 123 STATISTIC(NumConstpoolPromoted, 124 "Number of constants with their storage promoted into constant pools"); 125 126 static cl::opt<bool> 127 ARMInterworking("arm-interworking", cl::Hidden, 128 cl::desc("Enable / disable ARM interworking (for debugging only)"), 129 cl::init(true)); 130 131 static cl::opt<bool> EnableConstpoolPromotion( 132 "arm-promote-constant", cl::Hidden, 133 cl::desc("Enable / disable promotion of unnamed_addr constants into " 134 "constant pools"), 135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 136 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 137 "arm-promote-constant-max-size", cl::Hidden, 138 cl::desc("Maximum size of constant to promote into a constant pool"), 139 cl::init(64)); 140 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 141 "arm-promote-constant-max-total", cl::Hidden, 142 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 143 cl::init(128)); 144 145 // The APCS parameter registers. 
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Promote all bit-wise operations.
  if (VT.isInteger() && VT != PromotedBitwiseVT) {
    setOperationAction(ISD::AND, VT, Promote);
    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
    setOperationAction(ISD::OR, VT, Promote);
    AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
    setOperationAction(ISD::XOR, VT, Promote);
    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
  }

  // Neon does not support vector divide/remainder operations.
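  // Marking them Expand lets the legalizer scalarize them, with the scalar
  // pieces falling back to libcalls where the target has no instruction.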
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
        { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
        { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  for (MVT VT : MVT::vector_valuetypes()) {
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
    }

    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

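  // 64-bit named-register reads/writes need custom lowering so a 64-bit value
  // can be modelled as a pair of 32-bit GPR accesses.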
  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP support any arithmetic operations on it.
    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
    // supported for v4f32.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way when "copysign" appears in DAG with vector
    // operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);

    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
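    // The custom lowering instead goes through a wider integer type where
    // needed (e.g. v4i16 is extended to v4i32 before the conversion).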
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine(ISD::INTRINSIC_VOID);
    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::FP_TO_SINT);
    setTargetDAGCombine(ISD::FP_TO_UINT);
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::LOAD);

    // It is legal to extload from v4i8 to v4i16 or v4i32.
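    // The loop below registers the same for every integer vector result type
    // and each of the narrow memory types listed.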
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }

  if (Subtarget->isFPOnlySP()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
    // loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
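  // That is, the pre-increment, pre-decrement, post-increment and
  // post-decrement addressing modes covered by ISD::PRE_INC..ISD::POST_DEC.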
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the cpu doesn't have HW divider.
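    // (On AEABI targets these resolve to __aeabi_idiv / __aeabi_uidiv via the
    // RTLIB names registered earlier in this constructor.)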
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }

  if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
    for (auto &VT : {MVT::f32, MVT::f64})
      setOperationAction(ISD::FPOWI, VT, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
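  // VASTART still gets custom lowering so it can store the address of the
  // first vararg slot (the function's VarArgsFrameIndex) into the va_list.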
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
    // to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If the target has DMB in Thumb mode, fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
    // iff target supports vfp2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }
  }

  // Use __sincos_stret if available.
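  // (These libcalls are typically only registered for Darwin-based targets,
  // so the check below effectively limits the custom FSINCOS lowering to
  // those platforms.)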
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    if (!Subtarget->isFPOnlySP()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  if (Subtarget->hasNEON()) {
    // vmin and vmax aren't available in a scalar form, so we use
    // a NEON instruction with an undef lane instead.
    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);

      setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
    }
  }

  // We have target-specific dag combine patterns for the following nodes:
  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);

  if (Subtarget->hasV6Ops())
    setTargetDAGCombine(ISD::SRL);
  if (Subtarget->isThumb1Only())
    setTargetDAGCombine(ISD::SHL);

  setStackPointerRegisterToSaveRestore(ARM::SP);

  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
      !Subtarget->hasVFP2())
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Hybrid);

  //// temporary - rewrite interface to use type
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;
  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 2;
  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 2;

  // On ARM arguments smaller than 4 bytes are extended, so all arguments
  // are at least 4 bytes aligned.
  setMinStackArgumentAlignment(4);

  // Prefer likely predicted branches to selects on out-of-order cores.
  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());

  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);

  if (Subtarget->isThumb() || Subtarget->isThumb2())
    setTargetDAGCombine(ISD::ABS);
}

bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}

// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER: break;
  case ARMISD::Wrapper: return "ARMISD::Wrapper";
  case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
  case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
  case ARMISD::CALL: return "ARMISD::CALL";
  case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
  case ARMISD::BRCOND: return "ARMISD::BRCOND";
  case ARMISD::BR_JT: return "ARMISD::BR_JT";
  case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
  case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
  case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
  case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
  case ARMISD::CMP: return "ARMISD::CMP";
  case ARMISD::CMN: return "ARMISD::CMN";
  case ARMISD::CMPZ: return "ARMISD::CMPZ";
  case ARMISD::CMPFP: return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
  case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
  case ARMISD::FMSTAT: return "ARMISD::FMSTAT";

  case ARMISD::CMOV: return "ARMISD::CMOV";
  case ARMISD::SUBS: return "ARMISD::SUBS";

  case ARMISD::SSAT: return "ARMISD::SSAT";
  case ARMISD::USAT: return "ARMISD::USAT";

  case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
  case ARMISD::RRX: return "ARMISD::RRX";

  case ARMISD::ADDC: return "ARMISD::ADDC";
  case ARMISD::ADDE: return "ARMISD::ADDE";
  case ARMISD::SUBC: return "ARMISD::SUBC";
  case ARMISD::SUBE: return "ARMISD::SUBE";

  case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
  case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
  case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
  case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
  case ARMISD::VMOVSR: return "ARMISD::VMOVSR";

  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

  case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";

  case ARMISD::THREAD_POINTER: return "ARMISD::THREAD_POINTER";

  case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";

  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

  case ARMISD::PRELOAD: return "ARMISD::PRELOAD";

  case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
  case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";

  case ARMISD::VCEQ: return "ARMISD::VCEQ";
  case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
  case ARMISD::VCGE: return "ARMISD::VCGE";
  case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
  case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
  case ARMISD::VCGEU: return "ARMISD::VCGEU";
  case ARMISD::VCGT: return "ARMISD::VCGT";
  case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
return "ARMISD::VCGTZ"; 1337 case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; 1338 case ARMISD::VCGTU: return "ARMISD::VCGTU"; 1339 case ARMISD::VTST: return "ARMISD::VTST"; 1340 1341 case ARMISD::VSHL: return "ARMISD::VSHL"; 1342 case ARMISD::VSHRs: return "ARMISD::VSHRs"; 1343 case ARMISD::VSHRu: return "ARMISD::VSHRu"; 1344 case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; 1345 case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; 1346 case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; 1347 case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; 1348 case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; 1349 case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; 1350 case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; 1351 case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; 1352 case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; 1353 case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; 1354 case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; 1355 case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; 1356 case ARMISD::VSLI: return "ARMISD::VSLI"; 1357 case ARMISD::VSRI: return "ARMISD::VSRI"; 1358 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; 1359 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; 1360 case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; 1361 case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; 1362 case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; 1363 case ARMISD::VDUP: return "ARMISD::VDUP"; 1364 case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; 1365 case ARMISD::VEXT: return "ARMISD::VEXT"; 1366 case ARMISD::VREV64: return "ARMISD::VREV64"; 1367 case ARMISD::VREV32: return "ARMISD::VREV32"; 1368 case ARMISD::VREV16: return "ARMISD::VREV16"; 1369 case ARMISD::VZIP: return "ARMISD::VZIP"; 1370 case ARMISD::VUZP: return "ARMISD::VUZP"; 1371 case ARMISD::VTRN: return "ARMISD::VTRN"; 1372 case ARMISD::VTBL1: return "ARMISD::VTBL1"; 1373 case ARMISD::VTBL2: return "ARMISD::VTBL2"; 1374 case ARMISD::VMULLs: return "ARMISD::VMULLs"; 1375 case ARMISD::VMULLu: return "ARMISD::VMULLu"; 1376 case ARMISD::UMAAL: return "ARMISD::UMAAL"; 1377 case ARMISD::UMLAL: return "ARMISD::UMLAL"; 1378 case ARMISD::SMLAL: return "ARMISD::SMLAL"; 1379 case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; 1380 case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; 1381 case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; 1382 case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; 1383 case ARMISD::SMULWB: return "ARMISD::SMULWB"; 1384 case ARMISD::SMULWT: return "ARMISD::SMULWT"; 1385 case ARMISD::SMLALD: return "ARMISD::SMLALD"; 1386 case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; 1387 case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; 1388 case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; 1389 case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; 1390 case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; 1391 case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; 1392 case ARMISD::BFI: return "ARMISD::BFI"; 1393 case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; 1394 case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; 1395 case ARMISD::VBSL: return "ARMISD::VBSL"; 1396 case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; 1397 case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; 1398 case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; 1399 case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; 1400 case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; 1401 case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; 1402 case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; 1403 case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; 1404 case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; 1405 case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; 1406 case 
ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; 1407 case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; 1408 case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; 1409 case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; 1410 case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; 1411 case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; 1412 case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; 1413 case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; 1414 case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; 1415 case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; 1416 case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; 1417 case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; 1418 case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; 1419 } 1420 return nullptr; 1421 } 1422 1423 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1424 EVT VT) const { 1425 if (!VT.isVector()) 1426 return getPointerTy(DL); 1427 return VT.changeVectorElementTypeToInteger(); 1428 } 1429 1430 /// getRegClassFor - Return the register class that should be used for the 1431 /// specified value type. 1432 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { 1433 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1434 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1435 // load / store 4 to 8 consecutive D registers. 1436 if (Subtarget->hasNEON()) { 1437 if (VT == MVT::v4i64) 1438 return &ARM::QQPRRegClass; 1439 if (VT == MVT::v8i64) 1440 return &ARM::QQQQPRRegClass; 1441 } 1442 return TargetLowering::getRegClassFor(VT); 1443 } 1444 1445 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1446 // source/dest is aligned and the copy size is large enough. We therefore want 1447 // to align such objects passed to memory intrinsics. 1448 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1449 unsigned &PrefAlign) const { 1450 if (!isa<MemIntrinsic>(CI)) 1451 return false; 1452 MinSize = 8; 1453 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1454 // cycle faster than 4-byte aligned LDM. 1455 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1456 return true; 1457 } 1458 1459 // Create a fast isel object. 1460 FastISel * 1461 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1462 const TargetLibraryInfo *libInfo) const { 1463 return ARM::createFastISel(funcInfo, libInfo); 1464 } 1465 1466 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1467 unsigned NumVals = N->getNumValues(); 1468 if (!NumVals) 1469 return Sched::RegPressure; 1470 1471 for (unsigned i = 0; i != NumVals; ++i) { 1472 EVT VT = N->getValueType(i); 1473 if (VT == MVT::Glue || VT == MVT::Other) 1474 continue; 1475 if (VT.isFloatingPoint() || VT.isVector()) 1476 return Sched::ILP; 1477 } 1478 1479 if (!N->isMachineOpcode()) 1480 return Sched::RegPressure; 1481 1482 // Load are scheduled for latency even if there instruction itinerary 1483 // is not available. 
1484 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1485 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1486 1487 if (MCID.getNumDefs() == 0) 1488 return Sched::RegPressure; 1489 if (!Itins->isEmpty() && 1490 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1491 return Sched::ILP; 1492 1493 return Sched::RegPressure; 1494 } 1495 1496 //===----------------------------------------------------------------------===// 1497 // Lowering Code 1498 //===----------------------------------------------------------------------===// 1499 1500 static bool isSRL16(const SDValue &Op) { 1501 if (Op.getOpcode() != ISD::SRL) 1502 return false; 1503 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1504 return Const->getZExtValue() == 16; 1505 return false; 1506 } 1507 1508 static bool isSRA16(const SDValue &Op) { 1509 if (Op.getOpcode() != ISD::SRA) 1510 return false; 1511 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1512 return Const->getZExtValue() == 16; 1513 return false; 1514 } 1515 1516 static bool isSHL16(const SDValue &Op) { 1517 if (Op.getOpcode() != ISD::SHL) 1518 return false; 1519 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1520 return Const->getZExtValue() == 16; 1521 return false; 1522 } 1523 1524 // Check for a signed 16-bit value. We special case SRA because it makes it 1525 // more simple when also looking for SRAs that aren't sign extending a 1526 // smaller value. Without the check, we'd need to take extra care with 1527 // checking order for some operations. 1528 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1529 if (isSRA16(Op)) 1530 return isSHL16(Op.getOperand(0)); 1531 return DAG.ComputeNumSignBits(Op) == 17; 1532 } 1533 1534 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1535 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1536 switch (CC) { 1537 default: llvm_unreachable("Unknown condition code!"); 1538 case ISD::SETNE: return ARMCC::NE; 1539 case ISD::SETEQ: return ARMCC::EQ; 1540 case ISD::SETGT: return ARMCC::GT; 1541 case ISD::SETGE: return ARMCC::GE; 1542 case ISD::SETLT: return ARMCC::LT; 1543 case ISD::SETLE: return ARMCC::LE; 1544 case ISD::SETUGT: return ARMCC::HI; 1545 case ISD::SETUGE: return ARMCC::HS; 1546 case ISD::SETULT: return ARMCC::LO; 1547 case ISD::SETULE: return ARMCC::LS; 1548 } 1549 } 1550 1551 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 
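/// Illustrative example (see the switch below): SETONE has no single ARM
/// condition code, so it is returned as the pair (MI, GT); callers emit one
/// conditional operation per code, and the ordered less-than plus ordered
/// greater-than tests together cover "ordered and not equal".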
1552 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1553 ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { 1554 CondCode2 = ARMCC::AL; 1555 InvalidOnQNaN = true; 1556 switch (CC) { 1557 default: llvm_unreachable("Unknown FP condition!"); 1558 case ISD::SETEQ: 1559 case ISD::SETOEQ: 1560 CondCode = ARMCC::EQ; 1561 InvalidOnQNaN = false; 1562 break; 1563 case ISD::SETGT: 1564 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1565 case ISD::SETGE: 1566 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1567 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1568 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1569 case ISD::SETONE: 1570 CondCode = ARMCC::MI; 1571 CondCode2 = ARMCC::GT; 1572 InvalidOnQNaN = false; 1573 break; 1574 case ISD::SETO: CondCode = ARMCC::VC; break; 1575 case ISD::SETUO: CondCode = ARMCC::VS; break; 1576 case ISD::SETUEQ: 1577 CondCode = ARMCC::EQ; 1578 CondCode2 = ARMCC::VS; 1579 InvalidOnQNaN = false; 1580 break; 1581 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1582 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1583 case ISD::SETLT: 1584 case ISD::SETULT: CondCode = ARMCC::LT; break; 1585 case ISD::SETLE: 1586 case ISD::SETULE: CondCode = ARMCC::LE; break; 1587 case ISD::SETNE: 1588 case ISD::SETUNE: 1589 CondCode = ARMCC::NE; 1590 InvalidOnQNaN = false; 1591 break; 1592 } 1593 } 1594 1595 //===----------------------------------------------------------------------===// 1596 // Calling Convention Implementation 1597 //===----------------------------------------------------------------------===// 1598 1599 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1600 /// account presence of floating point hardware and calling convention 1601 /// limitations, such as support for variadic functions. 1602 CallingConv::ID 1603 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1604 bool isVarArg) const { 1605 switch (CC) { 1606 default: 1607 report_fatal_error("Unsupported calling convention"); 1608 case CallingConv::ARM_AAPCS: 1609 case CallingConv::ARM_APCS: 1610 case CallingConv::GHC: 1611 return CC; 1612 case CallingConv::PreserveMost: 1613 return CallingConv::PreserveMost; 1614 case CallingConv::ARM_AAPCS_VFP: 1615 case CallingConv::Swift: 1616 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 1617 case CallingConv::C: 1618 if (!Subtarget->isAAPCS_ABI()) 1619 return CallingConv::ARM_APCS; 1620 else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && 1621 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 1622 !isVarArg) 1623 return CallingConv::ARM_AAPCS_VFP; 1624 else 1625 return CallingConv::ARM_AAPCS; 1626 case CallingConv::Fast: 1627 case CallingConv::CXX_FAST_TLS: 1628 if (!Subtarget->isAAPCS_ABI()) { 1629 if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1630 return CallingConv::Fast; 1631 return CallingConv::ARM_APCS; 1632 } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) 1633 return CallingConv::ARM_AAPCS_VFP; 1634 else 1635 return CallingConv::ARM_AAPCS; 1636 } 1637 } 1638 1639 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1640 bool isVarArg) const { 1641 return CCAssignFnForNode(CC, false, isVarArg); 1642 } 1643 1644 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1645 bool isVarArg) const { 1646 return CCAssignFnForNode(CC, true, isVarArg); 1647 } 1648 1649 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 1650 /// CallingConvention. 
1651 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 1652 bool Return, 1653 bool isVarArg) const { 1654 switch (getEffectiveCallingConv(CC, isVarArg)) { 1655 default: 1656 report_fatal_error("Unsupported calling convention"); 1657 case CallingConv::ARM_APCS: 1658 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 1659 case CallingConv::ARM_AAPCS: 1660 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1661 case CallingConv::ARM_AAPCS_VFP: 1662 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 1663 case CallingConv::Fast: 1664 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 1665 case CallingConv::GHC: 1666 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); 1667 case CallingConv::PreserveMost: 1668 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 1669 } 1670 } 1671 1672 /// LowerCallResult - Lower the result values of a call into the 1673 /// appropriate copies out of appropriate physical registers. 1674 SDValue ARMTargetLowering::LowerCallResult( 1675 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 1676 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 1677 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 1678 SDValue ThisVal) const { 1679 // Assign locations to each value returned by this call. 1680 SmallVector<CCValAssign, 16> RVLocs; 1681 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1682 *DAG.getContext()); 1683 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 1684 1685 // Copy all of the result registers out of their specified physreg. 1686 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1687 CCValAssign VA = RVLocs[i]; 1688 1689 // Pass 'this' value directly from the argument to return value, to avoid 1690 // reg unit interference 1691 if (i == 0 && isThisReturn) { 1692 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 1693 "unexpected return calling convention register assignment"); 1694 InVals.push_back(ThisVal); 1695 continue; 1696 } 1697 1698 SDValue Val; 1699 if (VA.needsCustom()) { 1700 // Handle f64 or half of a v2f64. 
1701 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1702 InFlag); 1703 Chain = Lo.getValue(1); 1704 InFlag = Lo.getValue(2); 1705 VA = RVLocs[++i]; // skip ahead to next loc 1706 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 1707 InFlag); 1708 Chain = Hi.getValue(1); 1709 InFlag = Hi.getValue(2); 1710 if (!Subtarget->isLittle()) 1711 std::swap (Lo, Hi); 1712 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1713 1714 if (VA.getLocVT() == MVT::v2f64) { 1715 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 1716 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1717 DAG.getConstant(0, dl, MVT::i32)); 1718 1719 VA = RVLocs[++i]; // skip ahead to next loc 1720 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1721 Chain = Lo.getValue(1); 1722 InFlag = Lo.getValue(2); 1723 VA = RVLocs[++i]; // skip ahead to next loc 1724 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 1725 Chain = Hi.getValue(1); 1726 InFlag = Hi.getValue(2); 1727 if (!Subtarget->isLittle()) 1728 std::swap (Lo, Hi); 1729 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 1730 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 1731 DAG.getConstant(1, dl, MVT::i32)); 1732 } 1733 } else { 1734 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1735 InFlag); 1736 Chain = Val.getValue(1); 1737 InFlag = Val.getValue(2); 1738 } 1739 1740 switch (VA.getLocInfo()) { 1741 default: llvm_unreachable("Unknown loc info!"); 1742 case CCValAssign::Full: break; 1743 case CCValAssign::BCvt: 1744 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1745 break; 1746 } 1747 1748 InVals.push_back(Val); 1749 } 1750 1751 return Chain; 1752 } 1753 1754 /// LowerMemOpCallTo - Store the argument to the stack. 1755 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 1756 SDValue Arg, const SDLoc &dl, 1757 SelectionDAG &DAG, 1758 const CCValAssign &VA, 1759 ISD::ArgFlagsTy Flags) const { 1760 unsigned LocMemOffset = VA.getLocMemOffset(); 1761 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1762 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 1763 StackPtr, PtrOff); 1764 return DAG.getStore( 1765 Chain, dl, Arg, PtrOff, 1766 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 1767 } 1768 1769 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 1770 SDValue Chain, SDValue &Arg, 1771 RegsToPassVector &RegsToPass, 1772 CCValAssign &VA, CCValAssign &NextVA, 1773 SDValue &StackPtr, 1774 SmallVectorImpl<SDValue> &MemOpChains, 1775 ISD::ArgFlagsTy Flags) const { 1776 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 1777 DAG.getVTList(MVT::i32, MVT::i32), Arg); 1778 unsigned id = Subtarget->isLittle() ? 0 : 1; 1779 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 1780 1781 if (NextVA.isRegLoc()) 1782 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 1783 else { 1784 assert(NextVA.isMemLoc()); 1785 if (!StackPtr.getNode()) 1786 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 1787 getPointerTy(DAG.getDataLayout())); 1788 1789 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 1790 dl, DAG, NextVA, 1791 Flags)); 1792 } 1793 } 1794 1795 /// LowerCall - Lowering a call into a callseq_start <- 1796 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 1797 /// nodes. 
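/// Roughly, the DAG produced for a simple non-tail call looks like:
///
///   callseq_start
///     -> stores of stack arguments / CopyToReg of register arguments
///     -> ARMISD::CALL carrying the callee, the argument registers and the
///        call-preserved register mask, glued to the copies
///   callseq_end
///     -> CopyFromReg of each result (see LowerCallResult)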
1798 SDValue 1799 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1800 SmallVectorImpl<SDValue> &InVals) const { 1801 SelectionDAG &DAG = CLI.DAG; 1802 SDLoc &dl = CLI.DL; 1803 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1804 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1805 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1806 SDValue Chain = CLI.Chain; 1807 SDValue Callee = CLI.Callee; 1808 bool &isTailCall = CLI.IsTailCall; 1809 CallingConv::ID CallConv = CLI.CallConv; 1810 bool doesNotRet = CLI.DoesNotReturn; 1811 bool isVarArg = CLI.IsVarArg; 1812 1813 MachineFunction &MF = DAG.getMachineFunction(); 1814 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 1815 bool isThisReturn = false; 1816 bool isSibCall = false; 1817 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); 1818 1819 // Disable tail calls if they're not supported. 1820 if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") 1821 isTailCall = false; 1822 1823 if (isTailCall) { 1824 // Check if it's really possible to do a tail call. 1825 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1826 isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), 1827 Outs, OutVals, Ins, DAG); 1828 if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) 1829 report_fatal_error("failed to perform tail call elimination on a call " 1830 "site marked musttail"); 1831 // We don't support GuaranteedTailCallOpt for ARM, only automatically 1832 // detected sibcalls. 1833 if (isTailCall) { 1834 ++NumTailCalls; 1835 isSibCall = true; 1836 } 1837 } 1838 1839 // Analyze operands of the call, assigning locations to each operand. 1840 SmallVector<CCValAssign, 16> ArgLocs; 1841 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 1842 *DAG.getContext()); 1843 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 1844 1845 // Get a count of how many bytes are to be pushed on the stack. 1846 unsigned NumBytes = CCInfo.getNextStackOffset(); 1847 1848 // For tail calls, memory operands are available in our caller's stack. 1849 if (isSibCall) 1850 NumBytes = 0; 1851 1852 // Adjust the stack pointer for the new arguments... 1853 // These operations are automatically eliminated by the prolog/epilog pass 1854 if (!isSibCall) 1855 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 1856 1857 SDValue StackPtr = 1858 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 1859 1860 RegsToPassVector RegsToPass; 1861 SmallVector<SDValue, 8> MemOpChains; 1862 1863 // Walk the register/memloc assignments, inserting copies/loads. In the case 1864 // of tail call optimization, arguments are handled later. 1865 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 1866 i != e; 1867 ++i, ++realArgIdx) { 1868 CCValAssign &VA = ArgLocs[i]; 1869 SDValue Arg = OutVals[realArgIdx]; 1870 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 1871 bool isByVal = Flags.isByVal(); 1872 1873 // Promote the value if needed. 
1874 switch (VA.getLocInfo()) { 1875 default: llvm_unreachable("Unknown loc info!"); 1876 case CCValAssign::Full: break; 1877 case CCValAssign::SExt: 1878 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1879 break; 1880 case CCValAssign::ZExt: 1881 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1882 break; 1883 case CCValAssign::AExt: 1884 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1885 break; 1886 case CCValAssign::BCvt: 1887 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1888 break; 1889 } 1890 1891 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 1892 if (VA.needsCustom()) { 1893 if (VA.getLocVT() == MVT::v2f64) { 1894 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1895 DAG.getConstant(0, dl, MVT::i32)); 1896 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 1897 DAG.getConstant(1, dl, MVT::i32)); 1898 1899 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, 1900 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1901 1902 VA = ArgLocs[++i]; // skip ahead to next loc 1903 if (VA.isRegLoc()) { 1904 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, 1905 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); 1906 } else { 1907 assert(VA.isMemLoc()); 1908 1909 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, 1910 dl, DAG, VA, Flags)); 1911 } 1912 } else { 1913 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 1914 StackPtr, MemOpChains, Flags); 1915 } 1916 } else if (VA.isRegLoc()) { 1917 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 1918 Outs[0].VT == MVT::i32) { 1919 assert(VA.getLocVT() == MVT::i32 && 1920 "unexpected calling convention register assignment"); 1921 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 1922 "unexpected use of 'returned'"); 1923 isThisReturn = true; 1924 } 1925 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1926 } else if (isByVal) { 1927 assert(VA.isMemLoc()); 1928 unsigned offset = 0; 1929 1930 // True if this byval aggregate will be split between registers 1931 // and memory. 1932 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 1933 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 1934 1935 if (CurByValIdx < ByValArgsCount) { 1936 1937 unsigned RegBegin, RegEnd; 1938 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 1939 1940 EVT PtrVT = 1941 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 1942 unsigned int i, j; 1943 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 1944 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 1945 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 1946 SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, 1947 MachinePointerInfo(), 1948 DAG.InferPtrAlignment(AddArg)); 1949 MemOpChains.push_back(Load.getValue(1)); 1950 RegsToPass.push_back(std::make_pair(j, Load)); 1951 } 1952 1953 // If parameter size outsides register area, "offset" value 1954 // helps us to calculate stack slot for remained part properly. 
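        // Worked example (illustrative): for a 24-byte byval split so that
        // RegBegin == r2 and RegEnd == r4, the loop above loads 8 bytes into
        // r2/r3, "offset" becomes 2, and the remaining 24 - 4*2 = 16 bytes
        // are copied to the outgoing stack area by COPY_STRUCT_BYVAL below.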
1955 offset = RegEnd - RegBegin; 1956 1957 CCInfo.nextInRegsParam(); 1958 } 1959 1960 if (Flags.getByValSize() > 4*offset) { 1961 auto PtrVT = getPointerTy(DAG.getDataLayout()); 1962 unsigned LocMemOffset = VA.getLocMemOffset(); 1963 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 1964 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 1965 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 1966 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 1967 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 1968 MVT::i32); 1969 SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, 1970 MVT::i32); 1971 1972 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 1973 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 1974 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 1975 Ops)); 1976 } 1977 } else if (!isSibCall) { 1978 assert(VA.isMemLoc()); 1979 1980 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1981 dl, DAG, VA, Flags)); 1982 } 1983 } 1984 1985 if (!MemOpChains.empty()) 1986 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 1987 1988 // Build a sequence of copy-to-reg nodes chained together with token chain 1989 // and flag operands which copy the outgoing args into the appropriate regs. 1990 SDValue InFlag; 1991 // Tail call byval lowering might overwrite argument registers so in case of 1992 // tail call optimization the copies to registers are lowered later. 1993 if (!isTailCall) 1994 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1995 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1996 RegsToPass[i].second, InFlag); 1997 InFlag = Chain.getValue(1); 1998 } 1999 2000 // For tail calls lower the arguments to the 'real' stack slot. 2001 if (isTailCall) { 2002 // Force all the incoming stack arguments to be loaded from the stack 2003 // before any new outgoing arguments are stored to the stack, because the 2004 // outgoing stack slots may alias the incoming argument stack slots, and 2005 // the alias isn't otherwise explicit. This is slightly more conservative 2006 // than necessary, because it means that each store effectively depends 2007 // on every argument instead of just those arguments it would clobber. 2008 2009 // Do not flag preceding copytoreg stuff together with the following stuff. 2010 InFlag = SDValue(); 2011 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2012 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2013 RegsToPass[i].second, InFlag); 2014 InFlag = Chain.getValue(1); 2015 } 2016 InFlag = SDValue(); 2017 } 2018 2019 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2020 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2021 // node so that legalize doesn't hack it. 
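  // For instance (illustrative), with -mlong-calls a direct call to a global
  // is lowered through a constant-pool load of the callee's address followed
  // by an indirect call, roughly:
  //   ldr  rX, .LCPIn_m   @ pool entry holding the address of the callee
  //   blx  rX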
2022 bool isDirect = false; 2023 2024 const TargetMachine &TM = getTargetMachine(); 2025 const Module *Mod = MF.getFunction().getParent(); 2026 const GlobalValue *GV = nullptr; 2027 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2028 GV = G->getGlobal(); 2029 bool isStub = 2030 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2031 2032 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2033 bool isLocalARMFunc = false; 2034 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2035 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2036 2037 if (Subtarget->genLongCalls()) { 2038 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2039 "long-calls codegen is not position independent!"); 2040 // Handle a global address or an external symbol. If it's not one of 2041 // those, the target's already in a register, so we don't need to do 2042 // anything extra. 2043 if (isa<GlobalAddressSDNode>(Callee)) { 2044 // Create a constant pool entry for the callee address 2045 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2046 ARMConstantPoolValue *CPV = 2047 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2048 2049 // Get the address of the callee into a register 2050 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2051 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2052 Callee = DAG.getLoad( 2053 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2054 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2055 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2056 const char *Sym = S->getSymbol(); 2057 2058 // Create a constant pool entry for the callee address 2059 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2060 ARMConstantPoolValue *CPV = 2061 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2062 ARMPCLabelIndex, 0); 2063 // Get the address of the callee into a register 2064 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2065 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2066 Callee = DAG.getLoad( 2067 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2068 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2069 } 2070 } else if (isa<GlobalAddressSDNode>(Callee)) { 2071 // If we're optimizing for minimum size and the function is called three or 2072 // more times in this block, we can improve codesize by calling indirectly 2073 // as BLXr has a 16-bit encoding. 2074 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2075 auto *BB = CLI.CS.getParent(); 2076 bool PreferIndirect = 2077 Subtarget->isThumb() && Subtarget->hasMinSize() && 2078 count_if(GV->users(), [&BB](const User *U) { 2079 return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; 2080 }) > 2; 2081 2082 if (!PreferIndirect) { 2083 isDirect = true; 2084 bool isDef = GV->isStrongDefinitionForLinker(); 2085 2086 // ARM call to a local ARM function is predicable. 2087 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2088 // tBX takes a register source operand. 
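      // (Pre-v5T Thumb has no blx, so when the callee has to be reached via a
      // MachO stub / non-lazy pointer we load its address below and let the
      // call go through a register.)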
2089 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2090 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2091 Callee = DAG.getNode( 2092 ARMISD::WrapperPIC, dl, PtrVt, 2093 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2094 Callee = DAG.getLoad( 2095 PtrVt, dl, DAG.getEntryNode(), Callee, 2096 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2097 /* Alignment = */ 0, MachineMemOperand::MODereferenceable | 2098 MachineMemOperand::MOInvariant); 2099 } else if (Subtarget->isTargetCOFF()) { 2100 assert(Subtarget->isTargetWindows() && 2101 "Windows is the only supported COFF target"); 2102 unsigned TargetFlags = GV->hasDLLImportStorageClass() 2103 ? ARMII::MO_DLLIMPORT 2104 : ARMII::MO_NO_FLAG; 2105 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, 2106 TargetFlags); 2107 if (GV->hasDLLImportStorageClass()) 2108 Callee = 2109 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2110 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2111 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2112 } else { 2113 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2114 } 2115 } 2116 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2117 isDirect = true; 2118 // tBX takes a register source operand. 2119 const char *Sym = S->getSymbol(); 2120 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2121 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2122 ARMConstantPoolValue *CPV = 2123 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2124 ARMPCLabelIndex, 4); 2125 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); 2126 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2127 Callee = DAG.getLoad( 2128 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2129 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2130 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2131 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2132 } else { 2133 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2134 } 2135 } 2136 2137 // FIXME: handle tail calls differently. 2138 unsigned CallOpc; 2139 if (Subtarget->isThumb()) { 2140 if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2141 CallOpc = ARMISD::CALL_NOLINK; 2142 else 2143 CallOpc = ARMISD::CALL; 2144 } else { 2145 if (!isDirect && !Subtarget->hasV5TOps()) 2146 CallOpc = ARMISD::CALL_NOLINK; 2147 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2148 // Emit regular call when code size is the priority 2149 !Subtarget->hasMinSize()) 2150 // "mov lr, pc; b _foo" to avoid confusing the RSP 2151 CallOpc = ARMISD::CALL_NOLINK; 2152 else 2153 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2154 } 2155 2156 std::vector<SDValue> Ops; 2157 Ops.push_back(Chain); 2158 Ops.push_back(Callee); 2159 2160 // Add argument registers to the end of the list so that they are known live 2161 // into the call. 2162 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2163 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2164 RegsToPass[i].second.getValueType())); 2165 2166 // Add a register mask operand representing the call-preserved registers. 
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}

/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack. Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval (as with any stack) slots are always at least 4-byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR regs. In that case we can't split the
  // parameter; we must send it to the stack. We also must set the NCRN to R4,
  // wasting all remaining registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // otherwise the parameter is split between registers and stack, and the end
  // register is r4 in that case.
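  // Worked example (illustrative): for an 8-byte-aligned, 12-byte byval with
  // r1 as the next free register, r1 is discarded to satisfy the alignment,
  // ByValRegBegin becomes r2 and ByValRegEnd becomes r4; r2-r3 carry 8 bytes
  // and Size is reduced to the 4 bytes that go on the stack.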
2252 unsigned ByValRegBegin = Reg; 2253 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2254 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2255 // Note, first register is allocated in the beginning of function already, 2256 // allocate remained amount of registers we need. 2257 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2258 State->AllocateReg(GPRArgRegs); 2259 // A byval parameter that is split between registers and memory needs its 2260 // size truncated here. 2261 // In the case where the entire structure fits in registers, we set the 2262 // size in memory to zero. 2263 Size = std::max<int>(Size - Excess, 0); 2264 } 2265 2266 /// MatchingStackOffset - Return true if the given stack call argument is 2267 /// already available in the same position (relatively) of the caller's 2268 /// incoming argument stack. 2269 static 2270 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2271 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2272 const TargetInstrInfo *TII) { 2273 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2274 int FI = std::numeric_limits<int>::max(); 2275 if (Arg.getOpcode() == ISD::CopyFromReg) { 2276 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2277 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2278 return false; 2279 MachineInstr *Def = MRI->getVRegDef(VR); 2280 if (!Def) 2281 return false; 2282 if (!Flags.isByVal()) { 2283 if (!TII->isLoadFromStackSlot(*Def, FI)) 2284 return false; 2285 } else { 2286 return false; 2287 } 2288 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2289 if (Flags.isByVal()) 2290 // ByVal argument is passed in as a pointer but it's now being 2291 // dereferenced. e.g. 2292 // define @foo(%struct.X* %A) { 2293 // tail call @bar(%struct.X* byval %A) 2294 // } 2295 return false; 2296 SDValue Ptr = Ld->getBasePtr(); 2297 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2298 if (!FINode) 2299 return false; 2300 FI = FINode->getIndex(); 2301 } else 2302 return false; 2303 2304 assert(FI != std::numeric_limits<int>::max()); 2305 if (!MFI.isFixedObjectIndex(FI)) 2306 return false; 2307 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2308 } 2309 2310 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2311 /// for tail call optimization. Targets which want to do tail call 2312 /// optimization should implement this function. 2313 bool 2314 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2315 CallingConv::ID CalleeCC, 2316 bool isVarArg, 2317 bool isCalleeStructRet, 2318 bool isCallerStructRet, 2319 const SmallVectorImpl<ISD::OutputArg> &Outs, 2320 const SmallVectorImpl<SDValue> &OutVals, 2321 const SmallVectorImpl<ISD::InputArg> &Ins, 2322 SelectionDAG& DAG) const { 2323 MachineFunction &MF = DAG.getMachineFunction(); 2324 const Function &CallerF = MF.getFunction(); 2325 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2326 2327 assert(Subtarget->supportsTailCall()); 2328 2329 // Tail calls to function pointers cannot be optimized for Thumb1 if the args 2330 // to the call take up r0-r3. The reason is that there are no legal registers 2331 // left to hold the pointer to the function to be called. 2332 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2333 !isa<GlobalAddressSDNode>(Callee.getNode())) 2334 return false; 2335 2336 // Look for obvious safe cases to perform tail call optimization that do not 2337 // require ABI changes. This is what gcc calls sibcall. 
2338 2339 // Exception-handling functions need a special set of instructions to indicate 2340 // a return to the hardware. Tail-calling another function would probably 2341 // break this. 2342 if (CallerF.hasFnAttribute("interrupt")) 2343 return false; 2344 2345 // Also avoid sibcall optimization if either caller or callee uses struct 2346 // return semantics. 2347 if (isCalleeStructRet || isCallerStructRet) 2348 return false; 2349 2350 // Externally-defined functions with weak linkage should not be 2351 // tail-called on ARM when the OS does not support dynamic 2352 // pre-emption of symbols, as the AAELF spec requires normal calls 2353 // to undefined weak functions to be replaced with a NOP or jump to the 2354 // next instruction. The behaviour of branch instructions in this 2355 // situation (as used for tail calls) is implementation-defined, so we 2356 // cannot rely on the linker replacing the tail call with a return. 2357 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2358 const GlobalValue *GV = G->getGlobal(); 2359 const Triple &TT = getTargetMachine().getTargetTriple(); 2360 if (GV->hasExternalWeakLinkage() && 2361 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2362 return false; 2363 } 2364 2365 // Check that the call results are passed in the same way. 2366 LLVMContext &C = *DAG.getContext(); 2367 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 2368 CCAssignFnForReturn(CalleeCC, isVarArg), 2369 CCAssignFnForReturn(CallerCC, isVarArg))) 2370 return false; 2371 // The callee has to preserve all registers the caller needs to preserve. 2372 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2373 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2374 if (CalleeCC != CallerCC) { 2375 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2376 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2377 return false; 2378 } 2379 2380 // If Caller's vararg or byval argument has been split between registers and 2381 // stack, do not perform tail call, since part of the argument is in caller's 2382 // local frame. 2383 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2384 if (AFI_Caller->getArgRegsSaveSize()) 2385 return false; 2386 2387 // If the callee takes no arguments then go on to check the results of the 2388 // call. 2389 if (!Outs.empty()) { 2390 // Check if stack adjustment is needed. For now, do not do this if any 2391 // argument is passed on the stack. 2392 SmallVector<CCValAssign, 16> ArgLocs; 2393 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2394 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2395 if (CCInfo.getNextStackOffset()) { 2396 // Check if the arguments are already laid out in the right way as 2397 // the caller's fixed stack objects. 2398 MachineFrameInfo &MFI = MF.getFrameInfo(); 2399 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2400 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2401 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2402 i != e; 2403 ++i, ++realArgIdx) { 2404 CCValAssign &VA = ArgLocs[i]; 2405 EVT RegVT = VA.getLocVT(); 2406 SDValue Arg = OutVals[realArgIdx]; 2407 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2408 if (VA.getLocInfo() == CCValAssign::Indirect) 2409 return false; 2410 if (VA.needsCustom()) { 2411 // f64 and vector types are split into multiple registers or 2412 // register/stack-slot combinations. 
The types will not match 2413 // the registers; give up on memory f64 refs until we figure 2414 // out what to do about this. 2415 if (!VA.isRegLoc()) 2416 return false; 2417 if (!ArgLocs[++i].isRegLoc()) 2418 return false; 2419 if (RegVT == MVT::v2f64) { 2420 if (!ArgLocs[++i].isRegLoc()) 2421 return false; 2422 if (!ArgLocs[++i].isRegLoc()) 2423 return false; 2424 } 2425 } else if (!VA.isRegLoc()) { 2426 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2427 MFI, MRI, TII)) 2428 return false; 2429 } 2430 } 2431 } 2432 2433 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2434 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2435 return false; 2436 } 2437 2438 return true; 2439 } 2440 2441 bool 2442 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2443 MachineFunction &MF, bool isVarArg, 2444 const SmallVectorImpl<ISD::OutputArg> &Outs, 2445 LLVMContext &Context) const { 2446 SmallVector<CCValAssign, 16> RVLocs; 2447 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2448 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2449 } 2450 2451 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2452 const SDLoc &DL, SelectionDAG &DAG) { 2453 const MachineFunction &MF = DAG.getMachineFunction(); 2454 const Function &F = MF.getFunction(); 2455 2456 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2457 2458 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2459 // version of the "preferred return address". These offsets affect the return 2460 // instruction if this is a return from PL1 without hypervisor extensions. 2461 // IRQ/FIQ: +4 "subs pc, lr, #4" 2462 // SWI: 0 "subs pc, lr, #0" 2463 // ABORT: +4 "subs pc, lr, #4" 2464 // UNDEF: +4/+2 "subs pc, lr, #0" 2465 // UNDEF varies depending on where the exception came from ARM or Thumb 2466 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 2467 2468 int64_t LROffset; 2469 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2470 IntKind == "ABORT") 2471 LROffset = 4; 2472 else if (IntKind == "SWI" || IntKind == "UNDEF") 2473 LROffset = 0; 2474 else 2475 report_fatal_error("Unsupported interrupt attribute. If present, value " 2476 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2477 2478 RetOps.insert(RetOps.begin() + 1, 2479 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2480 2481 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2482 } 2483 2484 SDValue 2485 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2486 bool isVarArg, 2487 const SmallVectorImpl<ISD::OutputArg> &Outs, 2488 const SmallVectorImpl<SDValue> &OutVals, 2489 const SDLoc &dl, SelectionDAG &DAG) const { 2490 // CCValAssign - represent the assignment of the return value to a location. 2491 SmallVector<CCValAssign, 16> RVLocs; 2492 2493 // CCState - Info about the registers and stack slots. 2494 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2495 *DAG.getContext()); 2496 2497 // Analyze outgoing return values. 
2498 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2499 2500 SDValue Flag; 2501 SmallVector<SDValue, 4> RetOps; 2502 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2503 bool isLittleEndian = Subtarget->isLittle(); 2504 2505 MachineFunction &MF = DAG.getMachineFunction(); 2506 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2507 AFI->setReturnRegsCount(RVLocs.size()); 2508 2509 // Copy the result values into the output registers. 2510 for (unsigned i = 0, realRVLocIdx = 0; 2511 i != RVLocs.size(); 2512 ++i, ++realRVLocIdx) { 2513 CCValAssign &VA = RVLocs[i]; 2514 assert(VA.isRegLoc() && "Can only return in registers!"); 2515 2516 SDValue Arg = OutVals[realRVLocIdx]; 2517 bool ReturnF16 = false; 2518 2519 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2520 // Half-precision return values can be returned like this: 2521 // 2522 // t11 f16 = fadd ... 2523 // t12: i16 = bitcast t11 2524 // t13: i32 = zero_extend t12 2525 // t14: f32 = bitcast t13 <~~~~~~~ Arg 2526 // 2527 // to avoid code generation for bitcasts, we simply set Arg to the node 2528 // that produces the f16 value, t11 in this case. 2529 // 2530 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 2531 SDValue ZE = Arg.getOperand(0); 2532 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 2533 SDValue BC = ZE.getOperand(0); 2534 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 2535 Arg = BC.getOperand(0); 2536 ReturnF16 = true; 2537 } 2538 } 2539 } 2540 } 2541 2542 switch (VA.getLocInfo()) { 2543 default: llvm_unreachable("Unknown loc info!"); 2544 case CCValAssign::Full: break; 2545 case CCValAssign::BCvt: 2546 if (!ReturnF16) 2547 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2548 break; 2549 } 2550 2551 if (VA.needsCustom()) { 2552 if (VA.getLocVT() == MVT::v2f64) { 2553 // Extract the first half and return it in two registers. 2554 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2555 DAG.getConstant(0, dl, MVT::i32)); 2556 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 2557 DAG.getVTList(MVT::i32, MVT::i32), Half); 2558 2559 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2560 HalfGPRs.getValue(isLittleEndian ? 0 : 1), 2561 Flag); 2562 Flag = Chain.getValue(1); 2563 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2564 VA = RVLocs[++i]; // skip ahead to next loc 2565 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2566 HalfGPRs.getValue(isLittleEndian ? 1 : 0), 2567 Flag); 2568 Flag = Chain.getValue(1); 2569 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2570 VA = RVLocs[++i]; // skip ahead to next loc 2571 2572 // Extract the 2nd half and fall through to handle it as an f64 value. 2573 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2574 DAG.getConstant(1, dl, MVT::i32)); 2575 } 2576 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 2577 // available. 2578 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2579 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2580 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2581 fmrrd.getValue(isLittleEndian ? 0 : 1), 2582 Flag); 2583 Flag = Chain.getValue(1); 2584 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 2585 VA = RVLocs[++i]; // skip ahead to next loc 2586 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 2587 fmrrd.getValue(isLittleEndian ? 
1 : 0), 2588 Flag); 2589 } else 2590 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 2591 2592 // Guarantee that all emitted copies are 2593 // stuck together, avoiding something bad. 2594 Flag = Chain.getValue(1); 2595 RetOps.push_back(DAG.getRegister(VA.getLocReg(), 2596 ReturnF16 ? MVT::f16 : VA.getLocVT())); 2597 } 2598 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2599 const MCPhysReg *I = 2600 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 2601 if (I) { 2602 for (; *I; ++I) { 2603 if (ARM::GPRRegClass.contains(*I)) 2604 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 2605 else if (ARM::DPRRegClass.contains(*I)) 2606 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 2607 else 2608 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2609 } 2610 } 2611 2612 // Update chain and glue. 2613 RetOps[0] = Chain; 2614 if (Flag.getNode()) 2615 RetOps.push_back(Flag); 2616 2617 // CPUs which aren't M-class use a special sequence to return from 2618 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 2619 // though we use "subs pc, lr, #N"). 2620 // 2621 // M-class CPUs actually use a normal return sequence with a special 2622 // (hardware-provided) value in LR, so the normal code path works. 2623 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 2624 !Subtarget->isMClass()) { 2625 if (Subtarget->isThumb1Only()) 2626 report_fatal_error("interrupt attribute is not supported in Thumb1"); 2627 return LowerInterruptReturn(RetOps, dl, DAG); 2628 } 2629 2630 return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); 2631 } 2632 2633 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 2634 if (N->getNumValues() != 1) 2635 return false; 2636 if (!N->hasNUsesOfValue(1, 0)) 2637 return false; 2638 2639 SDValue TCChain = Chain; 2640 SDNode *Copy = *N->use_begin(); 2641 if (Copy->getOpcode() == ISD::CopyToReg) { 2642 // If the copy has a glue operand, we conservatively assume it isn't safe to 2643 // perform a tail call. 2644 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2645 return false; 2646 TCChain = Copy->getOperand(0); 2647 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 2648 SDNode *VMov = Copy; 2649 // f64 returned in a pair of GPRs. 2650 SmallPtrSet<SDNode*, 2> Copies; 2651 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2652 UI != UE; ++UI) { 2653 if (UI->getOpcode() != ISD::CopyToReg) 2654 return false; 2655 Copies.insert(*UI); 2656 } 2657 if (Copies.size() > 2) 2658 return false; 2659 2660 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 2661 UI != UE; ++UI) { 2662 SDValue UseChain = UI->getOperand(0); 2663 if (Copies.count(UseChain.getNode())) 2664 // Second CopyToReg 2665 Copy = *UI; 2666 else { 2667 // We are at the top of this chain. 2668 // If the copy has a glue operand, we conservatively assume it 2669 // isn't safe to perform a tail call. 2670 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 2671 return false; 2672 // First CopyToReg 2673 TCChain = UseChain; 2674 } 2675 } 2676 } else if (Copy->getOpcode() == ISD::BITCAST) { 2677 // f32 returned in a single GPR. 
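    // The pattern being matched here is, roughly:
    //   t1: i32 = bitcast t0                   (the f32 return value)
    //   t2: ch,glue = CopyToReg t1 -> return register
    //   t3: ch = ARMISD::RET_FLAG t2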
2678 if (!Copy->hasOneUse()) 2679 return false; 2680 Copy = *Copy->use_begin(); 2681 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 2682 return false; 2683 // If the copy has a glue operand, we conservatively assume it isn't safe to 2684 // perform a tail call. 2685 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 2686 return false; 2687 TCChain = Copy->getOperand(0); 2688 } else { 2689 return false; 2690 } 2691 2692 bool HasRet = false; 2693 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 2694 UI != UE; ++UI) { 2695 if (UI->getOpcode() != ARMISD::RET_FLAG && 2696 UI->getOpcode() != ARMISD::INTRET_FLAG) 2697 return false; 2698 HasRet = true; 2699 } 2700 2701 if (!HasRet) 2702 return false; 2703 2704 Chain = TCChain; 2705 return true; 2706 } 2707 2708 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 2709 if (!Subtarget->supportsTailCall()) 2710 return false; 2711 2712 auto Attr = 2713 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); 2714 if (!CI->isTailCall() || Attr.getValueAsString() == "true") 2715 return false; 2716 2717 return true; 2718 } 2719 2720 // Trying to write a 64 bit value so need to split into two 32 bit values first, 2721 // and pass the lower and high parts through. 2722 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 2723 SDLoc DL(Op); 2724 SDValue WriteValue = Op->getOperand(2); 2725 2726 // This function is only supposed to be called for i64 type argument. 2727 assert(WriteValue.getValueType() == MVT::i64 2728 && "LowerWRITE_REGISTER called for non-i64 type argument."); 2729 2730 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2731 DAG.getConstant(0, DL, MVT::i32)); 2732 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 2733 DAG.getConstant(1, DL, MVT::i32)); 2734 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 2735 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 2736 } 2737 2738 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 2739 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 2740 // one of the above mentioned nodes. It has to be wrapped because otherwise 2741 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 2742 // be used to form addressing mode. These wrapped nodes will be selected 2743 // into MOVi. 2744 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 2745 SelectionDAG &DAG) const { 2746 EVT PtrVT = Op.getValueType(); 2747 // FIXME there is no actual debug info here 2748 SDLoc dl(Op); 2749 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2750 SDValue Res; 2751 2752 // When generating execute-only code Constant Pools must be promoted to the 2753 // global data section. It's a bit ugly that we can't share them across basic 2754 // blocks, but this way we guarantee that execute-only behaves correct with 2755 // position-independent addressing modes. 2756 if (Subtarget->genExecuteOnly()) { 2757 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 2758 auto T = const_cast<Type*>(CP->getType()); 2759 auto C = const_cast<Constant*>(CP->getConstVal()); 2760 auto M = const_cast<Module*>(DAG.getMachineFunction(). 
2761 getFunction().getParent()); 2762 auto GV = new GlobalVariable( 2763 *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, 2764 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 2765 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 2766 Twine(AFI->createPICLabelUId()) 2767 ); 2768 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 2769 dl, PtrVT); 2770 return LowerGlobalAddress(GA, DAG); 2771 } 2772 2773 if (CP->isMachineConstantPoolEntry()) 2774 Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, 2775 CP->getAlignment()); 2776 else 2777 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, 2778 CP->getAlignment()); 2779 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 2780 } 2781 2782 unsigned ARMTargetLowering::getJumpTableEncoding() const { 2783 return MachineJumpTableInfo::EK_Inline; 2784 } 2785 2786 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 2787 SelectionDAG &DAG) const { 2788 MachineFunction &MF = DAG.getMachineFunction(); 2789 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2790 unsigned ARMPCLabelIndex = 0; 2791 SDLoc DL(Op); 2792 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2793 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 2794 SDValue CPAddr; 2795 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 2796 if (!IsPositionIndependent) { 2797 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); 2798 } else { 2799 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; 2800 ARMPCLabelIndex = AFI->createPICLabelUId(); 2801 ARMConstantPoolValue *CPV = 2802 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 2803 ARMCP::CPBlockAddress, PCAdj); 2804 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2805 } 2806 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 2807 SDValue Result = DAG.getLoad( 2808 PtrVT, DL, DAG.getEntryNode(), CPAddr, 2809 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2810 if (!IsPositionIndependent) 2811 return Result; 2812 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 2813 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 2814 } 2815 2816 /// Convert a TLS address reference into the correct sequence of loads 2817 /// and calls to compute the variable's address for Darwin, and return an 2818 /// SDValue containing the final node. 2819 2820 /// Darwin only has one TLS scheme which must be capable of dealing with the 2821 /// fully general situation, in the worst case. This means: 2822 /// + "extern __thread" declaration. 2823 /// + Defined in a possibly unknown dynamic library. 2824 /// 2825 /// The general system is that each __thread variable has a [3 x i32] descriptor 2826 /// which contains information used by the runtime to calculate the address. The 2827 /// only part of this the compiler needs to know about is the first word, which 2828 /// contains a function pointer that must be called with the address of the 2829 /// entire descriptor in "r0". 2830 /// 2831 /// Since this descriptor may be in a different unit, in general access must 2832 /// proceed along the usual ARM rules. A common sequence to produce is: 2833 /// 2834 /// movw rT1, :lower16:_var$non_lazy_ptr 2835 /// movt rT1, :upper16:_var$non_lazy_ptr 2836 /// ldr r0, [rT1] 2837 /// ldr rT2, [r0] 2838 /// blx rT2 2839 /// [...address now in r0...] 
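/// In rough pseudo-C (illustrative only), the sequence above performs:
///   i32 *desc = *_var$non_lazy_ptr;               // address of the descriptor
///   addr = ((void *(*)(void *))desc[0])(desc);    // first word is the getter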
2840 SDValue 2841 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, 2842 SelectionDAG &DAG) const { 2843 assert(Subtarget->isTargetDarwin() && 2844 "This function expects a Darwin target"); 2845 SDLoc DL(Op); 2846 2847 // The first step is to get the address of the actual global symbol. This is 2848 // where the TLS descriptor lives. 2849 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); 2850 2851 // The first entry in the descriptor is a function pointer that we must call 2852 // to obtain the address of the variable. 2853 SDValue Chain = DAG.getEntryNode(); 2854 SDValue FuncTLVGet = DAG.getLoad( 2855 MVT::i32, DL, Chain, DescAddr, 2856 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 2857 /* Alignment = */ 4, 2858 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | 2859 MachineMemOperand::MOInvariant); 2860 Chain = FuncTLVGet.getValue(1); 2861 2862 MachineFunction &F = DAG.getMachineFunction(); 2863 MachineFrameInfo &MFI = F.getFrameInfo(); 2864 MFI.setAdjustsStack(true); 2865 2866 // TLS calls preserve all registers except those that absolutely must be 2867 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be 2868 // silly). 2869 auto TRI = 2870 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); 2871 auto ARI = static_cast<const ARMRegisterInfo *>(TRI); 2872 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); 2873 2874 // Finally, we can make the call. This is just a degenerate version of a 2875 // normal call node: r0 takes the address of the descriptor, and the call 2876 // returns the address of the variable in this thread. 2877 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 2878 Chain = 2879 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 2880 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 2881 DAG.getRegisterMask(Mask), Chain.getValue(1)); 2882 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 2883 } 2884 2885 SDValue 2886 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 2887 SelectionDAG &DAG) const { 2888 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 2889 2890 SDValue Chain = DAG.getEntryNode(); 2891 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2892 SDLoc DL(Op); 2893 2894 // Load the current TEB (thread environment block). 2895 SDValue Ops[] = {Chain, 2896 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 2897 DAG.getConstant(15, DL, MVT::i32), 2898 DAG.getConstant(0, DL, MVT::i32), 2899 DAG.getConstant(13, DL, MVT::i32), 2900 DAG.getConstant(0, DL, MVT::i32), 2901 DAG.getConstant(2, DL, MVT::i32)}; 2902 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 2903 DAG.getVTList(MVT::i32, MVT::Other), Ops); 2904 2905 SDValue TEB = CurrentTEB.getValue(0); 2906 Chain = CurrentTEB.getValue(1); 2907 2908 // Load the ThreadLocalStoragePointer from the TEB. 2909 // A pointer to the TLS array is located at offset 0x2c from the TEB. 2910 SDValue TLSArray = 2911 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 2912 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 2913 2914 // The pointer to the thread's TLS data area is at an offset of TLS Index * 4 2915 // into the TLSArray.
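// Roughly: ThreadData = *(TLSArray + _tls_index * 4); the variable's address is
// then ThreadData plus its SECREL offset within the .tls section, which is
// loaded from the constant pool below.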
2916 2917 // Load the TLS index from the C runtime 2918 SDValue TLSIndex = 2919 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 2920 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 2921 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 2922 2923 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 2924 DAG.getConstant(2, DL, MVT::i32)); 2925 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 2926 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 2927 MachinePointerInfo()); 2928 2929 // Get the offset of the start of the .tls section (section base) 2930 const auto *GA = cast<GlobalAddressSDNode>(Op); 2931 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 2932 SDValue Offset = DAG.getLoad( 2933 PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 2934 DAG.getTargetConstantPool(CPV, PtrVT, 4)), 2935 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2936 2937 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 2938 } 2939 2940 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 2941 SDValue 2942 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 2943 SelectionDAG &DAG) const { 2944 SDLoc dl(GA); 2945 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2946 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 2947 MachineFunction &MF = DAG.getMachineFunction(); 2948 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2949 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2950 ARMConstantPoolValue *CPV = 2951 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 2952 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 2953 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); 2954 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 2955 Argument = DAG.getLoad( 2956 PtrVT, dl, DAG.getEntryNode(), Argument, 2957 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2958 SDValue Chain = Argument.getValue(1); 2959 2960 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2961 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 2962 2963 // call __tls_get_addr. 2964 ArgListTy Args; 2965 ArgListEntry Entry; 2966 Entry.Node = Argument; 2967 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 2968 Args.push_back(Entry); 2969 2970 // FIXME: is there useful debug info available here? 2971 TargetLowering::CallLoweringInfo CLI(DAG); 2972 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 2973 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 2974 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 2975 2976 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2977 return CallResult.first; 2978 } 2979 2980 // Lower ISD::GlobalTLSAddress using the "initial exec" or 2981 // "local exec" model. 2982 SDValue 2983 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 2984 SelectionDAG &DAG, 2985 TLSModel::Model model) const { 2986 const GlobalValue *GV = GA->getGlobal(); 2987 SDLoc dl(GA); 2988 SDValue Offset; 2989 SDValue Chain = DAG.getEntryNode(); 2990 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2991 // Get the Thread Pointer 2992 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 2993 2994 if (model == TLSModel::InitialExec) { 2995 MachineFunction &MF = DAG.getMachineFunction(); 2996 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2997 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2998 // Initial exec model. 
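// The constant-pool entry below holds a GOTTPOFF reference; the PIC add yields
// the address of the GOT slot, and a second load reads the variable's offset
// from the thread pointer, which is added to TP at the end of this function.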
2999 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3000 ARMConstantPoolValue *CPV = 3001 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3002 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3003 true); 3004 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3005 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3006 Offset = DAG.getLoad( 3007 PtrVT, dl, Chain, Offset, 3008 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3009 Chain = Offset.getValue(1); 3010 3011 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3012 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3013 3014 Offset = DAG.getLoad( 3015 PtrVT, dl, Chain, Offset, 3016 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3017 } else { 3018 // local exec model 3019 assert(model == TLSModel::LocalExec); 3020 ARMConstantPoolValue *CPV = 3021 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3022 Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3023 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3024 Offset = DAG.getLoad( 3025 PtrVT, dl, Chain, Offset, 3026 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3027 } 3028 3029 // The address of the thread local variable is the add of the thread 3030 // pointer with the offset of the variable. 3031 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3032 } 3033 3034 SDValue 3035 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3036 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3037 if (DAG.getTarget().useEmulatedTLS()) 3038 return LowerToTLSEmulatedModel(GA, DAG); 3039 3040 if (Subtarget->isTargetDarwin()) 3041 return LowerGlobalTLSAddressDarwin(Op, DAG); 3042 3043 if (Subtarget->isTargetWindows()) 3044 return LowerGlobalTLSAddressWindows(Op, DAG); 3045 3046 // TODO: implement the "local dynamic" model 3047 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3048 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3049 3050 switch (model) { 3051 case TLSModel::GeneralDynamic: 3052 case TLSModel::LocalDynamic: 3053 return LowerToTLSGeneralDynamicModel(GA, DAG); 3054 case TLSModel::InitialExec: 3055 case TLSModel::LocalExec: 3056 return LowerToTLSExecModels(GA, DAG, model); 3057 } 3058 llvm_unreachable("bogus TLS model"); 3059 } 3060 3061 /// Return true if all users of V are within function F, looking through 3062 /// ConstantExprs. 3063 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3064 SmallVector<const User*,4> Worklist; 3065 for (auto *U : V->users()) 3066 Worklist.push_back(U); 3067 while (!Worklist.empty()) { 3068 auto *U = Worklist.pop_back_val(); 3069 if (isa<ConstantExpr>(U)) { 3070 for (auto *UU : U->users()) 3071 Worklist.push_back(UU); 3072 continue; 3073 } 3074 3075 auto *I = dyn_cast<Instruction>(U); 3076 if (!I || I->getParent()->getParent() != F) 3077 return false; 3078 } 3079 return true; 3080 } 3081 3082 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3083 const GlobalValue *GV, SelectionDAG &DAG, 3084 EVT PtrVT, const SDLoc &dl) { 3085 // If we're creating a pool entry for a constant global with unnamed address, 3086 // and the global is small enough, we can emit it inline into the constant pool 3087 // to save ourselves an indirection. 
3088 // 3089 // This is a win if the constant is only used in one function (so it doesn't 3090 // need to be duplicated) or duplicating the constant wouldn't increase code 3091 // size (implying the constant is no larger than 4 bytes). 3092 const Function &F = DAG.getMachineFunction().getFunction(); 3093 3094 // We rely on this decision to inline being idempotent and unrelated to the 3095 // use-site. We know that if we inline a variable at one use site, we'll 3096 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3097 // doesn't know about this optimization, so bail out if it's enabled, else 3098 // we could decide to inline here (and thus never emit the GV) but require 3099 // the GV from fast-isel generated code. 3100 if (!EnableConstpoolPromotion || 3101 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3102 return SDValue(); 3103 3104 auto *GVar = dyn_cast<GlobalVariable>(GV); 3105 if (!GVar || !GVar->hasInitializer() || 3106 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3107 !GVar->hasLocalLinkage()) 3108 return SDValue(); 3109 3110 // If we inline a value that contains relocations, we move the relocations 3111 // from .data to .text. This is not allowed in position-independent code. 3112 auto *Init = GVar->getInitializer(); 3113 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3114 Init->needsRelocation()) 3115 return SDValue(); 3116 3117 // The constant islands pass can only really deal with alignment requests 3118 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3119 // any type wanting greater alignment requirements than 4 bytes. We also 3120 // can only promote constants that are multiples of 4 bytes in size or 3121 // are paddable to a multiple of 4. Currently we only try to pad constants 3122 // that are strings for simplicity. 3123 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3124 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3125 unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); 3126 unsigned RequiredPadding = 4 - (Size % 4); 3127 bool PaddingPossible = 3128 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3129 if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || 3130 Size == 0) 3131 return SDValue(); 3132 3133 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3134 MachineFunction &MF = DAG.getMachineFunction(); 3135 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3136 3137 // We can't bloat the constant pool too much, else the ConstantIslands pass 3138 // may fail to converge. If we haven't promoted this global yet (it may have 3139 // multiple uses), and promoting it would increase the constant pool size (Sz 3140 // > 4), ensure we have space to do so up to MaxTotal. 3141 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3142 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3143 ConstpoolPromotionMaxTotal) 3144 return SDValue(); 3145 3146 // This is only valid if all users are in a single function; we can't clone 3147 // the constant in general. The LLVM IR unnamed_addr allows merging 3148 // constants, but not cloning them. 3149 // 3150 // We could potentially allow cloning if we could prove all uses of the 3151 // constant in the current function don't care about the address, like 3152 // printf format strings. But that isn't implemented for now.
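// (This is also why unnamed_addr is required above: it guarantees the address
// of the global is not significant, so substituting a constant-pool copy is
// observationally equivalent.)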
3153 if (!allUsersAreInFunction(GVar, &F)) 3154 return SDValue(); 3155 3156 // We're going to inline this global. Pad it out if needed. 3157 if (RequiredPadding != 4) { 3158 StringRef S = CDAInit->getAsString(); 3159 3160 SmallVector<uint8_t,16> V(S.size()); 3161 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3162 while (RequiredPadding--) 3163 V.push_back(0); 3164 Init = ConstantDataArray::get(*DAG.getContext(), V); 3165 } 3166 3167 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3168 SDValue CPAddr = 3169 DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); 3170 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3171 AFI->markGlobalAsPromotedToConstantPool(GVar); 3172 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3173 PaddedSize - 4); 3174 } 3175 ++NumConstpoolPromoted; 3176 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3177 } 3178 3179 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3180 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3181 if (!(GV = GA->getBaseObject())) 3182 return false; 3183 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3184 return V->isConstant(); 3185 return isa<Function>(GV); 3186 } 3187 3188 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3189 SelectionDAG &DAG) const { 3190 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3191 default: llvm_unreachable("unknown object format"); 3192 case Triple::COFF: 3193 return LowerGlobalAddressWindows(Op, DAG); 3194 case Triple::ELF: 3195 return LowerGlobalAddressELF(Op, DAG); 3196 case Triple::MachO: 3197 return LowerGlobalAddressDarwin(Op, DAG); 3198 } 3199 } 3200 3201 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3202 SelectionDAG &DAG) const { 3203 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3204 SDLoc dl(Op); 3205 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3206 const TargetMachine &TM = getTargetMachine(); 3207 bool IsRO = isReadOnly(GV); 3208 3209 // promoteToConstantPool only if not generating XO text section 3210 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) 3211 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3212 return V; 3213 3214 if (isPositionIndependent()) { 3215 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3216 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3217 UseGOT_PREL ? ARMII::MO_GOT : 0); 3218 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3219 if (UseGOT_PREL) 3220 Result = 3221 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3222 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3223 return Result; 3224 } else if (Subtarget->isROPI() && IsRO) { 3225 // PC-relative. 3226 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3227 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3228 return Result; 3229 } else if (Subtarget->isRWPI() && !IsRO) { 3230 // SB-relative. 
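// RWPI: read-write data is addressed relative to the static base register
// (SB, i.e. R9). Materialize the SB-relative offset below, either with a
// movw/movt pair or a literal-pool load, and add it to R9.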
3231 SDValue RelAddr; 3232 if (Subtarget->useMovt()) { 3233 ++NumMovwMovt; 3234 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3235 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3236 } else { // use literal pool for address constant 3237 ARMConstantPoolValue *CPV = 3238 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3239 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3240 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3241 RelAddr = DAG.getLoad( 3242 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3243 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3244 } 3245 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3246 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3247 return Result; 3248 } 3249 3250 // If we have T2 ops, we can materialize the address directly via movt/movw 3251 // pair. This is always cheaper. 3252 if (Subtarget->useMovt()) { 3253 ++NumMovwMovt; 3254 // FIXME: Once remat is capable of dealing with instructions with register 3255 // operands, expand this into two nodes. 3256 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3257 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3258 } else { 3259 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); 3260 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3261 return DAG.getLoad( 3262 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3263 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3264 } 3265 } 3266 3267 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3268 SelectionDAG &DAG) const { 3269 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3270 "ROPI/RWPI not currently supported for Darwin"); 3271 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3272 SDLoc dl(Op); 3273 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3274 3275 if (Subtarget->useMovt()) 3276 ++NumMovwMovt; 3277 3278 // FIXME: Once remat is capable of dealing with instructions with register 3279 // operands, expand this into multiple nodes 3280 unsigned Wrapper = 3281 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; 3282 3283 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3284 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3285 3286 if (Subtarget->isGVIndirectSymbol(GV)) 3287 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3288 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3289 return Result; 3290 } 3291 3292 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3293 SelectionDAG &DAG) const { 3294 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3295 assert(Subtarget->useMovt() && 3296 "Windows on ARM expects to use movw/movt"); 3297 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3298 "ROPI/RWPI not currently supported for Windows"); 3299 3300 const TargetMachine &TM = getTargetMachine(); 3301 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3302 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3303 if (GV->hasDLLImportStorageClass()) 3304 TargetFlags = ARMII::MO_DLLIMPORT; 3305 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3306 TargetFlags = ARMII::MO_COFFSTUB; 3307 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3308 SDValue Result; 3309 SDLoc DL(Op); 3310 3311 ++NumMovwMovt; 3312 3313 // FIXME: Once remat is capable of dealing with instructions with register 3314 // operands, expand this into two nodes. 
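// For dllimport globals (and non-DSO-local globals accessed via a COFF stub),
// the movw/movt pair materializes the address of a pointer slot rather than of
// the variable itself, so an extra load is emitted below.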
3315 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3316 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, 3317 TargetFlags)); 3318 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3319 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3320 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3321 return Result; 3322 } 3323 3324 SDValue 3325 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3326 SDLoc dl(Op); 3327 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3328 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3329 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3330 Op.getOperand(1), Val); 3331 } 3332 3333 SDValue 3334 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3335 SDLoc dl(Op); 3336 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3337 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3338 } 3339 3340 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3341 SelectionDAG &DAG) const { 3342 SDLoc dl(Op); 3343 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3344 Op.getOperand(0)); 3345 } 3346 3347 SDValue 3348 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3349 const ARMSubtarget *Subtarget) const { 3350 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3351 SDLoc dl(Op); 3352 switch (IntNo) { 3353 default: return SDValue(); // Don't custom lower most intrinsics. 3354 case Intrinsic::thread_pointer: { 3355 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3356 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3357 } 3358 case Intrinsic::eh_sjlj_lsda: { 3359 MachineFunction &MF = DAG.getMachineFunction(); 3360 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3361 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3362 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3363 SDValue CPAddr; 3364 bool IsPositionIndependent = isPositionIndependent(); 3365 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3366 ARMConstantPoolValue *CPV = 3367 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3368 ARMCP::CPLSDA, PCAdj); 3369 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); 3370 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3371 SDValue Result = DAG.getLoad( 3372 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3373 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3374 3375 if (IsPositionIndependent) { 3376 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3377 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3378 } 3379 return Result; 3380 } 3381 case Intrinsic::arm_neon_vabs: 3382 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3383 Op.getOperand(1)); 3384 case Intrinsic::arm_neon_vmulls: 3385 case Intrinsic::arm_neon_vmullu: { 3386 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3387 ? ARMISD::VMULLs : ARMISD::VMULLu; 3388 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3389 Op.getOperand(1), Op.getOperand(2)); 3390 } 3391 case Intrinsic::arm_neon_vminnm: 3392 case Intrinsic::arm_neon_vmaxnm: { 3393 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3394 ? 
ISD::FMINNUM : ISD::FMAXNUM; 3395 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3396 Op.getOperand(1), Op.getOperand(2)); 3397 } 3398 case Intrinsic::arm_neon_vminu: 3399 case Intrinsic::arm_neon_vmaxu: { 3400 if (Op.getValueType().isFloatingPoint()) 3401 return SDValue(); 3402 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3403 ? ISD::UMIN : ISD::UMAX; 3404 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3405 Op.getOperand(1), Op.getOperand(2)); 3406 } 3407 case Intrinsic::arm_neon_vmins: 3408 case Intrinsic::arm_neon_vmaxs: { 3409 // v{min,max}s is overloaded between signed integers and floats. 3410 if (!Op.getValueType().isFloatingPoint()) { 3411 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3412 ? ISD::SMIN : ISD::SMAX; 3413 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3414 Op.getOperand(1), Op.getOperand(2)); 3415 } 3416 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3417 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3418 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3419 Op.getOperand(1), Op.getOperand(2)); 3420 } 3421 case Intrinsic::arm_neon_vtbl1: 3422 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3423 Op.getOperand(1), Op.getOperand(2)); 3424 case Intrinsic::arm_neon_vtbl2: 3425 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3426 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3427 } 3428 } 3429 3430 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 3431 const ARMSubtarget *Subtarget) { 3432 SDLoc dl(Op); 3433 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 3434 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 3435 if (SSID == SyncScope::SingleThread) 3436 return Op; 3437 3438 if (!Subtarget->hasDataBarrier()) { 3439 // Some ARMv6 CPUs can support data barriers with an mcr instruction. 3440 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 3441 // here. 3442 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 3443 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 3444 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 3445 DAG.getConstant(0, dl, MVT::i32)); 3446 } 3447 3448 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 3449 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 3450 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 3451 if (Subtarget->isMClass()) { 3452 // Only a full system barrier exists in the M-class architectures. 3453 Domain = ARM_MB::SY; 3454 } else if (Subtarget->preferISHSTBarriers() && 3455 Ord == AtomicOrdering::Release) { 3456 // Swift happens to implement ISHST barriers in a way that's compatible with 3457 // Release semantics but weaker than ISH, so we'd be fools not to use 3458 // it. Beware: other processors probably don't! 3459 Domain = ARM_MB::ISHST; 3460 } 3461 3462 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 3463 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 3464 DAG.getConstant(Domain, dl, MVT::i32)); 3465 } 3466 3467 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 3468 const ARMSubtarget *Subtarget) { 3469 // ARM pre-v5TE and Thumb1 do not have preload instructions. 3470 if (!(Subtarget->isThumb2() || 3471 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 3472 // Just preserve the chain.
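// (i.e. drop the prefetch node entirely and just forward its incoming chain)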
3473 return Op.getOperand(0); 3474 3475 SDLoc dl(Op); 3476 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 3477 if (!isRead && 3478 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 3479 // ARMv7 with MP extension has PLDW. 3480 return Op.getOperand(0); 3481 3482 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 3483 if (Subtarget->isThumb()) { 3484 // Invert the bits. 3485 isRead = ~isRead & 1; 3486 isData = ~isData & 1; 3487 } 3488 3489 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 3490 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 3491 DAG.getConstant(isData, dl, MVT::i32)); 3492 } 3493 3494 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 3495 MachineFunction &MF = DAG.getMachineFunction(); 3496 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 3497 3498 // vastart just stores the address of the VarArgsFrameIndex slot into the 3499 // memory location argument. 3500 SDLoc dl(Op); 3501 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 3502 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3503 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3504 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3505 MachinePointerInfo(SV)); 3506 } 3507 3508 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 3509 CCValAssign &NextVA, 3510 SDValue &Root, 3511 SelectionDAG &DAG, 3512 const SDLoc &dl) const { 3513 MachineFunction &MF = DAG.getMachineFunction(); 3514 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3515 3516 const TargetRegisterClass *RC; 3517 if (AFI->isThumb1OnlyFunction()) 3518 RC = &ARM::tGPRRegClass; 3519 else 3520 RC = &ARM::GPRRegClass; 3521 3522 // Transform the arguments stored in physical registers into virtual ones. 3523 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3524 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3525 3526 SDValue ArgValue2; 3527 if (NextVA.isMemLoc()) { 3528 MachineFrameInfo &MFI = MF.getFrameInfo(); 3529 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 3530 3531 // Create load node to retrieve arguments from the stack. 3532 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3533 ArgValue2 = DAG.getLoad( 3534 MVT::i32, dl, Root, FIN, 3535 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 3536 } else { 3537 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 3538 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 3539 } 3540 if (!Subtarget->isLittle()) 3541 std::swap (ArgValue, ArgValue2); 3542 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 3543 } 3544 3545 // The remaining GPRs hold either the beginning of variable-argument 3546 // data, or the beginning of an aggregate passed by value (usually 3547 // byval). Either way, we allocate stack slots adjacent to the data 3548 // provided by our caller, and store the unallocated registers there. 3549 // If this is a variadic function, the va_list pointer will begin with 3550 // these values; otherwise, this reassembles a (byval) structure that 3551 // was split between registers and memory. 3552 // Return: The frame index registers were stored into. 
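// For example (illustrative): a 12-byte byval split as r2..r3 plus 4 bytes of
// stack gets R2 and R3 stored at negative offsets directly below the
// caller-provided stack word, re-forming one contiguous 12-byte object.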
3553 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 3554 const SDLoc &dl, SDValue &Chain, 3555 const Value *OrigArg, 3556 unsigned InRegsParamRecordIdx, 3557 int ArgOffset, unsigned ArgSize) const { 3558 // Currently, two use-cases are possible: 3559 // Case #1. Non-var-args function, and we meet the first byval parameter. 3560 // Set up the first unallocated register as the first byval register; 3561 // eat all remaining registers 3562 // (these two actions are performed by the HandleByVal method). 3563 // Then, here, we initialize the stack frame with 3564 // "store-reg" instructions. 3565 // Case #2. Var-args function that doesn't contain byval parameters. 3566 // The same: eat all remaining unallocated registers and 3567 // initialize the stack frame. 3568 3569 MachineFunction &MF = DAG.getMachineFunction(); 3570 MachineFrameInfo &MFI = MF.getFrameInfo(); 3571 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3572 unsigned RBegin, REnd; 3573 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 3574 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 3575 } else { 3576 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3577 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 3578 REnd = ARM::R4; 3579 } 3580 3581 if (REnd != RBegin) 3582 ArgOffset = -4 * (ARM::R4 - RBegin); 3583 3584 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3585 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 3586 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 3587 3588 SmallVector<SDValue, 4> MemOps; 3589 const TargetRegisterClass *RC = 3590 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 3591 3592 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 3593 unsigned VReg = MF.addLiveIn(Reg, RC); 3594 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 3595 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3596 MachinePointerInfo(OrigArg, 4 * i)); 3597 MemOps.push_back(Store); 3598 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 3599 } 3600 3601 if (!MemOps.empty()) 3602 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3603 return FrameIndex; 3604 } 3605 3606 // Set up the stack frame that the va_list pointer will start from. 3607 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 3608 const SDLoc &dl, SDValue &Chain, 3609 unsigned ArgOffset, 3610 unsigned TotalArgRegsSaveSize, 3611 bool ForceMutable) const { 3612 MachineFunction &MF = DAG.getMachineFunction(); 3613 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3614 3615 // Try to store any remaining integer argument regs 3616 // to their spots on the stack so that they may be loaded by dereferencing 3617 // the result of va_next. 3618 // If there are no regs to be stored, just point the address after the last 3619 // argument passed via the stack.
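// Note that OrigArg is null here: there is no single IR argument backing the
// varargs register-save area.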
3620 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 3621 CCInfo.getInRegsParamsCount(), 3622 CCInfo.getNextStackOffset(), 4); 3623 AFI->setVarArgsFrameIndex(FrameIndex); 3624 } 3625 3626 SDValue ARMTargetLowering::LowerFormalArguments( 3627 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3628 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3629 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3630 MachineFunction &MF = DAG.getMachineFunction(); 3631 MachineFrameInfo &MFI = MF.getFrameInfo(); 3632 3633 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3634 3635 // Assign locations to all of the incoming arguments. 3636 SmallVector<CCValAssign, 16> ArgLocs; 3637 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3638 *DAG.getContext()); 3639 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 3640 3641 SmallVector<SDValue, 16> ArgValues; 3642 SDValue ArgValue; 3643 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3644 unsigned CurArgIdx = 0; 3645 3646 // Initially ArgRegsSaveSize is zero. 3647 // Then we increase this value each time we meet byval parameter. 3648 // We also increase this value in case of varargs function. 3649 AFI->setArgRegsSaveSize(0); 3650 3651 // Calculate the amount of stack space that we need to allocate to store 3652 // byval and variadic arguments that are passed in registers. 3653 // We need to know this before we allocate the first byval or variadic 3654 // argument, as they will be allocated a stack slot below the CFA (Canonical 3655 // Frame Address, the stack pointer at entry to the function). 3656 unsigned ArgRegBegin = ARM::R4; 3657 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3658 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 3659 break; 3660 3661 CCValAssign &VA = ArgLocs[i]; 3662 unsigned Index = VA.getValNo(); 3663 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 3664 if (!Flags.isByVal()) 3665 continue; 3666 3667 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 3668 unsigned RBegin, REnd; 3669 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 3670 ArgRegBegin = std::min(ArgRegBegin, RBegin); 3671 3672 CCInfo.nextInRegsParam(); 3673 } 3674 CCInfo.rewindByValRegsInfo(); 3675 3676 int lastInsIndex = -1; 3677 if (isVarArg && MFI.hasVAStart()) { 3678 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 3679 if (RegIdx != array_lengthof(GPRArgRegs)) 3680 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 3681 } 3682 3683 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 3684 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 3685 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3686 3687 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3688 CCValAssign &VA = ArgLocs[i]; 3689 if (Ins[VA.getValNo()].isOrigArg()) { 3690 std::advance(CurOrigArg, 3691 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 3692 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 3693 } 3694 // Arguments stored in registers. 3695 if (VA.isRegLoc()) { 3696 EVT RegVT = VA.getLocVT(); 3697 3698 if (VA.needsCustom()) { 3699 // f64 and vector types are split up into multiple registers or 3700 // combinations of registers and stack slots. 
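// A v2f64 argument arrives as two f64 halves; each half is either a GPR pair
// (reassembled with VMOVDRR in GetF64FormalArgument) or a stack slot, and the
// two halves are combined with INSERT_VECTOR_ELT below.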
3701 if (VA.getLocVT() == MVT::v2f64) { 3702 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], 3703 Chain, DAG, dl); 3704 VA = ArgLocs[++i]; // skip ahead to next loc 3705 SDValue ArgValue2; 3706 if (VA.isMemLoc()) { 3707 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 3708 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3709 ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, 3710 MachinePointerInfo::getFixedStack( 3711 DAG.getMachineFunction(), FI)); 3712 } else { 3713 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], 3714 Chain, DAG, dl); 3715 } 3716 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 3717 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3718 ArgValue, ArgValue1, 3719 DAG.getIntPtrConstant(0, dl)); 3720 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, 3721 ArgValue, ArgValue2, 3722 DAG.getIntPtrConstant(1, dl)); 3723 } else 3724 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 3725 } else { 3726 const TargetRegisterClass *RC; 3727 3728 3729 if (RegVT == MVT::f16) 3730 RC = &ARM::HPRRegClass; 3731 else if (RegVT == MVT::f32) 3732 RC = &ARM::SPRRegClass; 3733 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) 3734 RC = &ARM::DPRRegClass; 3735 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) 3736 RC = &ARM::QPRRegClass; 3737 else if (RegVT == MVT::i32) 3738 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 3739 : &ARM::GPRRegClass; 3740 else 3741 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3742 3743 // Transform the arguments in physical registers into virtual ones. 3744 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3745 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 3746 } 3747 3748 // If this is an 8 or 16-bit value, it is really passed promoted 3749 // to 32 bits. Insert an assert[sz]ext to capture this, then 3750 // truncate to the right size. 3751 switch (VA.getLocInfo()) { 3752 default: llvm_unreachable("Unknown loc info!"); 3753 case CCValAssign::Full: break; 3754 case CCValAssign::BCvt: 3755 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 3756 break; 3757 case CCValAssign::SExt: 3758 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 3759 DAG.getValueType(VA.getValVT())); 3760 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3761 break; 3762 case CCValAssign::ZExt: 3763 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 3764 DAG.getValueType(VA.getValVT())); 3765 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 3766 break; 3767 } 3768 3769 InVals.push_back(ArgValue); 3770 } else { // VA.isRegLoc() 3771 // sanity check 3772 assert(VA.isMemLoc()); 3773 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 3774 3775 int index = VA.getValNo(); 3776 3777 // Some Ins[] entries become multiple ArgLoc[] entries. 3778 // Process them only once. 3779 if (index != lastInsIndex) 3780 { 3781 ISD::ArgFlagsTy Flags = Ins[index].Flags; 3782 // FIXME: For now, all byval parameter objects are marked mutable. 3783 // This can be changed with more analysis. 3784 // In case of tail call optimization mark all arguments mutable. 3785 // Since they could be overwritten by lowering of arguments in case of 3786 // a tail call. 
3787 if (Flags.isByVal()) { 3788 assert(Ins[index].isOrigArg() && 3789 "Byval arguments cannot be implicit"); 3790 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 3791 3792 int FrameIndex = StoreByValRegs( 3793 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 3794 VA.getLocMemOffset(), Flags.getByValSize()); 3795 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 3796 CCInfo.nextInRegsParam(); 3797 } else { 3798 unsigned FIOffset = VA.getLocMemOffset(); 3799 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 3800 FIOffset, true); 3801 3802 // Create load nodes to retrieve arguments from the stack. 3803 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3804 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 3805 MachinePointerInfo::getFixedStack( 3806 DAG.getMachineFunction(), FI))); 3807 } 3808 lastInsIndex = index; 3809 } 3810 } 3811 } 3812 3813 // varargs 3814 if (isVarArg && MFI.hasVAStart()) 3815 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 3816 CCInfo.getNextStackOffset(), 3817 TotalArgRegsSaveSize); 3818 3819 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 3820 3821 return Chain; 3822 } 3823 3824 /// isFloatingPointZero - Return true if this is +0.0. 3825 static bool isFloatingPointZero(SDValue Op) { 3826 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 3827 return CFP->getValueAPF().isPosZero(); 3828 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 3829 // Maybe this has already been legalized into the constant pool? 3830 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 3831 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 3832 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 3833 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 3834 return CFP->getValueAPF().isPosZero(); 3835 } 3836 } else if (Op->getOpcode() == ISD::BITCAST && 3837 Op->getValueType(0) == MVT::f64) { 3838 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 3839 // created by LowerConstantFP(). 3840 SDValue BitcastOp = Op->getOperand(0); 3841 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 3842 isNullConstant(BitcastOp->getOperand(0))) 3843 return true; 3844 } 3845 return false; 3846 } 3847 3848 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 3849 /// the given operands. 3850 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3851 SDValue &ARMcc, SelectionDAG &DAG, 3852 const SDLoc &dl) const { 3853 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3854 unsigned C = RHSC->getZExtValue(); 3855 if (!isLegalICmpImmediate((int32_t)C)) { 3856 // Constant does not fit, try adjusting it by one. 3857 switch (CC) { 3858 default: break; 3859 case ISD::SETLT: 3860 case ISD::SETGE: 3861 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 3862 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3863 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3864 } 3865 break; 3866 case ISD::SETULT: 3867 case ISD::SETUGE: 3868 if (C != 0 && isLegalICmpImmediate(C-1)) { 3869 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3870 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 3871 } 3872 break; 3873 case ISD::SETLE: 3874 case ISD::SETGT: 3875 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 3876 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 3877 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3878 } 3879 break; 3880 case ISD::SETULE: 3881 case ISD::SETUGT: 3882 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 3883 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3884 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 3885 } 3886 break; 3887 } 3888 } 3889 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 3890 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 3891 // In ARM and Thumb-2, the compare instructions can shift their second 3892 // operand. 3893 CC = ISD::getSetCCSwappedOperands(CC); 3894 std::swap(LHS, RHS); 3895 } 3896 3897 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 3898 ARMISD::NodeType CompareType; 3899 switch (CondCode) { 3900 default: 3901 CompareType = ARMISD::CMP; 3902 break; 3903 case ARMCC::EQ: 3904 case ARMCC::NE: 3905 // Uses only the Z flag. 3906 CompareType = ARMISD::CMPZ; 3907 break; 3908 } 3909 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 3910 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 3911 } 3912 3913 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 3914 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 3915 SelectionDAG &DAG, const SDLoc &dl, 3916 bool InvalidOnQNaN) const { 3917 assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); 3918 SDValue Cmp; 3919 SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); 3920 if (!isFloatingPointZero(RHS)) 3921 Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); 3922 else 3923 Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); 3924 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 3925 } 3926 3927 /// duplicateCmp - Glue values can have only one use, so this function 3928 /// duplicates a comparison node. 3929 SDValue 3930 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 3931 unsigned Opc = Cmp.getOpcode(); 3932 SDLoc DL(Cmp); 3933 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 3934 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 3935 3936 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 3937 Cmp = Cmp.getOperand(0); 3938 Opc = Cmp.getOpcode(); 3939 if (Opc == ARMISD::CMPFP) 3940 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), 3941 Cmp.getOperand(1), Cmp.getOperand(2)); 3942 else { 3943 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 3944 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), 3945 Cmp.getOperand(1)); 3946 } 3947 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 3948 } 3949 3950 // This function returns three things: the arithmetic computation itself 3951 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 3952 // comparison and the condition code define the case in which the arithmetic 3953 // computation *does not* overflow. 3954 std::pair<SDValue, SDValue> 3955 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, 3956 SDValue &ARMcc) const { 3957 assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); 3958 3959 SDValue Value, OverflowCmp; 3960 SDValue LHS = Op.getOperand(0); 3961 SDValue RHS = Op.getOperand(1); 3962 SDLoc dl(Op); 3963 3964 // FIXME: We are currently always generating CMPs because we don't support 3965 // generating CMN through the backend. This is not as good as the natural 3966 // CMP case because it causes a register dependency and cannot be folded 3967 // later.
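// In every case below, ARMcc names the condition under which the operation did
// *not* overflow. For example, for ISD::UADDO, Value = LHS + RHS did not wrap
// iff Value >=u LHS, hence ARMCC::HS paired with CMP(Value, LHS).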
3968 3969 switch (Op.getOpcode()) { 3970 default: 3971 llvm_unreachable("Unknown overflow instruction!"); 3972 case ISD::SADDO: 3973 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3974 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); 3975 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3976 break; 3977 case ISD::UADDO: 3978 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3979 // We use ADDC here to correspond to its use in LowerUnsignedALUO. 3980 // We do not use it in the USUBO case as Value may not be used. 3981 Value = DAG.getNode(ARMISD::ADDC, dl, 3982 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) 3983 .getValue(0); 3984 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); 3985 break; 3986 case ISD::SSUBO: 3987 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); 3988 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3989 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3990 break; 3991 case ISD::USUBO: 3992 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); 3993 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); 3994 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); 3995 break; 3996 case ISD::UMULO: 3997 // We generate a UMUL_LOHI and then check if the high word is 0. 3998 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 3999 Value = DAG.getNode(ISD::UMUL_LOHI, dl, 4000 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4001 LHS, RHS); 4002 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4003 DAG.getConstant(0, dl, MVT::i32)); 4004 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4005 break; 4006 case ISD::SMULO: 4007 // We generate a SMUL_LOHI and then check if all the bits of the high word 4008 // are the same as the sign bit of the low word. 4009 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); 4010 Value = DAG.getNode(ISD::SMUL_LOHI, dl, 4011 DAG.getVTList(Op.getValueType(), Op.getValueType()), 4012 LHS, RHS); 4013 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), 4014 DAG.getNode(ISD::SRA, dl, Op.getValueType(), 4015 Value.getValue(0), 4016 DAG.getConstant(31, dl, MVT::i32))); 4017 Value = Value.getValue(0); // We only want the low 32 bits for the result. 4018 break; 4019 } // switch (...) 4020 4021 return std::make_pair(Value, OverflowCmp); 4022 } 4023 4024 SDValue 4025 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { 4026 // Let legalize expand this if it isn't a legal type yet. 4027 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4028 return SDValue(); 4029 4030 SDValue Value, OverflowCmp; 4031 SDValue ARMcc; 4032 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); 4033 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4034 SDLoc dl(Op); 4035 // We use 0 and 1 as false and true values. 
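// Note the operand order: ARMISD::CMOV yields its second value operand when
// ARMcc holds, and ARMcc here is the *no overflow* condition (see
// getARMXALUOOp), so the node below produces 0 on no overflow and 1 otherwise.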
4036 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 4037 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 4038 EVT VT = Op.getValueType(); 4039 4040 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, 4041 ARMcc, CCR, OverflowCmp); 4042 4043 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 4044 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4045 } 4046 4047 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, 4048 SelectionDAG &DAG) { 4049 SDLoc DL(BoolCarry); 4050 EVT CarryVT = BoolCarry.getValueType(); 4051 4052 // This converts the boolean value carry into the carry flag by doing 4053 // ARMISD::SUBC Carry, 1 4054 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, 4055 DAG.getVTList(CarryVT, MVT::i32), 4056 BoolCarry, DAG.getConstant(1, DL, CarryVT)); 4057 return Carry.getValue(1); 4058 } 4059 4060 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, 4061 SelectionDAG &DAG) { 4062 SDLoc DL(Flags); 4063 4064 // Now convert the carry flag into a boolean carry. We do this 4065 // using ARMISD:ADDE 0, 0, Carry 4066 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), 4067 DAG.getConstant(0, DL, MVT::i32), 4068 DAG.getConstant(0, DL, MVT::i32), Flags); 4069 } 4070 4071 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, 4072 SelectionDAG &DAG) const { 4073 // Let legalize expand this if it isn't a legal type yet. 4074 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 4075 return SDValue(); 4076 4077 SDValue LHS = Op.getOperand(0); 4078 SDValue RHS = Op.getOperand(1); 4079 SDLoc dl(Op); 4080 4081 EVT VT = Op.getValueType(); 4082 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 4083 SDValue Value; 4084 SDValue Overflow; 4085 switch (Op.getOpcode()) { 4086 default: 4087 llvm_unreachable("Unknown overflow instruction!"); 4088 case ISD::UADDO: 4089 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); 4090 // Convert the carry flag into a boolean value. 4091 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4092 break; 4093 case ISD::USUBO: { 4094 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); 4095 // Convert the carry flag into a boolean value. 4096 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); 4097 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow 4098 // value. So compute 1 - C. 
4099 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, 4100 DAG.getConstant(1, dl, MVT::i32), Overflow); 4101 break; 4102 } 4103 } 4104 4105 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 4106 } 4107 4108 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 4109 SDValue Cond = Op.getOperand(0); 4110 SDValue SelectTrue = Op.getOperand(1); 4111 SDValue SelectFalse = Op.getOperand(2); 4112 SDLoc dl(Op); 4113 unsigned Opc = Cond.getOpcode(); 4114 4115 if (Cond.getResNo() == 1 && 4116 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4117 Opc == ISD::USUBO)) { 4118 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4119 return SDValue(); 4120 4121 SDValue Value, OverflowCmp; 4122 SDValue ARMcc; 4123 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4124 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4125 EVT VT = Op.getValueType(); 4126 4127 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, 4128 OverflowCmp, DAG); 4129 } 4130 4131 // Convert: 4132 // 4133 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) 4134 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) 4135 // 4136 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { 4137 const ConstantSDNode *CMOVTrue = 4138 dyn_cast<ConstantSDNode>(Cond.getOperand(0)); 4139 const ConstantSDNode *CMOVFalse = 4140 dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 4141 4142 if (CMOVTrue && CMOVFalse) { 4143 unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); 4144 unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); 4145 4146 SDValue True; 4147 SDValue False; 4148 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { 4149 True = SelectTrue; 4150 False = SelectFalse; 4151 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { 4152 True = SelectFalse; 4153 False = SelectTrue; 4154 } 4155 4156 if (True.getNode() && False.getNode()) { 4157 EVT VT = Op.getValueType(); 4158 SDValue ARMcc = Cond.getOperand(2); 4159 SDValue CCR = Cond.getOperand(3); 4160 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); 4161 assert(True.getValueType() == VT); 4162 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); 4163 } 4164 } 4165 } 4166 4167 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the 4168 // undefined bits before doing a full-word comparison with zero. 4169 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, 4170 DAG.getConstant(1, dl, Cond.getValueType())); 4171 4172 return DAG.getSelectCC(dl, Cond, 4173 DAG.getConstant(0, dl, Cond.getValueType()), 4174 SelectTrue, SelectFalse, ISD::SETNE); 4175 } 4176 4177 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 4178 bool &swpCmpOps, bool &swpVselOps) { 4179 // Start by selecting the GE condition code for opcodes that return true for 4180 // 'equality' 4181 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || 4182 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) 4183 CondCode = ARMCC::GE; 4184 4185 // and GT for opcodes that return false for 'equality'. 4186 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || 4187 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) 4188 CondCode = ARMCC::GT; 4189 4190 // Since we are constrained to GE/GT, if the opcode contains 'less', we need 4191 // to swap the compare operands. 
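// For example, (x SETLT y) becomes GT with swapped compare operands: we
// compare (y, x) and select on 'y > x', which is the same predicate.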
4192 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || 4193 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) 4194 swpCmpOps = true; 4195 4196 // Both GT and GE are ordered comparisons, and return false for 'unordered'. 4197 // If we have an unordered opcode, we need to swap the operands to the VSEL 4198 // instruction (effectively negating the condition). 4199 // 4200 // This also has the effect of swapping which one of 'less' or 'greater' 4201 // returns true, so we also swap the compare operands. It also switches 4202 // whether we return true for 'equality', so we compensate by picking the 4203 // opposite condition code to our original choice. 4204 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || 4205 CC == ISD::SETUGT) { 4206 swpCmpOps = !swpCmpOps; 4207 swpVselOps = !swpVselOps; 4208 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; 4209 } 4210 4211 // 'ordered' is 'anything but unordered', so use the VS condition code and 4212 // swap the VSEL operands. 4213 if (CC == ISD::SETO) { 4214 CondCode = ARMCC::VS; 4215 swpVselOps = true; 4216 } 4217 4218 // 'unordered or not equal' is 'anything but equal', so use the EQ condition 4219 // code and swap the VSEL operands. Also do this if we don't care about the 4220 // unordered case. 4221 if (CC == ISD::SETUNE || CC == ISD::SETNE) { 4222 CondCode = ARMCC::EQ; 4223 swpVselOps = true; 4224 } 4225 } 4226 4227 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, 4228 SDValue TrueVal, SDValue ARMcc, SDValue CCR, 4229 SDValue Cmp, SelectionDAG &DAG) const { 4230 if (Subtarget->isFPOnlySP() && VT == MVT::f64) { 4231 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4232 DAG.getVTList(MVT::i32, MVT::i32), FalseVal); 4233 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, 4234 DAG.getVTList(MVT::i32, MVT::i32), TrueVal); 4235 4236 SDValue TrueLow = TrueVal.getValue(0); 4237 SDValue TrueHigh = TrueVal.getValue(1); 4238 SDValue FalseLow = FalseVal.getValue(0); 4239 SDValue FalseHigh = FalseVal.getValue(1); 4240 4241 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, 4242 ARMcc, CCR, Cmp); 4243 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, 4244 ARMcc, CCR, duplicateCmp(Cmp, DAG)); 4245 4246 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); 4247 } else { 4248 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, 4249 Cmp); 4250 } 4251 } 4252 4253 static bool isGTorGE(ISD::CondCode CC) { 4254 return CC == ISD::SETGT || CC == ISD::SETGE; 4255 } 4256 4257 static bool isLTorLE(ISD::CondCode CC) { 4258 return CC == ISD::SETLT || CC == ISD::SETLE; 4259 } 4260 4261 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. 4262 // All of these conditions (and their <= and >= counterparts) will do: 4263 // x < k ? k : x 4264 // x > k ? x : k 4265 // k < x ? x : k 4266 // k > x ? k : x 4267 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, 4268 const SDValue TrueVal, const SDValue FalseVal, 4269 const ISD::CondCode CC, const SDValue K) { 4270 return (isGTorGE(CC) && 4271 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || 4272 (isLTorLE(CC) && 4273 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); 4274 } 4275 4276 // Similar to isLowerSaturate(), but checks for upper-saturating conditions. 
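// By analogy with the list above, the conditions accepted here (and their
// <= and >= counterparts) are:
//   x > k ? k : x
//   x < k ? x : k
//   k < x ? k : x
//   k > x ? x : k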
4277 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, 4278 const SDValue TrueVal, const SDValue FalseVal, 4279 const ISD::CondCode CC, const SDValue K) { 4280 return (isGTorGE(CC) && 4281 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || 4282 (isLTorLE(CC) && 4283 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); 4284 } 4285 4286 // Check if two chained conditionals could be converted into SSAT or USAT. 4287 // 4288 // SSAT can replace a set of two conditional selectors that bound a number to an 4289 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: 4290 // 4291 // x < -k ? -k : (x > k ? k : x) 4292 // x < -k ? -k : (x < k ? x : k) 4293 // x > -k ? (x > k ? k : x) : -k 4294 // x < k ? (x < -k ? -k : x) : k 4295 // etc. 4296 // 4297 // USAT works similarly to SSAT, but bounds to the interval [0, k] where k + 1 is 4298 // a power of 2. 4299 // 4300 // It returns true if the conversion can be done, false otherwise. 4301 // Additionally, the variable is returned in parameter V, the constant in K, and 4302 // usat is set to true if the conditional represents an unsigned saturation. 4303 static bool isSaturatingConditional(const SDValue &Op, SDValue &V, 4304 uint64_t &K, bool &usat) { 4305 SDValue LHS1 = Op.getOperand(0); 4306 SDValue RHS1 = Op.getOperand(1); 4307 SDValue TrueVal1 = Op.getOperand(2); 4308 SDValue FalseVal1 = Op.getOperand(3); 4309 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4310 4311 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1; 4312 if (Op2.getOpcode() != ISD::SELECT_CC) 4313 return false; 4314 4315 SDValue LHS2 = Op2.getOperand(0); 4316 SDValue RHS2 = Op2.getOperand(1); 4317 SDValue TrueVal2 = Op2.getOperand(2); 4318 SDValue FalseVal2 = Op2.getOperand(3); 4319 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); 4320 4321 // Find out which are the constants and which are the variables 4322 // in each conditional 4323 SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1) 4324 ? &RHS1 4325 : nullptr; 4326 SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2) 4327 ? &RHS2 4328 : nullptr; 4329 SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2; 4330 SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; 4331 SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; 4332 SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; 4333 4334 // We must detect cases where the original operations worked with 16- or 4335 // 8-bit values. In such cases, V2Tmp != V2 because the comparison operations 4336 // must work with sign-extended values but the select operations return 4337 // the original non-extended value. 4338 SDValue V2TmpReg = V2Tmp; 4339 if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) 4340 V2TmpReg = V2Tmp->getOperand(0); 4341 4342 // Check that the registers and the constants have the correct values 4343 // in both conditionals 4344 if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || 4345 V2TmpReg != V2) 4346 return false; 4347 4348 // Figure out which conditional is saturating the lower/upper bound. 4349 const SDValue *LowerCheckOp = 4350 isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4351 ? &Op 4352 : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4353 ? &Op2 4354 : nullptr; 4355 const SDValue *UpperCheckOp = 4356 isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) 4357 ?
&Op 4358 : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) 4359 ? &Op2 4360 : nullptr; 4361 4362 if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) 4363 return false; 4364 4365 // Check that the constant in the lower-bound check is 4366 // the opposite of the constant in the upper-bound check 4367 // in 1's complement. 4368 int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue(); 4369 int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue(); 4370 int64_t PosVal = std::max(Val1, Val2); 4371 int64_t NegVal = std::min(Val1, Val2); 4372 4373 if (((Val1 > Val2 && UpperCheckOp == &Op) || 4374 (Val1 < Val2 && UpperCheckOp == &Op2)) && 4375 isPowerOf2_64(PosVal + 1)) { 4376 4377 // Handle the difference between USAT (unsigned) and SSAT (signed) saturation 4378 if (Val1 == ~Val2) 4379 usat = false; 4380 else if (NegVal == 0) 4381 usat = true; 4382 else 4383 return false; 4384 4385 V = V2; 4386 K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive 4387 4388 return true; 4389 } 4390 4391 return false; 4392 } 4393 4394 // Check if a condition of the type x < k ? k : x can be converted into a 4395 // bit operation instead of conditional moves. 4396 // Currently this is allowed given: 4397 // - The conditions and values match up 4398 // - k is 0 or -1 (all ones) 4399 // This function will not check the last condition, that's up to the caller. 4400 // It returns true if the transformation can be made, and in that case 4401 // returns x in V, and k in SatK. 4402 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, 4403 SDValue &SatK) 4404 { 4405 SDValue LHS = Op.getOperand(0); 4406 SDValue RHS = Op.getOperand(1); 4407 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4408 SDValue TrueVal = Op.getOperand(2); 4409 SDValue FalseVal = Op.getOperand(3); 4410 4411 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS) 4412 ? &RHS 4413 : nullptr; 4414 4415 // No constant operand in the comparison, early out 4416 if (!K) 4417 return false; 4418 4419 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal; 4420 V = (KTmp == TrueVal) ? FalseVal : TrueVal; 4421 SDValue VTmp = (K && *K == LHS) ? RHS : LHS; 4422 4423 // If the constant in the comparison does not match the constant in the 4424 // select, or likewise for the variable, early out 4425 if (*K != KTmp || V != VTmp) 4426 return false; 4427 4428 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { 4429 SatK = *K; 4430 return true; 4431 } 4432 4433 return false; 4434 } 4435 4436 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 4437 EVT VT = Op.getValueType(); 4438 SDLoc dl(Op); 4439 4440 // Try to convert two saturating conditional selects into a single SSAT 4441 SDValue SatValue; 4442 uint64_t SatConstant; 4443 bool SatUSat; 4444 if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && 4445 isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { 4446 if (SatUSat) 4447 return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, 4448 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4449 else 4450 return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, 4451 DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); 4452 } 4453 4454 // Try to convert expressions of the form x < k ? k : x (and similar forms) 4455 // into more efficient bit operations, which is possible when k is 0 or -1. 4456 // On ARM and Thumb-2, which have flexible operand 2, this will result in 4457 // single instructions.
On Thumb the shift and the bit operation will be two 4458 // instructions. 4459 // Only allow this transformation on full-width (32-bit) operations 4460 SDValue LowerSatConstant; 4461 if (VT == MVT::i32 && 4462 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { 4463 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, 4464 DAG.getConstant(31, dl, VT)); 4465 if (isNullConstant(LowerSatConstant)) { 4466 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, 4467 DAG.getAllOnesConstant(dl, VT)); 4468 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); 4469 } else if (isAllOnesConstant(LowerSatConstant)) 4470 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); 4471 } 4472 4473 SDValue LHS = Op.getOperand(0); 4474 SDValue RHS = Op.getOperand(1); 4475 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4476 SDValue TrueVal = Op.getOperand(2); 4477 SDValue FalseVal = Op.getOperand(3); 4478 4479 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4480 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4481 dl); 4482 4483 // If softenSetCCOperands only returned one value, we should compare it to 4484 // zero. 4485 if (!RHS.getNode()) { 4486 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4487 CC = ISD::SETNE; 4488 } 4489 } 4490 4491 if (LHS.getValueType() == MVT::i32) { 4492 // Try to generate VSEL on ARMv8. 4493 // The VSEL instruction can't use all the usual ARM condition 4494 // codes: it only has two bits to select the condition code, so it's 4495 // constrained to use only GE, GT, VS and EQ. 4496 // 4497 // To implement all the various ISD::SETXXX opcodes, we sometimes need to 4498 // swap the operands of the previous compare instruction (effectively 4499 // inverting the compare condition, swapping 'less' and 'greater') and 4500 // sometimes need to swap the operands to the VSEL (which inverts the 4501 // condition in the sense of firing whenever the previous condition didn't) 4502 if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f16 || 4503 TrueVal.getValueType() == MVT::f32 || 4504 TrueVal.getValueType() == MVT::f64)) { 4505 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4506 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || 4507 CondCode == ARMCC::VC || CondCode == ARMCC::NE) { 4508 CC = ISD::getSetCCInverse(CC, true); 4509 std::swap(TrueVal, FalseVal); 4510 } 4511 } 4512 4513 SDValue ARMcc; 4514 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4515 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4516 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4517 } 4518 4519 ARMCC::CondCodes CondCode, CondCode2; 4520 bool InvalidOnQNaN; 4521 FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); 4522 4523 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we 4524 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we 4525 // must use VSEL (limited condition codes), due to not having conditional f16 4526 // moves. 
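// In other words, the guard below only takes the VSEL path for f16/f32/f64
// results on FPARMv8, and skips it when RHS is a floating-point zero unless
// the result type is f16, keeping the compare-against-zero form otherwise.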
4527 if (Subtarget->hasFPARMv8() && 4528 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && 4529 (TrueVal.getValueType() == MVT::f16 || 4530 TrueVal.getValueType() == MVT::f32 || 4531 TrueVal.getValueType() == MVT::f64)) { 4532 bool swpCmpOps = false; 4533 bool swpVselOps = false; 4534 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); 4535 4536 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || 4537 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { 4538 if (swpCmpOps) 4539 std::swap(LHS, RHS); 4540 if (swpVselOps) 4541 std::swap(TrueVal, FalseVal); 4542 } 4543 } 4544 4545 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4546 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4547 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4548 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); 4549 if (CondCode2 != ARMCC::AL) { 4550 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); 4551 // FIXME: Needs another CMP because flag can have but one use. 4552 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4553 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); 4554 } 4555 return Result; 4556 } 4557 4558 /// canChangeToInt - Given the fp compare operand, return true if it is suitable 4559 /// to morph to an integer compare sequence. 4560 static bool canChangeToInt(SDValue Op, bool &SeenZero, 4561 const ARMSubtarget *Subtarget) { 4562 SDNode *N = Op.getNode(); 4563 if (!N->hasOneUse()) 4564 // Otherwise it requires moving the value from fp to integer registers. 4565 return false; 4566 if (!N->getNumValues()) 4567 return false; 4568 EVT VT = Op.getValueType(); 4569 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) 4570 // f32 case is generally profitable. f64 case only makes sense when vcmpe + 4571 // vmrs are very slow, e.g. cortex-a8. 
4572 return false; 4573 4574 if (isFloatingPointZero(Op)) { 4575 SeenZero = true; 4576 return true; 4577 } 4578 return ISD::isNormalLoad(N); 4579 } 4580 4581 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { 4582 if (isFloatingPointZero(Op)) 4583 return DAG.getConstant(0, SDLoc(Op), MVT::i32); 4584 4585 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) 4586 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), 4587 Ld->getPointerInfo(), Ld->getAlignment(), 4588 Ld->getMemOperand()->getFlags()); 4589 4590 llvm_unreachable("Unknown VFP cmp argument!"); 4591 } 4592 4593 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, 4594 SDValue &RetVal1, SDValue &RetVal2) { 4595 SDLoc dl(Op); 4596 4597 if (isFloatingPointZero(Op)) { 4598 RetVal1 = DAG.getConstant(0, dl, MVT::i32); 4599 RetVal2 = DAG.getConstant(0, dl, MVT::i32); 4600 return; 4601 } 4602 4603 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) { 4604 SDValue Ptr = Ld->getBasePtr(); 4605 RetVal1 = 4606 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 4607 Ld->getAlignment(), Ld->getMemOperand()->getFlags()); 4608 4609 EVT PtrType = Ptr.getValueType(); 4610 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); 4611 SDValue NewPtr = DAG.getNode(ISD::ADD, dl, 4612 PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); 4613 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, 4614 Ld->getPointerInfo().getWithOffset(4), NewAlign, 4615 Ld->getMemOperand()->getFlags()); 4616 return; 4617 } 4618 4619 llvm_unreachable("Unknown VFP cmp argument!"); 4620 } 4621 4622 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some 4623 /// f32 and even f64 comparisons to integer ones. 4624 SDValue 4625 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { 4626 SDValue Chain = Op.getOperand(0); 4627 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4628 SDValue LHS = Op.getOperand(2); 4629 SDValue RHS = Op.getOperand(3); 4630 SDValue Dest = Op.getOperand(4); 4631 SDLoc dl(Op); 4632 4633 bool LHSSeenZero = false; 4634 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); 4635 bool RHSSeenZero = false; 4636 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); 4637 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { 4638 // If unsafe fp math optimization is enabled and there are no other uses of 4639 // the CMP operands, and the condition code is EQ or NE, we can optimize it 4640 // to an integer comparison. 
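// The 0x7fffffff mask below clears the IEEE sign bit, so +0.0 and -0.0 get the
// same integer pattern; since one operand is known to be zero, the integer
// EQ/NE compare then agrees with the floating-point one (NaNs aside, which the
// unsafe-fp-math assumption lets us ignore).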
4641 if (CC == ISD::SETOEQ) 4642 CC = ISD::SETEQ; 4643 else if (CC == ISD::SETUNE) 4644 CC = ISD::SETNE; 4645 4646 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); 4647 SDValue ARMcc; 4648 if (LHS.getValueType() == MVT::f32) { 4649 LHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4650 bitcastf32Toi32(LHS, DAG), Mask); 4651 RHS = DAG.getNode(ISD::AND, dl, MVT::i32, 4652 bitcastf32Toi32(RHS, DAG), Mask); 4653 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4654 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4655 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4656 Chain, Dest, ARMcc, CCR, Cmp); 4657 } 4658 4659 SDValue LHS1, LHS2; 4660 SDValue RHS1, RHS2; 4661 expandf64Toi32(LHS, DAG, LHS1, LHS2); 4662 expandf64Toi32(RHS, DAG, RHS1, RHS2); 4663 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); 4664 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); 4665 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4666 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4667 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4668 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; 4669 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); 4670 } 4671 4672 return SDValue(); 4673 } 4674 4675 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 4676 SDValue Chain = Op.getOperand(0); 4677 SDValue Cond = Op.getOperand(1); 4678 SDValue Dest = Op.getOperand(2); 4679 SDLoc dl(Op); 4680 4681 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 4682 // instruction. 4683 unsigned Opc = Cond.getOpcode(); 4684 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 4685 !Subtarget->isThumb1Only(); 4686 if (Cond.getResNo() == 1 && 4687 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4688 Opc == ISD::USUBO || OptimizeMul)) { 4689 // Only lower legal XALUO ops. 4690 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) 4691 return SDValue(); 4692 4693 // The actual operation with overflow check. 4694 SDValue Value, OverflowCmp; 4695 SDValue ARMcc; 4696 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); 4697 4698 // Reverse the condition code. 4699 ARMCC::CondCodes CondCode = 4700 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 4701 CondCode = ARMCC::getOppositeCondition(CondCode); 4702 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 4703 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4704 4705 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 4706 OverflowCmp); 4707 } 4708 4709 return SDValue(); 4710 } 4711 4712 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 4713 SDValue Chain = Op.getOperand(0); 4714 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4715 SDValue LHS = Op.getOperand(2); 4716 SDValue RHS = Op.getOperand(3); 4717 SDValue Dest = Op.getOperand(4); 4718 SDLoc dl(Op); 4719 4720 if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { 4721 DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, 4722 dl); 4723 4724 // If softenSetCCOperands only returned one value, we should compare it to 4725 // zero. 4726 if (!RHS.getNode()) { 4727 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4728 CC = ISD::SETNE; 4729 } 4730 } 4731 4732 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 4733 // instruction. 
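// The shape matched below is a br_cc whose LHS is result #1 (the overflow bit)
// of one of those nodes, compared against the constant 0 or 1 with
// SETEQ/SETNE; the compare is folded away and we branch on the overflow flags
// directly, reversing the condition code when needed.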
4734 unsigned Opc = LHS.getOpcode(); 4735 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && 4736 !Subtarget->isThumb1Only(); 4737 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && 4738 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 4739 Opc == ISD::USUBO || OptimizeMul) && 4740 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 4741 // Only lower legal XALUO ops. 4742 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 4743 return SDValue(); 4744 4745 // The actual operation with overflow check. 4746 SDValue Value, OverflowCmp; 4747 SDValue ARMcc; 4748 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); 4749 4750 if ((CC == ISD::SETNE) != isOneConstant(RHS)) { 4751 // Reverse the condition code. 4752 ARMCC::CondCodes CondCode = 4753 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue(); 4754 CondCode = ARMCC::getOppositeCondition(CondCode); 4755 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); 4756 } 4757 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4758 4759 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, 4760 OverflowCmp); 4761 } 4762 4763 if (LHS.getValueType() == MVT::i32) { 4764 SDValue ARMcc; 4765 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); 4766 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4767 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, 4768 Chain, Dest, ARMcc, CCR, Cmp); 4769 } 4770 4771 if (getTargetMachine().Options.UnsafeFPMath && 4772 (CC == ISD::SETEQ || CC == ISD::SETOEQ || 4773 CC == ISD::SETNE || CC == ISD::SETUNE)) { 4774 if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) 4775 return Result; 4776 } 4777 4778 ARMCC::CondCodes CondCode, CondCode2; 4779 bool InvalidOnQNaN; 4780 FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); 4781 4782 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4783 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); 4784 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 4785 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); 4786 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; 4787 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4788 if (CondCode2 != ARMCC::AL) { 4789 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 4790 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; 4791 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); 4792 } 4793 return Res; 4794 } 4795 4796 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { 4797 SDValue Chain = Op.getOperand(0); 4798 SDValue Table = Op.getOperand(1); 4799 SDValue Index = Op.getOperand(2); 4800 SDLoc dl(Op); 4801 4802 EVT PTy = getPointerTy(DAG.getDataLayout()); 4803 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); 4804 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); 4805 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); 4806 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); 4807 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); 4808 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { 4809 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table 4810 // which does another jump to the destination. This also makes it easier 4811 // to translate it to TBB / TBH later (Thumb2 only). 4812 // FIXME: This might not work if the function is extremely large. 
4813 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, 4814 Addr, Op.getOperand(2), JTI); 4815 } 4816 if (isPositionIndependent() || Subtarget->isROPI()) { 4817 Addr = 4818 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, 4819 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4820 Chain = Addr.getValue(1); 4821 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); 4822 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4823 } else { 4824 Addr = 4825 DAG.getLoad(PTy, dl, Chain, Addr, 4826 MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); 4827 Chain = Addr.getValue(1); 4828 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); 4829 } 4830 } 4831 4832 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 4833 EVT VT = Op.getValueType(); 4834 SDLoc dl(Op); 4835 4836 if (Op.getValueType().getVectorElementType() == MVT::i32) { 4837 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) 4838 return Op; 4839 return DAG.UnrollVectorOp(Op.getNode()); 4840 } 4841 4842 const bool HasFullFP16 = 4843 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 4844 4845 EVT NewTy; 4846 const EVT OpTy = Op.getOperand(0).getValueType(); 4847 if (OpTy == MVT::v4f32) 4848 NewTy = MVT::v4i32; 4849 else if (OpTy == MVT::v4f16 && HasFullFP16) 4850 NewTy = MVT::v4i16; 4851 else if (OpTy == MVT::v8f16 && HasFullFP16) 4852 NewTy = MVT::v8i16; 4853 else 4854 llvm_unreachable("Invalid type for custom lowering!"); 4855 4856 if (VT != MVT::v4i16 && VT != MVT::v8i16) 4857 return DAG.UnrollVectorOp(Op.getNode()); 4858 4859 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0)); 4860 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); 4861 } 4862 4863 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { 4864 EVT VT = Op.getValueType(); 4865 if (VT.isVector()) 4866 return LowerVectorFP_TO_INT(Op, DAG); 4867 if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { 4868 RTLIB::Libcall LC; 4869 if (Op.getOpcode() == ISD::FP_TO_SINT) 4870 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), 4871 Op.getValueType()); 4872 else 4873 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), 4874 Op.getValueType()); 4875 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4876 /*isSigned*/ false, SDLoc(Op)).first; 4877 } 4878 4879 return Op; 4880 } 4881 4882 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4883 EVT VT = Op.getValueType(); 4884 SDLoc dl(Op); 4885 4886 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { 4887 if (VT.getVectorElementType() == MVT::f32) 4888 return Op; 4889 return DAG.UnrollVectorOp(Op.getNode()); 4890 } 4891 4892 assert((Op.getOperand(0).getValueType() == MVT::v4i16 || 4893 Op.getOperand(0).getValueType() == MVT::v8i16) && 4894 "Invalid type for custom lowering!"); 4895 4896 const bool HasFullFP16 = 4897 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16(); 4898 4899 EVT DestVecType; 4900 if (VT == MVT::v4f32) 4901 DestVecType = MVT::v4i32; 4902 else if (VT == MVT::v4f16 && HasFullFP16) 4903 DestVecType = MVT::v4i16; 4904 else if (VT == MVT::v8f16 && HasFullFP16) 4905 DestVecType = MVT::v8i16; 4906 else 4907 return DAG.UnrollVectorOp(Op.getNode()); 4908 4909 unsigned CastOpc; 4910 unsigned Opc; 4911 switch (Op.getOpcode()) { 4912 default: llvm_unreachable("Invalid opcode!"); 4913 case ISD::SINT_TO_FP: 4914 CastOpc = ISD::SIGN_EXTEND; 4915 Opc = ISD::SINT_TO_FP; 4916 break; 4917 case 
ISD::UINT_TO_FP: 4918 CastOpc = ISD::ZERO_EXTEND; 4919 Opc = ISD::UINT_TO_FP; 4920 break; 4921 } 4922 4923 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0)); 4924 return DAG.getNode(Opc, dl, VT, Op); 4925 } 4926 4927 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { 4928 EVT VT = Op.getValueType(); 4929 if (VT.isVector()) 4930 return LowerVectorINT_TO_FP(Op, DAG); 4931 if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { 4932 RTLIB::Libcall LC; 4933 if (Op.getOpcode() == ISD::SINT_TO_FP) 4934 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), 4935 Op.getValueType()); 4936 else 4937 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), 4938 Op.getValueType()); 4939 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 4940 /*isSigned*/ false, SDLoc(Op)).first; 4941 } 4942 4943 return Op; 4944 } 4945 4946 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 4947 // Implement fcopysign with a fabs and a conditional fneg. 4948 SDValue Tmp0 = Op.getOperand(0); 4949 SDValue Tmp1 = Op.getOperand(1); 4950 SDLoc dl(Op); 4951 EVT VT = Op.getValueType(); 4952 EVT SrcVT = Tmp1.getValueType(); 4953 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 4954 Tmp0.getOpcode() == ARMISD::VMOVDRR; 4955 bool UseNEON = !InGPR && Subtarget->hasNEON(); 4956 4957 if (UseNEON) { 4958 // Use VBSL to copy the sign bit. 4959 unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); 4960 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 4961 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 4962 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 4963 if (VT == MVT::f64) 4964 Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4965 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 4966 DAG.getConstant(32, dl, MVT::i32)); 4967 else /*if (VT == MVT::f32)*/ 4968 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 4969 if (SrcVT == MVT::f32) { 4970 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 4971 if (VT == MVT::f64) 4972 Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, 4973 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 4974 DAG.getConstant(32, dl, MVT::i32)); 4975 } else if (VT == MVT::f32) 4976 Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, 4977 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 4978 DAG.getConstant(32, dl, MVT::i32)); 4979 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 4980 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 4981 4982 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), 4983 dl, MVT::i32); 4984 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 4985 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 4986 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 4987 4988 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 4989 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 4990 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 4991 if (VT == MVT::f32) { 4992 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 4993 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 4994 DAG.getConstant(0, dl, MVT::i32)); 4995 } else { 4996 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 4997 } 4998 4999 return Res; 5000 } 5001 5002 // Bitcast operand 1 to i32. 5003 if (SrcVT == MVT::f64) 5004 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5005 Tmp1).getValue(1); 5006 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5007 5008 // Or in the signbit with integer operations. 
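// That is, take only the sign bit of Tmp1 (mask 0x80000000) and OR it into the
// magnitude bits of Tmp0 (mask 0x7fffffff); for f64 this is applied to the
// high word of the value.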
5009 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5010 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5011 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5012 if (VT == MVT::f32) { 5013 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5014 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5015 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5016 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5017 } 5018 5019 // f64: Or the high part with signbit and then combine two parts. 5020 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5021 Tmp0); 5022 SDValue Lo = Tmp0.getValue(0); 5023 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5024 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5025 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5026 } 5027 5028 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5029 MachineFunction &MF = DAG.getMachineFunction(); 5030 MachineFrameInfo &MFI = MF.getFrameInfo(); 5031 MFI.setReturnAddressIsTaken(true); 5032 5033 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5034 return SDValue(); 5035 5036 EVT VT = Op.getValueType(); 5037 SDLoc dl(Op); 5038 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5039 if (Depth) { 5040 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5041 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5042 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5043 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5044 MachinePointerInfo()); 5045 } 5046 5047 // Return LR, which contains the return address. Mark it an implicit live-in. 5048 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5049 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5050 } 5051 5052 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5053 const ARMBaseRegisterInfo &ARI = 5054 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5055 MachineFunction &MF = DAG.getMachineFunction(); 5056 MachineFrameInfo &MFI = MF.getFrameInfo(); 5057 MFI.setFrameAddressIsTaken(true); 5058 5059 EVT VT = Op.getValueType(); 5060 SDLoc dl(Op); // FIXME probably not meaningful 5061 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5062 unsigned FrameReg = ARI.getFrameRegister(MF); 5063 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5064 while (Depth--) 5065 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5066 MachinePointerInfo()); 5067 return FrameAddr; 5068 } 5069 5070 // FIXME? Maybe this could be a TableGen attribute on some registers and 5071 // this table could be generated automatically from RegInfo. 5072 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, 5073 SelectionDAG &DAG) const { 5074 unsigned Reg = StringSwitch<unsigned>(RegName) 5075 .Case("sp", ARM::SP) 5076 .Default(0); 5077 if (Reg) 5078 return Reg; 5079 report_fatal_error(Twine("Invalid register name \"" 5080 + StringRef(RegName) + "\".")); 5081 } 5082 5083 // Result is 64 bit value so split into two 32 bit values and return as a 5084 // pair of values. 5085 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5086 SelectionDAG &DAG) { 5087 SDLoc DL(N); 5088 5089 // This function is only supposed to be called for i64 type destination. 
5090 assert(N->getValueType(0) == MVT::i64 5091 && "ExpandREAD_REGISTER called for non-i64 type result."); 5092 5093 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5094 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5095 N->getOperand(0), 5096 N->getOperand(1)); 5097 5098 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5099 Read.getValue(1))); 5100 Results.push_back(Read.getOperand(0)); 5101 } 5102 5103 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 5104 /// When \p DstVT, the destination type of \p BC, is on the vector 5105 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5106 /// it might be possible to combine them, such that everything stays on the 5107 /// vector register bank. 5108 /// \p return The node that would replace \p BT, if the combine 5109 /// is possible. 5110 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5111 SelectionDAG &DAG) { 5112 SDValue Op = BC->getOperand(0); 5113 EVT DstVT = BC->getValueType(0); 5114 5115 // The only vector instruction that can produce a scalar (remember, 5116 // since the bitcast was about to be turned into VMOVDRR, the source 5117 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5118 // Moreover, we can do this combine only if there is one use. 5119 // Finally, if the destination type is not a vector, there is not 5120 // much point on forcing everything on the vector bank. 5121 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5122 !Op.hasOneUse()) 5123 return SDValue(); 5124 5125 // If the index is not constant, we will introduce an additional 5126 // multiply that will stick. 5127 // Give up in that case. 5128 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5129 if (!Index) 5130 return SDValue(); 5131 unsigned DstNumElt = DstVT.getVectorNumElements(); 5132 5133 // Compute the new index. 5134 const APInt &APIntIndex = Index->getAPIntValue(); 5135 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5136 NewIndex *= APIntIndex; 5137 // Check if the new constant index fits into i32. 5138 if (NewIndex.getBitWidth() > 32) 5139 return SDValue(); 5140 5141 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5142 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5143 SDLoc dl(Op); 5144 SDValue ExtractSrc = Op.getOperand(0); 5145 EVT VecVT = EVT::getVectorVT( 5146 *DAG.getContext(), DstVT.getScalarType(), 5147 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5148 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5149 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5150 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5151 } 5152 5153 /// ExpandBITCAST - If the target supports VFP, this function is called to 5154 /// expand a bit convert where either the source or destination type is i64 to 5155 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5156 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5157 /// vectors), since the legalizer won't know what to do with that. 5158 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5159 const ARMSubtarget *Subtarget) { 5160 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5161 SDLoc dl(N); 5162 SDValue Op = N->getOperand(0); 5163 5164 // This function is only supposed to be called for i64 types, either as the 5165 // source or destination of the bit convert. 
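// Note: besides the i64 cases, the code below also recognizes half-precision
// related bitcasts (an f32->i32 bitcast feeding ARMISD::VMOVhr, and the
// i16<->f16 pairs) used for FullFP16 and soft-float f16 arguments and returns.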
5166 EVT SrcVT = Op.getValueType(); 5167 EVT DstVT = N->getValueType(0); 5168 const bool HasFullFP16 = Subtarget->hasFullFP16(); 5169 5170 if (SrcVT == MVT::f32 && DstVT == MVT::i32) { 5171 // FullFP16: half values are passed in S-registers, and we don't 5172 // need any of the bitcast and moves: 5173 // 5174 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 5175 // t5: i32 = bitcast t2 5176 // t18: f16 = ARMISD::VMOVhr t5 5177 if (Op.getOpcode() != ISD::CopyFromReg || 5178 Op.getValueType() != MVT::f32) 5179 return SDValue(); 5180 5181 auto Move = N->use_begin(); 5182 if (Move->getOpcode() != ARMISD::VMOVhr) 5183 return SDValue(); 5184 5185 SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; 5186 SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); 5187 DAG.ReplaceAllUsesWith(*Move, &Copy); 5188 return Copy; 5189 } 5190 5191 if (SrcVT == MVT::i16 && DstVT == MVT::f16) { 5192 if (!HasFullFP16) 5193 return SDValue(); 5194 // SoftFP: read half-precision arguments: 5195 // 5196 // t2: i32,ch = ... 5197 // t7: i16 = truncate t2 <~~~~ Op 5198 // t8: f16 = bitcast t7 <~~~~ N 5199 // 5200 if (Op.getOperand(0).getValueType() == MVT::i32) 5201 return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), 5202 MVT::f16, Op.getOperand(0)); 5203 5204 return SDValue(); 5205 } 5206 5207 // Half-precision return values 5208 if (SrcVT == MVT::f16 && DstVT == MVT::i16) { 5209 if (!HasFullFP16) 5210 return SDValue(); 5211 // 5212 // t11: f16 = fadd t8, t10 5213 // t12: i16 = bitcast t11 <~~~ SDNode N 5214 // t13: i32 = zero_extend t12 5215 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 5216 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 5217 // 5218 // transform this into: 5219 // 5220 // t20: i32 = ARMISD::VMOVrh t11 5221 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 5222 // 5223 auto ZeroExtend = N->use_begin(); 5224 if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || 5225 ZeroExtend->getValueType(0) != MVT::i32) 5226 return SDValue(); 5227 5228 auto Copy = ZeroExtend->use_begin(); 5229 if (Copy->getOpcode() == ISD::CopyToReg && 5230 Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { 5231 SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); 5232 DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); 5233 return Cvt; 5234 } 5235 return SDValue(); 5236 } 5237 5238 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5239 return SDValue(); 5240 5241 // Turn i64->f64 into VMOVDRR. 5242 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5243 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5244 // if we can combine the bitcast with its source. 5245 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5246 return Val; 5247 5248 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5249 DAG.getConstant(0, dl, MVT::i32)); 5250 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5251 DAG.getConstant(1, dl, MVT::i32)); 5252 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5253 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5254 } 5255 5256 // Turn f64->i64 into VMOVRRD. 
5257 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5258 SDValue Cvt; 5259 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5260 SrcVT.getVectorNumElements() > 1) 5261 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5262 DAG.getVTList(MVT::i32, MVT::i32), 5263 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5264 else 5265 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5266 DAG.getVTList(MVT::i32, MVT::i32), Op); 5267 // Merge the pieces into a single i64 value. 5268 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5269 } 5270 5271 return SDValue(); 5272 } 5273 5274 /// getZeroVector - Returns a vector of specified type with all zero elements. 5275 /// Zero vectors are used to represent vector negation and in those cases 5276 /// will be implemented with the NEON VNEG instruction. However, VNEG does 5277 /// not support i64 elements, so sometimes the zero vectors will need to be 5278 /// explicitly constructed. Regardless, use a canonical VMOV to create the 5279 /// zero vector. 5280 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 5281 assert(VT.isVector() && "Expected a vector type"); 5282 // The canonical modified immediate encoding of a zero vector is....0! 5283 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 5284 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 5285 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 5286 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 5287 } 5288 5289 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5290 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5291 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 5292 SelectionDAG &DAG) const { 5293 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5294 EVT VT = Op.getValueType(); 5295 unsigned VTBits = VT.getSizeInBits(); 5296 SDLoc dl(Op); 5297 SDValue ShOpLo = Op.getOperand(0); 5298 SDValue ShOpHi = Op.getOperand(1); 5299 SDValue ShAmt = Op.getOperand(2); 5300 SDValue ARMcc; 5301 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5302 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5303 5304 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5305 5306 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5307 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5308 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5309 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5310 DAG.getConstant(VTBits, dl, MVT::i32)); 5311 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5312 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5313 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5314 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5315 ISD::SETGE, ARMcc, DAG, dl); 5316 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 5317 ARMcc, CCR, CmpLo); 5318 5319 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5320 SDValue HiBigShift = Opc == ISD::SRA 5321 ? 
DAG.getNode(Opc, dl, VT, ShOpHi, 5322 DAG.getConstant(VTBits - 1, dl, VT)) 5323 : DAG.getConstant(0, dl, VT); 5324 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5325 ISD::SETGE, ARMcc, DAG, dl); 5326 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5327 ARMcc, CCR, CmpHi); 5328 5329 SDValue Ops[2] = { Lo, Hi }; 5330 return DAG.getMergeValues(Ops, dl); 5331 } 5332 5333 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5334 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 5335 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 5336 SelectionDAG &DAG) const { 5337 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5338 EVT VT = Op.getValueType(); 5339 unsigned VTBits = VT.getSizeInBits(); 5340 SDLoc dl(Op); 5341 SDValue ShOpLo = Op.getOperand(0); 5342 SDValue ShOpHi = Op.getOperand(1); 5343 SDValue ShAmt = Op.getOperand(2); 5344 SDValue ARMcc; 5345 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5346 5347 assert(Op.getOpcode() == ISD::SHL_PARTS); 5348 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 5349 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 5350 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5351 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 5352 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 5353 5354 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 5355 DAG.getConstant(VTBits, dl, MVT::i32)); 5356 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 5357 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5358 ISD::SETGE, ARMcc, DAG, dl); 5359 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 5360 ARMcc, CCR, CmpHi); 5361 5362 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 5363 ISD::SETGE, ARMcc, DAG, dl); 5364 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5365 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 5366 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 5367 5368 SDValue Ops[2] = { Lo, Hi }; 5369 return DAG.getMergeValues(Ops, dl); 5370 } 5371 5372 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 5373 SelectionDAG &DAG) const { 5374 // The rounding mode is in bits 23:22 of the FPSCR. 5375 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 5376 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 5377 // so that the shift + and get folded into a bitfield extract. 
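// The intended grouping is ((FPSCR + (1 << 22)) >> 22) & 3. For example, RMode
// bits 23:22 of 0b00 become 1 after the add and shift, while 0b11 carries out
// of the field and wraps to 0, giving the 0->1 / 3->0 mapping described above.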
5378 SDLoc dl(Op); 5379 SDValue Ops[] = { DAG.getEntryNode(), 5380 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; 5381 5382 SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); 5383 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 5384 DAG.getConstant(1U << 22, dl, MVT::i32)); 5385 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 5386 DAG.getConstant(22, dl, MVT::i32)); 5387 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 5388 DAG.getConstant(3, dl, MVT::i32)); 5389 } 5390 5391 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 5392 const ARMSubtarget *ST) { 5393 SDLoc dl(N); 5394 EVT VT = N->getValueType(0); 5395 if (VT.isVector()) { 5396 assert(ST->hasNEON()); 5397 5398 // Compute the least significant set bit: LSB = X & -X 5399 SDValue X = N->getOperand(0); 5400 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 5401 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 5402 5403 EVT ElemTy = VT.getVectorElementType(); 5404 5405 if (ElemTy == MVT::i8) { 5406 // Compute with: cttz(x) = ctpop(lsb - 1) 5407 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5408 DAG.getTargetConstant(1, dl, ElemTy)); 5409 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5410 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5411 } 5412 5413 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 5414 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 5415 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 5416 unsigned NumBits = ElemTy.getSizeInBits(); 5417 SDValue WidthMinus1 = 5418 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5419 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 5420 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 5421 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 5422 } 5423 5424 // Compute with: cttz(x) = ctpop(lsb - 1) 5425 5426 // Compute LSB - 1. 5427 SDValue Bits; 5428 if (ElemTy == MVT::i64) { 5429 // Load constant 0xffff'ffff'ffff'ffff to register. 5430 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5431 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 5432 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 5433 } else { 5434 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 5435 DAG.getTargetConstant(1, dl, ElemTy)); 5436 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 5437 } 5438 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 5439 } 5440 5441 if (!ST->hasV6T2Ops()) 5442 return SDValue(); 5443 5444 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 5445 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 5446 } 5447 5448 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 5449 const ARMSubtarget *ST) { 5450 EVT VT = N->getValueType(0); 5451 SDLoc DL(N); 5452 5453 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 5454 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 5455 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 5456 "Unexpected type for custom ctpop lowering"); 5457 5458 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5459 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 5460 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 5461 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 5462 5463 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 5464 unsigned EltSize = 8; 5465 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 5466 while (EltSize != VT.getScalarSizeInBits()) { 5467 SmallVector<SDValue, 8> Ops; 5468 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 5469 TLI.getPointerTy(DAG.getDataLayout()))); 5470 Ops.push_back(Res); 5471 5472 EltSize *= 2; 5473 NumElts /= 2; 5474 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 5475 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 5476 } 5477 5478 return Res; 5479 } 5480 5481 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 5482 const ARMSubtarget *ST) { 5483 EVT VT = N->getValueType(0); 5484 SDLoc dl(N); 5485 5486 if (!VT.isVector()) 5487 return SDValue(); 5488 5489 // Lower vector shifts on NEON to use VSHL. 5490 assert(ST->hasNEON() && "unexpected vector shift"); 5491 5492 // Left shifts translate directly to the vshiftu intrinsic. 5493 if (N->getOpcode() == ISD::SHL) 5494 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5495 DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, 5496 MVT::i32), 5497 N->getOperand(0), N->getOperand(1)); 5498 5499 assert((N->getOpcode() == ISD::SRA || 5500 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); 5501 5502 // NEON uses the same intrinsics for both left and right shifts. For 5503 // right shifts, the shift amounts are negative, so negate the vector of 5504 // shift amounts. 5505 EVT ShiftVT = N->getOperand(1).getValueType(); 5506 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, 5507 getZeroVector(ShiftVT, DAG, dl), 5508 N->getOperand(1)); 5509 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? 5510 Intrinsic::arm_neon_vshifts : 5511 Intrinsic::arm_neon_vshiftu); 5512 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 5513 DAG.getConstant(vshiftInt, dl, MVT::i32), 5514 N->getOperand(0), NegatedCount); 5515 } 5516 5517 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 5518 const ARMSubtarget *ST) { 5519 EVT VT = N->getValueType(0); 5520 SDLoc dl(N); 5521 5522 // We can get here for a node like i32 = ISD::SHL i32, i64 5523 if (VT != MVT::i64) 5524 return SDValue(); 5525 5526 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && 5527 "Unknown shift to lower!"); 5528 5529 // We only lower SRA, SRL of 1 here, all others use generic lowering. 5530 if (!isOneConstant(N->getOperand(1))) 5531 return SDValue(); 5532 5533 // If we are in thumb mode, we don't have RRX. 5534 if (ST->isThumb1Only()) return SDValue(); 5535 5536 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 5537 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5538 DAG.getConstant(0, dl, MVT::i32)); 5539 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 5540 DAG.getConstant(1, dl, MVT::i32)); 5541 5542 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 5543 // captures the result into a carry flag. 5544 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 5545 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 5546 5547 // The low part is an ARMISD::RRX operand, which shifts the carry in. 5548 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 5549 5550 // Merge the pieces into a single i64 value. 
5551 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5552 } 5553 5554 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5555 SDValue TmpOp0, TmpOp1; 5556 bool Invert = false; 5557 bool Swap = false; 5558 unsigned Opc = 0; 5559 5560 SDValue Op0 = Op.getOperand(0); 5561 SDValue Op1 = Op.getOperand(1); 5562 SDValue CC = Op.getOperand(2); 5563 EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 5564 EVT VT = Op.getValueType(); 5565 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5566 SDLoc dl(Op); 5567 5568 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 5569 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 5570 // Special-case integer 64-bit equality comparisons. They aren't legal, 5571 // but they can be lowered with a few vector instructions. 5572 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 5573 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 5574 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 5575 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 5576 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 5577 DAG.getCondCode(ISD::SETEQ)); 5578 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 5579 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 5580 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 5581 if (SetCCOpcode == ISD::SETNE) 5582 Merged = DAG.getNOT(dl, Merged, CmpVT); 5583 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 5584 return Merged; 5585 } 5586 5587 if (CmpVT.getVectorElementType() == MVT::i64) 5588 // 64-bit comparisons are not legal in general. 5589 return SDValue(); 5590 5591 if (Op1.getValueType().isFloatingPoint()) { 5592 switch (SetCCOpcode) { 5593 default: llvm_unreachable("Illegal FP comparison"); 5594 case ISD::SETUNE: 5595 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5596 case ISD::SETOEQ: 5597 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5598 case ISD::SETOLT: 5599 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5600 case ISD::SETOGT: 5601 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5602 case ISD::SETOLE: 5603 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5604 case ISD::SETOGE: 5605 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5606 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 5607 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; 5608 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 5609 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; 5610 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 5611 case ISD::SETONE: 5612 // Expand this to (OLT | OGT). 5613 TmpOp0 = Op0; 5614 TmpOp1 = Op1; 5615 Opc = ISD::OR; 5616 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5617 Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); 5618 break; 5619 case ISD::SETUO: 5620 Invert = true; 5621 LLVM_FALLTHROUGH; 5622 case ISD::SETO: 5623 // Expand this to (OLT | OGE). 5624 TmpOp0 = Op0; 5625 TmpOp1 = Op1; 5626 Opc = ISD::OR; 5627 Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); 5628 Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); 5629 break; 5630 } 5631 } else { 5632 // Integer comparisons. 
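// Only the EQ/GT/GE node forms (plus unsigned GT/GE) are produced here, so the
// 'less than' conditions below swap the compare operands and SETNE is handled
// as an inverted VCEQ.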
5633 switch (SetCCOpcode) { 5634 default: llvm_unreachable("Illegal integer comparison"); 5635 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; 5636 case ISD::SETEQ: Opc = ARMISD::VCEQ; break; 5637 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 5638 case ISD::SETGT: Opc = ARMISD::VCGT; break; 5639 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 5640 case ISD::SETGE: Opc = ARMISD::VCGE; break; 5641 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 5642 case ISD::SETUGT: Opc = ARMISD::VCGTU; break; 5643 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 5644 case ISD::SETUGE: Opc = ARMISD::VCGEU; break; 5645 } 5646 5647 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 5648 if (Opc == ARMISD::VCEQ) { 5649 SDValue AndOp; 5650 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5651 AndOp = Op0; 5652 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 5653 AndOp = Op1; 5654 5655 // Ignore bitconvert. 5656 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 5657 AndOp = AndOp.getOperand(0); 5658 5659 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 5660 Opc = ARMISD::VTST; 5661 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 5662 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 5663 Invert = !Invert; 5664 } 5665 } 5666 } 5667 5668 if (Swap) 5669 std::swap(Op0, Op1); 5670 5671 // If one of the operands is a constant vector zero, attempt to fold the 5672 // comparison to a specialized compare-against-zero form. 5673 SDValue SingleOp; 5674 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 5675 SingleOp = Op0; 5676 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 5677 if (Opc == ARMISD::VCGE) 5678 Opc = ARMISD::VCLEZ; 5679 else if (Opc == ARMISD::VCGT) 5680 Opc = ARMISD::VCLTZ; 5681 SingleOp = Op1; 5682 } 5683 5684 SDValue Result; 5685 if (SingleOp.getNode()) { 5686 switch (Opc) { 5687 case ARMISD::VCEQ: 5688 Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; 5689 case ARMISD::VCGE: 5690 Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; 5691 case ARMISD::VCLEZ: 5692 Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; 5693 case ARMISD::VCGT: 5694 Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; 5695 case ARMISD::VCLTZ: 5696 Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; 5697 default: 5698 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5699 } 5700 } else { 5701 Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); 5702 } 5703 5704 Result = DAG.getSExtOrTrunc(Result, dl, VT); 5705 5706 if (Invert) 5707 Result = DAG.getNOT(dl, Result, VT); 5708 5709 return Result; 5710 } 5711 5712 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 5713 SDValue LHS = Op.getOperand(0); 5714 SDValue RHS = Op.getOperand(1); 5715 SDValue Carry = Op.getOperand(2); 5716 SDValue Cond = Op.getOperand(3); 5717 SDLoc DL(Op); 5718 5719 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 5720 5721 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 5722 // have to invert the carry first. 5723 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 5724 DAG.getConstant(1, DL, MVT::i32), Carry); 5725 // This converts the boolean value carry into the carry flag. 
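  // (ARM's subtract-with-carry treats C == 1 as "no borrow", which is why the
  // borrow bit produced for ISD::SUBCARRY is inverted with the 1 - Carry
  // computation above.)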
5726 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 5727 5728 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 5729 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 5730 5731 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 5732 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 5733 SDValue ARMcc = DAG.getConstant( 5734 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 5735 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 5736 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 5737 Cmp.getValue(1), SDValue()); 5738 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 5739 CCR, Chain.getValue(1)); 5740 } 5741 5742 /// isNEONModifiedImm - Check if the specified splat value corresponds to a 5743 /// valid vector constant for a NEON instruction with a "modified immediate" 5744 /// operand (e.g., VMOV). If so, return the encoded value. 5745 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 5746 unsigned SplatBitSize, SelectionDAG &DAG, 5747 const SDLoc &dl, EVT &VT, bool is128Bits, 5748 NEONModImmType type) { 5749 unsigned OpCmode, Imm; 5750 5751 // SplatBitSize is set to the smallest size that splats the vector, so a 5752 // zero vector will always have SplatBitSize == 8. However, NEON modified 5753 // immediate instructions others than VMOV do not support the 8-bit encoding 5754 // of a zero vector, and the default encoding of zero is supposed to be the 5755 // 32-bit version. 5756 if (SplatBits == 0) 5757 SplatBitSize = 32; 5758 5759 switch (SplatBitSize) { 5760 case 8: 5761 if (type != VMOVModImm) 5762 return SDValue(); 5763 // Any 1-byte value is OK. Op=0, Cmode=1110. 5764 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 5765 OpCmode = 0xe; 5766 Imm = SplatBits; 5767 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 5768 break; 5769 5770 case 16: 5771 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 5772 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 5773 if ((SplatBits & ~0xff) == 0) { 5774 // Value = 0x00nn: Op=x, Cmode=100x. 5775 OpCmode = 0x8; 5776 Imm = SplatBits; 5777 break; 5778 } 5779 if ((SplatBits & ~0xff00) == 0) { 5780 // Value = 0xnn00: Op=x, Cmode=101x. 5781 OpCmode = 0xa; 5782 Imm = SplatBits >> 8; 5783 break; 5784 } 5785 return SDValue(); 5786 5787 case 32: 5788 // NEON's 32-bit VMOV supports splat values where: 5789 // * only one byte is nonzero, or 5790 // * the least significant byte is 0xff and the second byte is nonzero, or 5791 // * the least significant 2 bytes are 0xff and the third is nonzero. 5792 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 5793 if ((SplatBits & ~0xff) == 0) { 5794 // Value = 0x000000nn: Op=x, Cmode=000x. 5795 OpCmode = 0; 5796 Imm = SplatBits; 5797 break; 5798 } 5799 if ((SplatBits & ~0xff00) == 0) { 5800 // Value = 0x0000nn00: Op=x, Cmode=001x. 5801 OpCmode = 0x2; 5802 Imm = SplatBits >> 8; 5803 break; 5804 } 5805 if ((SplatBits & ~0xff0000) == 0) { 5806 // Value = 0x00nn0000: Op=x, Cmode=010x. 5807 OpCmode = 0x4; 5808 Imm = SplatBits >> 16; 5809 break; 5810 } 5811 if ((SplatBits & ~0xff000000) == 0) { 5812 // Value = 0xnn000000: Op=x, Cmode=011x. 5813 OpCmode = 0x6; 5814 Imm = SplatBits >> 24; 5815 break; 5816 } 5817 5818 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 5819 if (type == OtherModImm) return SDValue(); 5820 5821 if ((SplatBits & ~0xffff) == 0 && 5822 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 5823 // Value = 0x0000nnff: Op=x, Cmode=1100. 
5824 OpCmode = 0xc; 5825 Imm = SplatBits >> 8; 5826 break; 5827 } 5828 5829 if ((SplatBits & ~0xffffff) == 0 && 5830 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 5831 // Value = 0x00nnffff: Op=x, Cmode=1101. 5832 OpCmode = 0xd; 5833 Imm = SplatBits >> 16; 5834 break; 5835 } 5836 5837 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 5838 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 5839 // VMOV.I32. A (very) minor optimization would be to replicate the value 5840 // and fall through here to test for a valid 64-bit splat. But, then the 5841 // caller would also need to check and handle the change in size. 5842 return SDValue(); 5843 5844 case 64: { 5845 if (type != VMOVModImm) 5846 return SDValue(); 5847 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 5848 uint64_t BitMask = 0xff; 5849 uint64_t Val = 0; 5850 unsigned ImmMask = 1; 5851 Imm = 0; 5852 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 5853 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 5854 Val |= BitMask; 5855 Imm |= ImmMask; 5856 } else if ((SplatBits & BitMask) != 0) { 5857 return SDValue(); 5858 } 5859 BitMask <<= 8; 5860 ImmMask <<= 1; 5861 } 5862 5863 if (DAG.getDataLayout().isBigEndian()) 5864 // swap higher and lower 32 bit word 5865 Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); 5866 5867 // Op=1, Cmode=1110. 5868 OpCmode = 0x1e; 5869 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 5870 break; 5871 } 5872 5873 default: 5874 llvm_unreachable("unexpected size for isNEONModifiedImm"); 5875 } 5876 5877 unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); 5878 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 5879 } 5880 5881 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 5882 const ARMSubtarget *ST) const { 5883 EVT VT = Op.getValueType(); 5884 bool IsDouble = (VT == MVT::f64); 5885 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 5886 const APFloat &FPVal = CFP->getValueAPF(); 5887 5888 // Prevent floating-point constants from using literal loads 5889 // when execute-only is enabled. 5890 if (ST->genExecuteOnly()) { 5891 // If we can represent the constant as an immediate, don't lower it 5892 if (isFPImmLegal(FPVal, VT)) 5893 return Op; 5894 // Otherwise, construct as integer, and move to float register 5895 APInt INTVal = FPVal.bitcastToAPInt(); 5896 SDLoc DL(CFP); 5897 switch (VT.getSimpleVT().SimpleTy) { 5898 default: 5899 llvm_unreachable("Unknown floating point type!"); 5900 break; 5901 case MVT::f64: { 5902 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 5903 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 5904 if (!ST->isLittle()) 5905 std::swap(Lo, Hi); 5906 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 5907 } 5908 case MVT::f32: 5909 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 5910 DAG.getConstant(INTVal, DL, MVT::i32)); 5911 } 5912 } 5913 5914 if (!ST->hasVFP3()) 5915 return SDValue(); 5916 5917 // Use the default (constant pool) lowering for double constants when we have 5918 // an SP-only FPU 5919 if (IsDouble && Subtarget->isFPOnlySP()) 5920 return SDValue(); 5921 5922 // Try splatting with a VMOV.f32... 5923 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 5924 5925 if (ImmVal != -1) { 5926 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 5927 // We have code in place to select a valid ConstantFP already, no need to 5928 // do any mangling. 
5929 return Op; 5930 } 5931 5932 // It's a float and we are trying to use NEON operations where 5933 // possible. Lower it to a splat followed by an extract. 5934 SDLoc DL(Op); 5935 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 5936 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 5937 NewVal); 5938 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 5939 DAG.getConstant(0, DL, MVT::i32)); 5940 } 5941 5942 // The rest of our options are NEON only, make sure that's allowed before 5943 // proceeding.. 5944 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 5945 return SDValue(); 5946 5947 EVT VMovVT; 5948 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 5949 5950 // It wouldn't really be worth bothering for doubles except for one very 5951 // important value, which does happen to match: 0.0. So make sure we don't do 5952 // anything stupid. 5953 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 5954 return SDValue(); 5955 5956 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 5957 SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 5958 VMovVT, false, VMOVModImm); 5959 if (NewVal != SDValue()) { 5960 SDLoc DL(Op); 5961 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 5962 NewVal); 5963 if (IsDouble) 5964 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5965 5966 // It's a float: cast and extract a vector element. 5967 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5968 VecConstant); 5969 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5970 DAG.getConstant(0, DL, MVT::i32)); 5971 } 5972 5973 // Finally, try a VMVN.i32 5974 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 5975 false, VMVNModImm); 5976 if (NewVal != SDValue()) { 5977 SDLoc DL(Op); 5978 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 5979 5980 if (IsDouble) 5981 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 5982 5983 // It's a float: cast and extract a vector element. 5984 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 5985 VecConstant); 5986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 5987 DAG.getConstant(0, DL, MVT::i32)); 5988 } 5989 5990 return SDValue(); 5991 } 5992 5993 // check if an VEXT instruction can handle the shuffle mask when the 5994 // vector sources of the shuffle are the same. 5995 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 5996 unsigned NumElts = VT.getVectorNumElements(); 5997 5998 // Assume that the first shuffle index is not UNDEF. Fail if it is. 5999 if (M[0] < 0) 6000 return false; 6001 6002 Imm = M[0]; 6003 6004 // If this is a VEXT shuffle, the immediate value is the index of the first 6005 // element. The other shuffle indices must be the successive elements after 6006 // the first one. 6007 unsigned ExpectedElt = Imm; 6008 for (unsigned i = 1; i < NumElts; ++i) { 6009 // Increment the expected index. If it wraps around, just follow it 6010 // back to index zero and keep going. 
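    // e.g. for a v8i8 shuffle of a vector with itself, the mask
    // <3, 4, 5, 6, 7, 0, 1, 2> is accepted here with Imm == 3.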
6011 ++ExpectedElt; 6012 if (ExpectedElt == NumElts) 6013 ExpectedElt = 0; 6014 6015 if (M[i] < 0) continue; // ignore UNDEF indices 6016 if (ExpectedElt != static_cast<unsigned>(M[i])) 6017 return false; 6018 } 6019 6020 return true; 6021 } 6022 6023 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6024 bool &ReverseVEXT, unsigned &Imm) { 6025 unsigned NumElts = VT.getVectorNumElements(); 6026 ReverseVEXT = false; 6027 6028 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6029 if (M[0] < 0) 6030 return false; 6031 6032 Imm = M[0]; 6033 6034 // If this is a VEXT shuffle, the immediate value is the index of the first 6035 // element. The other shuffle indices must be the successive elements after 6036 // the first one. 6037 unsigned ExpectedElt = Imm; 6038 for (unsigned i = 1; i < NumElts; ++i) { 6039 // Increment the expected index. If it wraps around, it may still be 6040 // a VEXT but the source vectors must be swapped. 6041 ExpectedElt += 1; 6042 if (ExpectedElt == NumElts * 2) { 6043 ExpectedElt = 0; 6044 ReverseVEXT = true; 6045 } 6046 6047 if (M[i] < 0) continue; // ignore UNDEF indices 6048 if (ExpectedElt != static_cast<unsigned>(M[i])) 6049 return false; 6050 } 6051 6052 // Adjust the index value if the source operands will be swapped. 6053 if (ReverseVEXT) 6054 Imm -= NumElts; 6055 6056 return true; 6057 } 6058 6059 /// isVREVMask - Check if a vector shuffle corresponds to a VREV 6060 /// instruction with the specified blocksize. (The order of the elements 6061 /// within each block of the vector is reversed.) 6062 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 6063 assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && 6064 "Only possible block sizes for VREV are: 16, 32, 64"); 6065 6066 unsigned EltSz = VT.getScalarSizeInBits(); 6067 if (EltSz == 64) 6068 return false; 6069 6070 unsigned NumElts = VT.getVectorNumElements(); 6071 unsigned BlockElts = M[0] + 1; 6072 // If the first shuffle index is UNDEF, be optimistic. 6073 if (M[0] < 0) 6074 BlockElts = BlockSize / EltSz; 6075 6076 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 6077 return false; 6078 6079 for (unsigned i = 0; i < NumElts; ++i) { 6080 if (M[i] < 0) continue; // ignore UNDEF indices 6081 if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) 6082 return false; 6083 } 6084 6085 return true; 6086 } 6087 6088 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6089 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6090 // range, then 0 is placed into the resulting vector. So pretty much any mask 6091 // of 8 elements can work here. 6092 return VT == MVT::v8i8 && M.size() == 8; 6093 } 6094 6095 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6096 unsigned Index) { 6097 if (Mask.size() == Elements * 2) 6098 return Index / Elements; 6099 return Mask[Index] == 0 ? 0 : 1; 6100 } 6101 6102 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6103 // checking that pairs of elements in the shuffle mask represent the same index 6104 // in each vector, incrementing the expected index by 2 at each step. 6105 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6106 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6107 // v2={e,f,g,h} 6108 // WhichResult gives the offset for each element in the mask based on which 6109 // of the two results it belongs to. 
//
// The transpose can be represented either as:
// result1 = shufflevector v1, v2, result1_shuffle_mask
// result2 = shufflevector v1, v2, result2_shuffle_mask
// where v1/v2 and the shuffle masks have the same number of elements
// (here WhichResult (see below) indicates which result is being checked)
//
// or as:
// results = shufflevector v1, v2, shuffle_mask
// where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (here WhichResult will always be 0 if true); in
// this case we check the low half and the high half of the shuffle mask as if
// each were a mask of the first form.
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  // If the mask is twice as long as the input vector then we need to check the
  // upper and lower parts of the mask with a matching value for WhichResult.
  // FIXME: A mask with only even values will be rejected in case the first
  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  // M[0] is used to determine WhichResult
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
        return false;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  return true;
}

// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
// that the mask elements are either all even and in steps of size 2 or all odd
// and in steps of size 2.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
// v2={e,f,g,h}
// Requires the same checks as isVTRNMask with respect to how the results are
// returned.
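// (WhichResult == 1 selects the odd elements instead, e.g. the mask
// [1, 3, 5, 7] for the v4i32 example above yields x={b,d,f,h}.)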
6187 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6188 unsigned EltSz = VT.getScalarSizeInBits(); 6189 if (EltSz == 64) 6190 return false; 6191 6192 unsigned NumElts = VT.getVectorNumElements(); 6193 if (M.size() != NumElts && M.size() != NumElts*2) 6194 return false; 6195 6196 for (unsigned i = 0; i < M.size(); i += NumElts) { 6197 WhichResult = SelectPairHalf(NumElts, M, i); 6198 for (unsigned j = 0; j < NumElts; ++j) { 6199 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 6200 return false; 6201 } 6202 } 6203 6204 if (M.size() == NumElts*2) 6205 WhichResult = 0; 6206 6207 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6208 if (VT.is64BitVector() && EltSz == 32) 6209 return false; 6210 6211 return true; 6212 } 6213 6214 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 6215 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6216 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6217 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 6218 unsigned EltSz = VT.getScalarSizeInBits(); 6219 if (EltSz == 64) 6220 return false; 6221 6222 unsigned NumElts = VT.getVectorNumElements(); 6223 if (M.size() != NumElts && M.size() != NumElts*2) 6224 return false; 6225 6226 unsigned Half = NumElts / 2; 6227 for (unsigned i = 0; i < M.size(); i += NumElts) { 6228 WhichResult = SelectPairHalf(NumElts, M, i); 6229 for (unsigned j = 0; j < NumElts; j += Half) { 6230 unsigned Idx = WhichResult; 6231 for (unsigned k = 0; k < Half; ++k) { 6232 int MIdx = M[i + j + k]; 6233 if (MIdx >= 0 && (unsigned) MIdx != Idx) 6234 return false; 6235 Idx += 2; 6236 } 6237 } 6238 } 6239 6240 if (M.size() == NumElts*2) 6241 WhichResult = 0; 6242 6243 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6244 if (VT.is64BitVector() && EltSz == 32) 6245 return false; 6246 6247 return true; 6248 } 6249 6250 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 6251 // that pairs of elements of the shufflemask represent the same index in each 6252 // vector incrementing sequentially through the vectors. 6253 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 6254 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 6255 // v2={e,f,g,h} 6256 // Requires similar checks to that of isVTRNMask with respect the how results 6257 // are returned. 6258 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6259 unsigned EltSz = VT.getScalarSizeInBits(); 6260 if (EltSz == 64) 6261 return false; 6262 6263 unsigned NumElts = VT.getVectorNumElements(); 6264 if (M.size() != NumElts && M.size() != NumElts*2) 6265 return false; 6266 6267 for (unsigned i = 0; i < M.size(); i += NumElts) { 6268 WhichResult = SelectPairHalf(NumElts, M, i); 6269 unsigned Idx = WhichResult * NumElts / 2; 6270 for (unsigned j = 0; j < NumElts; j += 2) { 6271 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 6272 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 6273 return false; 6274 Idx += 1; 6275 } 6276 } 6277 6278 if (M.size() == NumElts*2) 6279 WhichResult = 0; 6280 6281 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 6282 if (VT.is64BitVector() && EltSz == 32) 6283 return false; 6284 6285 return true; 6286 } 6287 6288 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 6289 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}

/// \return true if this is a reverse operation on a vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = cast<ConstantSDNode>(N)->getZExtValue();

  if (ST->isThumb1Only()) {
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
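// The strategies below are tried roughly in order: an immediate VMOV/VMVN (or
// vmov.f32) for constant splats, VDUP/VDUPLANE when a single value dominates
// the lanes, the constant pool for remaining all-constant vectors,
// reconstructing a shuffle from extracted elements, splitting a 128-bit vector
// into two 64-bit halves, an ARMISD::BUILD_VECTOR for 32/64-bit elements, and
// finally a chain of INSERT_VECTOR_ELTs.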
6382 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 6383 const ARMSubtarget *ST) const { 6384 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 6385 SDLoc dl(Op); 6386 EVT VT = Op.getValueType(); 6387 6388 APInt SplatBits, SplatUndef; 6389 unsigned SplatBitSize; 6390 bool HasAnyUndefs; 6391 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 6392 if (SplatUndef.isAllOnesValue()) 6393 return DAG.getUNDEF(VT); 6394 6395 if (SplatBitSize <= 64) { 6396 // Check if an immediate VMOV works. 6397 EVT VmovVT; 6398 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 6399 SplatUndef.getZExtValue(), SplatBitSize, 6400 DAG, dl, VmovVT, VT.is128BitVector(), 6401 VMOVModImm); 6402 if (Val.getNode()) { 6403 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 6404 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6405 } 6406 6407 // Try an immediate VMVN. 6408 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 6409 Val = isNEONModifiedImm(NegatedImm, 6410 SplatUndef.getZExtValue(), SplatBitSize, 6411 DAG, dl, VmovVT, VT.is128BitVector(), 6412 VMVNModImm); 6413 if (Val.getNode()) { 6414 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 6415 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6416 } 6417 6418 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 6419 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 6420 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 6421 if (ImmVal != -1) { 6422 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 6423 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 6424 } 6425 } 6426 } 6427 } 6428 6429 // Scan through the operands to see if only one value is used. 6430 // 6431 // As an optimisation, even if more than one value is used it may be more 6432 // profitable to splat with one value then change some lanes. 6433 // 6434 // Heuristically we decide to do this if the vector has a "dominant" value, 6435 // defined as splatted to more than half of the lanes. 6436 unsigned NumElts = VT.getVectorNumElements(); 6437 bool isOnlyLowElement = true; 6438 bool usesOnlyOneValue = true; 6439 bool hasDominantValue = false; 6440 bool isConstant = true; 6441 6442 // Map of the number of times a particular SDValue appears in the 6443 // element list. 6444 DenseMap<SDValue, unsigned> ValueCounts; 6445 SDValue Value; 6446 for (unsigned i = 0; i < NumElts; ++i) { 6447 SDValue V = Op.getOperand(i); 6448 if (V.isUndef()) 6449 continue; 6450 if (i > 0) 6451 isOnlyLowElement = false; 6452 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 6453 isConstant = false; 6454 6455 ValueCounts.insert(std::make_pair(V, 0)); 6456 unsigned &Count = ValueCounts[V]; 6457 6458 // Is this value dominant? (takes up more than half of the lanes) 6459 if (++Count > (NumElts / 2)) { 6460 hasDominantValue = true; 6461 Value = V; 6462 } 6463 } 6464 if (ValueCounts.size() != 1) 6465 usesOnlyOneValue = false; 6466 if (!Value.getNode() && !ValueCounts.empty()) 6467 Value = ValueCounts.begin()->first; 6468 6469 if (ValueCounts.empty()) 6470 return DAG.getUNDEF(VT); 6471 6472 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 6473 // Keep going if we are hitting this case. 6474 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 6475 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 6476 6477 unsigned EltSize = VT.getScalarSizeInBits(); 6478 6479 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 6480 // i32 and try again. 
6481 if (hasDominantValue && EltSize <= 32) { 6482 if (!isConstant) { 6483 SDValue N; 6484 6485 // If we are VDUPing a value that comes directly from a vector, that will 6486 // cause an unnecessary move to and from a GPR, where instead we could 6487 // just use VDUPLANE. We can only do this if the lane being extracted 6488 // is at a constant index, as the VDUP from lane instructions only have 6489 // constant-index forms. 6490 ConstantSDNode *constIndex; 6491 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6492 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 6493 // We need to create a new undef vector to use for the VDUPLANE if the 6494 // size of the vector from which we get the value is different than the 6495 // size of the vector that we need to create. We will insert the element 6496 // such that the register coalescer will remove unnecessary copies. 6497 if (VT != Value->getOperand(0).getValueType()) { 6498 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 6499 VT.getVectorNumElements(); 6500 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6501 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 6502 Value, DAG.getConstant(index, dl, MVT::i32)), 6503 DAG.getConstant(index, dl, MVT::i32)); 6504 } else 6505 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6506 Value->getOperand(0), Value->getOperand(1)); 6507 } else 6508 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 6509 6510 if (!usesOnlyOneValue) { 6511 // The dominant value was splatted as 'N', but we now have to insert 6512 // all differing elements. 6513 for (unsigned I = 0; I < NumElts; ++I) { 6514 if (Op.getOperand(I) == Value) 6515 continue; 6516 SmallVector<SDValue, 3> Ops; 6517 Ops.push_back(N); 6518 Ops.push_back(Op.getOperand(I)); 6519 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 6520 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 6521 } 6522 } 6523 return N; 6524 } 6525 if (VT.getVectorElementType().isFloatingPoint()) { 6526 SmallVector<SDValue, 8> Ops; 6527 for (unsigned i = 0; i < NumElts; ++i) 6528 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, 6529 Op.getOperand(i))); 6530 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 6531 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 6532 Val = LowerBUILD_VECTOR(Val, DAG, ST); 6533 if (Val.getNode()) 6534 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6535 } 6536 if (usesOnlyOneValue) { 6537 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 6538 if (isConstant && Val.getNode()) 6539 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 6540 } 6541 } 6542 6543 // If all elements are constants and the case above didn't get hit, fall back 6544 // to the default expansion, which will generate a load from the constant 6545 // pool. 6546 if (isConstant) 6547 return SDValue(); 6548 6549 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 6550 if (NumElts >= 4) { 6551 SDValue shuffle = ReconstructShuffle(Op, DAG); 6552 if (shuffle != SDValue()) 6553 return shuffle; 6554 } 6555 6556 if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 6557 // If we haven't found an efficient lowering, try splitting a 128-bit vector 6558 // into two 64-bit vectors; we might discover a better way to lower it. 
6559 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 6560 EVT ExtVT = VT.getVectorElementType(); 6561 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 6562 SDValue Lower = 6563 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 6564 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 6565 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 6566 SDValue Upper = DAG.getBuildVector( 6567 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 6568 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 6569 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 6570 if (Lower && Upper) 6571 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 6572 } 6573 6574 // Vectors with 32- or 64-bit elements can be built by directly assigning 6575 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 6576 // will be legalized. 6577 if (EltSize >= 32) { 6578 // Do the expansion with floating-point types, since that is what the VFP 6579 // registers are defined to use, and since i64 is not legal. 6580 EVT EltVT = EVT::getFloatingPointVT(EltSize); 6581 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 6582 SmallVector<SDValue, 8> Ops; 6583 for (unsigned i = 0; i < NumElts; ++i) 6584 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 6585 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 6586 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 6587 } 6588 6589 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 6590 // know the default expansion would otherwise fall back on something even 6591 // worse. For a vector with one or two non-undef values, that's 6592 // scalar_to_vector for the elements followed by a shuffle (provided the 6593 // shuffle is valid for the target) and materialization element by element 6594 // on the stack followed by a load for everything else. 6595 if (!isConstant && !usesOnlyOneValue) { 6596 SDValue Vec = DAG.getUNDEF(VT); 6597 for (unsigned i = 0 ; i < NumElts; ++i) { 6598 SDValue V = Op.getOperand(i); 6599 if (V.isUndef()) 6600 continue; 6601 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 6602 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 6603 } 6604 return Vec; 6605 } 6606 6607 return SDValue(); 6608 } 6609 6610 // Gather data to see if the operation can be modelled as a 6611 // shuffle in combination with VEXTs. 6612 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 6613 SelectionDAG &DAG) const { 6614 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 6615 SDLoc dl(Op); 6616 EVT VT = Op.getValueType(); 6617 unsigned NumElts = VT.getVectorNumElements(); 6618 6619 struct ShuffleSourceInfo { 6620 SDValue Vec; 6621 unsigned MinElt = std::numeric_limits<unsigned>::max(); 6622 unsigned MaxElt = 0; 6623 6624 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 6625 // be compatible with the shuffle we intend to construct. As a result 6626 // ShuffleVec will be some sliding window into the original Vec. 6627 SDValue ShuffleVec; 6628 6629 // Code should guarantee that element i in Vec starts at element "WindowBase 6630 // + i * WindowScale in ShuffleVec". 6631 int WindowBase = 0; 6632 int WindowScale = 1; 6633 6634 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 6635 6636 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6637 }; 6638 6639 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6640 // node. 
6641 SmallVector<ShuffleSourceInfo, 2> Sources; 6642 for (unsigned i = 0; i < NumElts; ++i) { 6643 SDValue V = Op.getOperand(i); 6644 if (V.isUndef()) 6645 continue; 6646 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 6647 // A shuffle can only come from building a vector from various 6648 // elements of other vectors. 6649 return SDValue(); 6650 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 6651 // Furthermore, shuffles require a constant mask, whereas extractelts 6652 // accept variable indices. 6653 return SDValue(); 6654 } 6655 6656 // Add this element source to the list if it's not already there. 6657 SDValue SourceVec = V.getOperand(0); 6658 auto Source = llvm::find(Sources, SourceVec); 6659 if (Source == Sources.end()) 6660 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 6661 6662 // Update the minimum and maximum lane number seen. 6663 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 6664 Source->MinElt = std::min(Source->MinElt, EltNo); 6665 Source->MaxElt = std::max(Source->MaxElt, EltNo); 6666 } 6667 6668 // Currently only do something sane when at most two source vectors 6669 // are involved. 6670 if (Sources.size() > 2) 6671 return SDValue(); 6672 6673 // Find out the smallest element size among result and two sources, and use 6674 // it as element size to build the shuffle_vector. 6675 EVT SmallestEltTy = VT.getVectorElementType(); 6676 for (auto &Source : Sources) { 6677 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 6678 if (SrcEltTy.bitsLT(SmallestEltTy)) 6679 SmallestEltTy = SrcEltTy; 6680 } 6681 unsigned ResMultiplier = 6682 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 6683 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6684 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 6685 6686 // If the source vector is too wide or too narrow, we may nevertheless be able 6687 // to construct a compatible shuffle either by concatenating it with UNDEF or 6688 // extracting a suitable range of elements. 6689 for (auto &Src : Sources) { 6690 EVT SrcVT = Src.ShuffleVec.getValueType(); 6691 6692 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 6693 continue; 6694 6695 // This stage of the search produces a source with the same element type as 6696 // the original, but with a total width matching the BUILD_VECTOR output. 6697 EVT EltVT = SrcVT.getVectorElementType(); 6698 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 6699 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 6700 6701 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 6702 if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) 6703 return SDValue(); 6704 // We can pad out the smaller vector for free, so if it's part of a 6705 // shuffle... 
6706 Src.ShuffleVec = 6707 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 6708 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 6709 continue; 6710 } 6711 6712 if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) 6713 return SDValue(); 6714 6715 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 6716 // Span too large for a VEXT to cope 6717 return SDValue(); 6718 } 6719 6720 if (Src.MinElt >= NumSrcElts) { 6721 // The extraction can just take the second half 6722 Src.ShuffleVec = 6723 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6724 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6725 Src.WindowBase = -NumSrcElts; 6726 } else if (Src.MaxElt < NumSrcElts) { 6727 // The extraction can just take the first half 6728 Src.ShuffleVec = 6729 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6730 DAG.getConstant(0, dl, MVT::i32)); 6731 } else { 6732 // An actual VEXT is needed 6733 SDValue VEXTSrc1 = 6734 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6735 DAG.getConstant(0, dl, MVT::i32)); 6736 SDValue VEXTSrc2 = 6737 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6738 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 6739 6740 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 6741 VEXTSrc2, 6742 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 6743 Src.WindowBase = -Src.MinElt; 6744 } 6745 } 6746 6747 // Another possible incompatibility occurs from the vector element types. We 6748 // can fix this by bitcasting the source vectors to the same type we intend 6749 // for the shuffle. 6750 for (auto &Src : Sources) { 6751 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 6752 if (SrcEltTy == SmallestEltTy) 6753 continue; 6754 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6755 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6756 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6757 Src.WindowBase *= Src.WindowScale; 6758 } 6759 6760 // Final sanity check before we try to actually produce a shuffle. 6761 LLVM_DEBUG(for (auto Src 6762 : Sources) 6763 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 6764 6765 // The stars all align, our next step is to produce the mask for the shuffle. 6766 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6767 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 6768 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6769 SDValue Entry = Op.getOperand(i); 6770 if (Entry.isUndef()) 6771 continue; 6772 6773 auto Src = llvm::find(Sources, Entry.getOperand(0)); 6774 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6775 6776 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6777 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6778 // segment. 6779 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6780 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 6781 VT.getScalarSizeInBits()); 6782 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6783 6784 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6785 // starting at the appropriate offset. 6786 int *LaneMask = &Mask[i * ResMultiplier]; 6787 6788 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6789 ExtractBase += NumElts * (Src - Sources.begin()); 6790 for (int j = 0; j < LanesDefined; ++j) 6791 LaneMask[j] = ExtractBase + j; 6792 } 6793 6794 // Final check before we try to produce nonsense... 
6795 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 6796 return SDValue(); 6797 6798 // We can't handle more than two sources. This should have already 6799 // been checked before this point. 6800 assert(Sources.size() <= 2 && "Too many sources!"); 6801 6802 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6803 for (unsigned i = 0; i < Sources.size(); ++i) 6804 ShuffleOps[i] = Sources[i].ShuffleVec; 6805 6806 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6807 ShuffleOps[1], Mask); 6808 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6809 } 6810 6811 /// isShuffleMaskLegal - Targets can use this to indicate that they only 6812 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6813 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6814 /// are assumed to be legal. 6815 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 6816 if (VT.getVectorNumElements() == 4 && 6817 (VT.is128BitVector() || VT.is64BitVector())) { 6818 unsigned PFIndexes[4]; 6819 for (unsigned i = 0; i != 4; ++i) { 6820 if (M[i] < 0) 6821 PFIndexes[i] = 8; 6822 else 6823 PFIndexes[i] = M[i]; 6824 } 6825 6826 // Compute the index in the perfect shuffle table. 6827 unsigned PFTableIndex = 6828 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 6829 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6830 unsigned Cost = (PFEntry >> 30); 6831 6832 if (Cost <= 4) 6833 return true; 6834 } 6835 6836 bool ReverseVEXT, isV_UNDEF; 6837 unsigned Imm, WhichResult; 6838 6839 unsigned EltSize = VT.getScalarSizeInBits(); 6840 return (EltSize >= 32 || 6841 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6842 isVREVMask(M, VT, 64) || 6843 isVREVMask(M, VT, 32) || 6844 isVREVMask(M, VT, 16) || 6845 isVEXTMask(M, VT, ReverseVEXT, Imm) || 6846 isVTBLMask(M, VT) || 6847 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || 6848 ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); 6849 } 6850 6851 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6852 /// the specified operations to build the shuffle. 6853 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6854 SDValue RHS, SelectionDAG &DAG, 6855 const SDLoc &dl) { 6856 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6857 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 6858 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 6859 6860 enum { 6861 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6862 OP_VREV, 6863 OP_VDUP0, 6864 OP_VDUP1, 6865 OP_VDUP2, 6866 OP_VDUP3, 6867 OP_VEXT1, 6868 OP_VEXT2, 6869 OP_VEXT3, 6870 OP_VUZPL, // VUZP, left result 6871 OP_VUZPR, // VUZP, right result 6872 OP_VZIPL, // VZIP, left result 6873 OP_VZIPR, // VZIP, right result 6874 OP_VTRNL, // VTRN, left result 6875 OP_VTRNR // VTRN, right result 6876 }; 6877 6878 if (OpNum == OP_COPY) { 6879 if (LHSID == (1*9+2)*9+3) return LHS; 6880 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 6881 return RHS; 6882 } 6883 6884 SDValue OpLHS, OpRHS; 6885 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6886 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 6887 EVT VT = OpLHS.getValueType(); 6888 6889 switch (OpNum) { 6890 default: llvm_unreachable("Unknown shuffle opcode!"); 6891 case OP_VREV: 6892 // VREV divides the vector in half and swaps within the half. 
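    // e.g. VREV64.32 on a v4i32 {a,b,c,d} produces {b,a,d,c}.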
6893 if (VT.getVectorElementType() == MVT::i32 || 6894 VT.getVectorElementType() == MVT::f32) 6895 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 6896 // vrev <4 x i16> -> VREV32 6897 if (VT.getVectorElementType() == MVT::i16) 6898 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 6899 // vrev <4 x i8> -> VREV16 6900 assert(VT.getVectorElementType() == MVT::i8); 6901 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 6902 case OP_VDUP0: 6903 case OP_VDUP1: 6904 case OP_VDUP2: 6905 case OP_VDUP3: 6906 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 6907 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 6908 case OP_VEXT1: 6909 case OP_VEXT2: 6910 case OP_VEXT3: 6911 return DAG.getNode(ARMISD::VEXT, dl, VT, 6912 OpLHS, OpRHS, 6913 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 6914 case OP_VUZPL: 6915 case OP_VUZPR: 6916 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 6917 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 6918 case OP_VZIPL: 6919 case OP_VZIPR: 6920 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 6921 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); 6922 case OP_VTRNL: 6923 case OP_VTRNR: 6924 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 6925 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 6926 } 6927 } 6928 6929 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 6930 ArrayRef<int> ShuffleMask, 6931 SelectionDAG &DAG) { 6932 // Check to see if we can use the VTBL instruction. 6933 SDValue V1 = Op.getOperand(0); 6934 SDValue V2 = Op.getOperand(1); 6935 SDLoc DL(Op); 6936 6937 SmallVector<SDValue, 8> VTBLMask; 6938 for (ArrayRef<int>::iterator 6939 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 6940 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 6941 6942 if (V2.getNode()->isUndef()) 6943 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 6944 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6945 6946 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 6947 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 6948 } 6949 6950 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 6951 SelectionDAG &DAG) { 6952 SDLoc DL(Op); 6953 SDValue OpLHS = Op.getOperand(0); 6954 EVT VT = OpLHS.getValueType(); 6955 6956 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 6957 "Expect an v8i16/v16i8 type"); 6958 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 6959 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 6960 // extract the first 8 bytes into the top double word and the last 8 bytes 6961 // into the bottom double word. The v8i16 case is similar. 6962 unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; 6963 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 6964 DAG.getConstant(ExtractNum, DL, MVT::i32)); 6965 } 6966 6967 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 6968 SDValue V1 = Op.getOperand(0); 6969 SDValue V2 = Op.getOperand(1); 6970 SDLoc dl(Op); 6971 EVT VT = Op.getValueType(); 6972 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 6973 6974 // Convert shuffles that are directly supported on NEON to target-specific 6975 // DAG nodes, instead of keeping them as shuffles and matching them again 6976 // during code selection. This is more efficient and avoids the possibility 6977 // of inconsistencies between legalization and selection. 6978 // FIXME: floating-point vectors should be canonicalized to integer vectors 6979 // of the same time so that they get CSEd properly. 
6980 ArrayRef<int> ShuffleMask = SVN->getMask(); 6981 6982 unsigned EltSize = VT.getScalarSizeInBits(); 6983 if (EltSize <= 32) { 6984 if (SVN->isSplat()) { 6985 int Lane = SVN->getSplatIndex(); 6986 // If this is undef splat, generate it via "just" vdup, if possible. 6987 if (Lane == -1) Lane = 0; 6988 6989 // Test if V1 is a SCALAR_TO_VECTOR. 6990 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 6991 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 6992 } 6993 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR 6994 // (and probably will turn into a SCALAR_TO_VECTOR once legalization 6995 // reaches it). 6996 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && 6997 !isa<ConstantSDNode>(V1.getOperand(0))) { 6998 bool IsScalarToVector = true; 6999 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) 7000 if (!V1.getOperand(i).isUndef()) { 7001 IsScalarToVector = false; 7002 break; 7003 } 7004 if (IsScalarToVector) 7005 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); 7006 } 7007 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, 7008 DAG.getConstant(Lane, dl, MVT::i32)); 7009 } 7010 7011 bool ReverseVEXT; 7012 unsigned Imm; 7013 if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { 7014 if (ReverseVEXT) 7015 std::swap(V1, V2); 7016 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, 7017 DAG.getConstant(Imm, dl, MVT::i32)); 7018 } 7019 7020 if (isVREVMask(ShuffleMask, VT, 64)) 7021 return DAG.getNode(ARMISD::VREV64, dl, VT, V1); 7022 if (isVREVMask(ShuffleMask, VT, 32)) 7023 return DAG.getNode(ARMISD::VREV32, dl, VT, V1); 7024 if (isVREVMask(ShuffleMask, VT, 16)) 7025 return DAG.getNode(ARMISD::VREV16, dl, VT, V1); 7026 7027 if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { 7028 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, 7029 DAG.getConstant(Imm, dl, MVT::i32)); 7030 } 7031 7032 // Check for Neon shuffles that modify both input vectors in place. 7033 // If both results are used, i.e., if there are two shuffles with the same 7034 // source operands and with masks corresponding to both results of one of 7035 // these operations, DAG memoization will ensure that a single node is 7036 // used for both shuffles. 7037 unsigned WhichResult; 7038 bool isV_UNDEF; 7039 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7040 ShuffleMask, VT, WhichResult, isV_UNDEF)) { 7041 if (isV_UNDEF) 7042 V2 = V1; 7043 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) 7044 .getValue(WhichResult); 7045 } 7046 7047 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize 7048 // shuffles that produce a result larger than their operands with: 7049 // shuffle(concat(v1, undef), concat(v2, undef)) 7050 // -> 7051 // shuffle(concat(v1, v2), undef) 7052 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). 7053 // 7054 // This is useful in the general case, but there are special cases where 7055 // native shuffles produce larger results: the two-result ops. 7056 // 7057 // Look through the concat when lowering them: 7058 // shuffle(concat(v1, v2), undef) 7059 // -> 7060 // concat(VZIP(v1, v2):0, :1) 7061 // 7062 if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { 7063 SDValue SubV1 = V1->getOperand(0); 7064 SDValue SubV2 = V1->getOperand(1); 7065 EVT SubVT = SubV1.getValueType(); 7066 7067 // We expect these to have been canonicalized to -1. 
7068 assert(llvm::all_of(ShuffleMask, [&](int i) { 7069 return i < (int)VT.getVectorNumElements(); 7070 }) && "Unexpected shuffle index into UNDEF operand!"); 7071 7072 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( 7073 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { 7074 if (isV_UNDEF) 7075 SubV2 = SubV1; 7076 assert((WhichResult == 0) && 7077 "In-place shuffle of concat can only have one result!"); 7078 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), 7079 SubV1, SubV2); 7080 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), 7081 Res.getValue(1)); 7082 } 7083 } 7084 } 7085 7086 // If the shuffle is not directly supported and it has 4 elements, use 7087 // the PerfectShuffle-generated table to synthesize it from other shuffles. 7088 unsigned NumElts = VT.getVectorNumElements(); 7089 if (NumElts == 4) { 7090 unsigned PFIndexes[4]; 7091 for (unsigned i = 0; i != 4; ++i) { 7092 if (ShuffleMask[i] < 0) 7093 PFIndexes[i] = 8; 7094 else 7095 PFIndexes[i] = ShuffleMask[i]; 7096 } 7097 7098 // Compute the index in the perfect shuffle table. 7099 unsigned PFTableIndex = 7100 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7101 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7102 unsigned Cost = (PFEntry >> 30); 7103 7104 if (Cost <= 4) 7105 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 7106 } 7107 7108 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. 7109 if (EltSize >= 32) { 7110 // Do the expansion with floating-point types, since that is what the VFP 7111 // registers are defined to use, and since i64 is not legal. 7112 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7113 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7114 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); 7115 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); 7116 SmallVector<SDValue, 8> Ops; 7117 for (unsigned i = 0; i < NumElts; ++i) { 7118 if (ShuffleMask[i] < 0) 7119 Ops.push_back(DAG.getUNDEF(EltVT)); 7120 else 7121 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 7122 ShuffleMask[i] < (int)NumElts ? V1 : V2, 7123 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 7124 dl, MVT::i32))); 7125 } 7126 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7127 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7128 } 7129 7130 if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 7131 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 7132 7133 if (VT == MVT::v8i8) 7134 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 7135 return NewOp; 7136 7137 return SDValue(); 7138 } 7139 7140 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7141 // INSERT_VECTOR_ELT is legal only for immediate indexes. 7142 SDValue Lane = Op.getOperand(2); 7143 if (!isa<ConstantSDNode>(Lane)) 7144 return SDValue(); 7145 7146 return Op; 7147 } 7148 7149 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 7150 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 
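  // (A non-constant lane index is left to the default expansion, which
  // typically goes through a stack temporary.)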
7151 SDValue Lane = Op.getOperand(1); 7152 if (!isa<ConstantSDNode>(Lane)) 7153 return SDValue(); 7154 7155 SDValue Vec = Op.getOperand(0); 7156 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 7157 SDLoc dl(Op); 7158 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 7159 } 7160 7161 return Op; 7162 } 7163 7164 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 7165 // The only time a CONCAT_VECTORS operation can have legal types is when 7166 // two 64-bit vectors are concatenated to a 128-bit vector. 7167 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 7168 "unexpected CONCAT_VECTORS"); 7169 SDLoc dl(Op); 7170 SDValue Val = DAG.getUNDEF(MVT::v2f64); 7171 SDValue Op0 = Op.getOperand(0); 7172 SDValue Op1 = Op.getOperand(1); 7173 if (!Op0.isUndef()) 7174 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7175 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 7176 DAG.getIntPtrConstant(0, dl)); 7177 if (!Op1.isUndef()) 7178 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 7179 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 7180 DAG.getIntPtrConstant(1, dl)); 7181 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 7182 } 7183 7184 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 7185 /// element has been zero/sign-extended, depending on the isSigned parameter, 7186 /// from an integer type half its size. 7187 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 7188 bool isSigned) { 7189 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 7190 EVT VT = N->getValueType(0); 7191 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 7192 SDNode *BVN = N->getOperand(0).getNode(); 7193 if (BVN->getValueType(0) != MVT::v4i32 || 7194 BVN->getOpcode() != ISD::BUILD_VECTOR) 7195 return false; 7196 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 7197 unsigned HiElt = 1 - LoElt; 7198 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 7199 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 7200 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 7201 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 7202 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 7203 return false; 7204 if (isSigned) { 7205 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 7206 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 7207 return true; 7208 } else { 7209 if (Hi0->isNullValue() && Hi1->isNullValue()) 7210 return true; 7211 } 7212 return false; 7213 } 7214 7215 if (N->getOpcode() != ISD::BUILD_VECTOR) 7216 return false; 7217 7218 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 7219 SDNode *Elt = N->getOperand(i).getNode(); 7220 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 7221 unsigned EltSize = VT.getScalarSizeInBits(); 7222 unsigned HalfSize = EltSize / 2; 7223 if (isSigned) { 7224 if (!isIntN(HalfSize, C->getSExtValue())) 7225 return false; 7226 } else { 7227 if (!isUIntN(HalfSize, C->getZExtValue())) 7228 return false; 7229 } 7230 continue; 7231 } 7232 return false; 7233 } 7234 7235 return true; 7236 } 7237 7238 /// isSignExtended - Check if a node is a vector value that is sign-extended 7239 /// or a constant BUILD_VECTOR with sign-extended elements. 
7240 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
7241 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
7242 return true;
7243 if (isExtendedBUILD_VECTOR(N, DAG, true))
7244 return true;
7245 return false;
7246 }
7247
7248 /// isZeroExtended - Check if a node is a vector value that is zero-extended
7249 /// or a constant BUILD_VECTOR with zero-extended elements.
7250 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
7251 if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
7252 return true;
7253 if (isExtendedBUILD_VECTOR(N, DAG, false))
7254 return true;
7255 return false;
7256 }
7257
7258 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
7259 if (OrigVT.getSizeInBits() >= 64)
7260 return OrigVT;
7261
7262 assert(OrigVT.isSimple() && "Expecting a simple value type");
7263
7264 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
7265 switch (OrigSimpleTy) {
7266 default: llvm_unreachable("Unexpected Vector Type");
7267 case MVT::v2i8:
7268 case MVT::v2i16:
7269 return MVT::v2i32;
7270 case MVT::v4i8:
7271 return MVT::v4i16;
7272 }
7273 }
7274
7275 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
7276 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
7277 /// We insert the required extension here to get the vector to fill a D register.
7278 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
7279 const EVT &OrigTy,
7280 const EVT &ExtTy,
7281 unsigned ExtOpcode) {
7282 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
7283 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
7284 // 64-bits we need to insert a new extension so that it will be 64-bits.
7285 assert(ExtTy.is128BitVector() && "Unexpected extension size");
7286 if (OrigTy.getSizeInBits() >= 64)
7287 return N;
7288
7289 // Must extend size to at least 64 bits to be used as an operand for VMULL.
7290 EVT NewVT = getExtensionTo64Bits(OrigTy);
7291
7292 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
7293 }
7294
7295 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
7296 /// does not do any sign/zero extension. If the original vector is less
7297 /// than 64 bits, an appropriate extension will be added after the load to
7298 /// reach a total size of 64 bits. We have to add the extension separately
7299 /// because ARM does not have a sign/zero extending load for vectors.
7300 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
7301 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
7302
7303 // The load already has the right type.
7304 if (ExtendedTy == LD->getMemoryVT())
7305 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
7306 LD->getBasePtr(), LD->getPointerInfo(),
7307 LD->getAlignment(), LD->getMemOperand()->getFlags());
7308
7309 // We need to create a zextload/sextload. We cannot just create a load
7310 // followed by a sext/zext node because LowerMUL is also run during normal
7311 // operation legalization where we can't create illegal types.
7312 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
7313 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
7314 LD->getMemoryVT(), LD->getAlignment(),
7315 LD->getMemOperand()->getFlags());
7316 }
7317
7318 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
7319 /// extending load, or BUILD_VECTOR with extended elements, return the
7320 /// unextended value. The unextended vector should be 64 bits so that it can
7321 /// be used as an operand to a VMULL instruction. If the original vector size
7322 /// before extension is less than 64 bits, we add an extension to resize
7323 /// the vector to 64 bits.
7324 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
7325 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
7326 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
7327 N->getOperand(0)->getValueType(0),
7328 N->getValueType(0),
7329 N->getOpcode());
7330
7331 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
7332 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
7333 "Expected extending load");
7334
7335 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
7336 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
7337 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7338 SDValue extLoad =
7339 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
7340 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
7341
7342 return newLoad;
7343 }
7344
7345 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
7346 // have been legalized as a BITCAST from v4i32.
7347 if (N->getOpcode() == ISD::BITCAST) {
7348 SDNode *BVN = N->getOperand(0).getNode();
7349 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
7350 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
7351 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
7352 return DAG.getBuildVector(
7353 MVT::v2i32, SDLoc(N),
7354 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
7355 }
7356 // Construct a new BUILD_VECTOR with elements truncated to half the size.
7357 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
7358 EVT VT = N->getValueType(0);
7359 unsigned EltSize = VT.getScalarSizeInBits() / 2;
7360 unsigned NumElts = VT.getVectorNumElements();
7361 MVT TruncVT = MVT::getIntegerVT(EltSize);
7362 SmallVector<SDValue, 8> Ops;
7363 SDLoc dl(N);
7364 for (unsigned i = 0; i != NumElts; ++i) {
7365 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
7366 const APInt &CInt = C->getAPIntValue();
7367 // Element types smaller than 32 bits are not legal, so use i32 elements.
7368 // The values are implicitly truncated so sext vs. zext doesn't matter.
7369 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
7370 }
7371 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
7372 }
7373
7374 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
7375 unsigned Opcode = N->getOpcode();
7376 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7377 SDNode *N0 = N->getOperand(0).getNode();
7378 SDNode *N1 = N->getOperand(1).getNode();
7379 return N0->hasOneUse() && N1->hasOneUse() &&
7380 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
7381 }
7382 return false;
7383 }
7384
7385 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
7386 unsigned Opcode = N->getOpcode();
7387 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7388 SDNode *N0 = N->getOperand(0).getNode();
7389 SDNode *N1 = N->getOperand(1).getNode();
7390 return N0->hasOneUse() && N1->hasOneUse() &&
7391 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
7392 }
7393 return false;
7394 }
7395
7396 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
7397 // Multiplications are only custom-lowered for 128-bit vectors so that
7398 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
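// For example (illustrative): a v2i64 multiply of two sign-extended v2i32
// values,
//   v2i64 (mul (sext v2i32 a), (sext v2i32 b)),
// is matched below as ARMISD::VMULLs a, b, i.e. a single vmull.s32, instead
// of an illegal v2i64 multiply that would otherwise have to be expanded.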
7399 EVT VT = Op.getValueType(); 7400 assert(VT.is128BitVector() && VT.isInteger() && 7401 "unexpected type for custom-lowering ISD::MUL"); 7402 SDNode *N0 = Op.getOperand(0).getNode(); 7403 SDNode *N1 = Op.getOperand(1).getNode(); 7404 unsigned NewOpc = 0; 7405 bool isMLA = false; 7406 bool isN0SExt = isSignExtended(N0, DAG); 7407 bool isN1SExt = isSignExtended(N1, DAG); 7408 if (isN0SExt && isN1SExt) 7409 NewOpc = ARMISD::VMULLs; 7410 else { 7411 bool isN0ZExt = isZeroExtended(N0, DAG); 7412 bool isN1ZExt = isZeroExtended(N1, DAG); 7413 if (isN0ZExt && isN1ZExt) 7414 NewOpc = ARMISD::VMULLu; 7415 else if (isN1SExt || isN1ZExt) { 7416 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 7417 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 7418 if (isN1SExt && isAddSubSExt(N0, DAG)) { 7419 NewOpc = ARMISD::VMULLs; 7420 isMLA = true; 7421 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 7422 NewOpc = ARMISD::VMULLu; 7423 isMLA = true; 7424 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 7425 std::swap(N0, N1); 7426 NewOpc = ARMISD::VMULLu; 7427 isMLA = true; 7428 } 7429 } 7430 7431 if (!NewOpc) { 7432 if (VT == MVT::v2i64) 7433 // Fall through to expand this. It is not legal. 7434 return SDValue(); 7435 else 7436 // Other vector multiplications are legal. 7437 return Op; 7438 } 7439 } 7440 7441 // Legalize to a VMULL instruction. 7442 SDLoc DL(Op); 7443 SDValue Op0; 7444 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 7445 if (!isMLA) { 7446 Op0 = SkipExtensionForVMULL(N0, DAG); 7447 assert(Op0.getValueType().is64BitVector() && 7448 Op1.getValueType().is64BitVector() && 7449 "unexpected types for extended operands to VMULL"); 7450 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 7451 } 7452 7453 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 7454 // isel lowering to take advantage of no-stall back to back vmul + vmla. 7455 // vmull q0, d4, d6 7456 // vmlal q0, d5, d6 7457 // is faster than 7458 // vaddl q0, d4, d5 7459 // vmovl q1, d6 7460 // vmul q0, q0, q1 7461 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 7462 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 7463 EVT Op1VT = Op1.getValueType(); 7464 return DAG.getNode(N0->getOpcode(), DL, VT, 7465 DAG.getNode(NewOpc, DL, VT, 7466 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 7467 DAG.getNode(NewOpc, DL, VT, 7468 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 7469 } 7470 7471 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 7472 SelectionDAG &DAG) { 7473 // TODO: Should this propagate fast-math-flags? 7474 7475 // Convert to float 7476 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 7477 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 7478 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 7479 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 7480 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 7481 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 7482 // Get reciprocal estimate. 7483 // float4 recip = vrecpeq_f32(yf); 7484 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7485 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7486 Y); 7487 // Because char has a smaller range than uchar, we can actually get away 7488 // without any newton steps. This requires that we use a weird bias 7489 // of 0xb000, however (again, this has been exhaustively tested). 
7490 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 7491 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 7492 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 7493 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 7494 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 7495 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 7496 // Convert back to short. 7497 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 7498 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 7499 return X; 7500 } 7501 7502 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 7503 SelectionDAG &DAG) { 7504 // TODO: Should this propagate fast-math-flags? 7505 7506 SDValue N2; 7507 // Convert to float. 7508 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 7509 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 7510 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 7511 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 7512 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 7513 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 7514 7515 // Use reciprocal estimate and one refinement step. 7516 // float4 recip = vrecpeq_f32(yf); 7517 // recip *= vrecpsq_f32(yf, recip); 7518 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7519 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7520 N1); 7521 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7522 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7523 N1, N2); 7524 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7525 // Because short has a smaller range than ushort, we can actually get away 7526 // with only a single newton step. This requires that we use a weird bias 7527 // of 89, however (again, this has been exhaustively tested). 7528 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 7529 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 7530 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 7531 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 7532 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 7533 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 7534 // Convert back to integer and return. 
7535 // return vmovn_s32(vcvt_s32_f32(result)); 7536 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7537 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7538 return N0; 7539 } 7540 7541 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 7542 EVT VT = Op.getValueType(); 7543 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7544 "unexpected type for custom-lowering ISD::SDIV"); 7545 7546 SDLoc dl(Op); 7547 SDValue N0 = Op.getOperand(0); 7548 SDValue N1 = Op.getOperand(1); 7549 SDValue N2, N3; 7550 7551 if (VT == MVT::v8i8) { 7552 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 7553 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 7554 7555 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7556 DAG.getIntPtrConstant(4, dl)); 7557 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7558 DAG.getIntPtrConstant(4, dl)); 7559 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7560 DAG.getIntPtrConstant(0, dl)); 7561 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7562 DAG.getIntPtrConstant(0, dl)); 7563 7564 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 7565 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 7566 7567 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7568 N0 = LowerCONCAT_VECTORS(N0, DAG); 7569 7570 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 7571 return N0; 7572 } 7573 return LowerSDIV_v4i16(N0, N1, dl, DAG); 7574 } 7575 7576 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { 7577 // TODO: Should this propagate fast-math-flags? 7578 EVT VT = Op.getValueType(); 7579 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 7580 "unexpected type for custom-lowering ISD::UDIV"); 7581 7582 SDLoc dl(Op); 7583 SDValue N0 = Op.getOperand(0); 7584 SDValue N1 = Op.getOperand(1); 7585 SDValue N2, N3; 7586 7587 if (VT == MVT::v8i8) { 7588 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 7589 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 7590 7591 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7592 DAG.getIntPtrConstant(4, dl)); 7593 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7594 DAG.getIntPtrConstant(4, dl)); 7595 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 7596 DAG.getIntPtrConstant(0, dl)); 7597 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 7598 DAG.getIntPtrConstant(0, dl)); 7599 7600 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 7601 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 7602 7603 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 7604 N0 = LowerCONCAT_VECTORS(N0, DAG); 7605 7606 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 7607 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 7608 MVT::i32), 7609 N0); 7610 return N0; 7611 } 7612 7613 // v4i16 sdiv ... Convert to float. 7614 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 7615 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 7616 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 7617 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 7618 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 7619 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 7620 7621 // Use reciprocal estimate and two refinement steps. 
7622 // float4 recip = vrecpeq_f32(yf); 7623 // recip *= vrecpsq_f32(yf, recip); 7624 // recip *= vrecpsq_f32(yf, recip); 7625 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7626 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 7627 BN1); 7628 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7629 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7630 BN1, N2); 7631 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7632 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 7633 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 7634 BN1, N2); 7635 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 7636 // Simply multiplying by the reciprocal estimate can leave us a few ulps 7637 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 7638 // and that it will never cause us to return an answer too large). 7639 // float4 result = as_float4(as_int4(xf*recip) + 2); 7640 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 7641 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 7642 N1 = DAG.getConstant(2, dl, MVT::v4i32); 7643 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 7644 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 7645 // Convert back to integer and return. 7646 // return vmovn_u32(vcvt_s32_f32(result)); 7647 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 7648 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 7649 return N0; 7650 } 7651 7652 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 7653 SDNode *N = Op.getNode(); 7654 EVT VT = N->getValueType(0); 7655 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 7656 7657 SDValue Carry = Op.getOperand(2); 7658 7659 SDLoc DL(Op); 7660 7661 SDValue Result; 7662 if (Op.getOpcode() == ISD::ADDCARRY) { 7663 // This converts the boolean value carry into the carry flag. 7664 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 7665 7666 // Do the addition proper using the carry flag we wanted. 7667 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 7668 Op.getOperand(1), Carry); 7669 7670 // Now convert the carry flag into a boolean value. 7671 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 7672 } else { 7673 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 7674 // have to invert the carry first. 7675 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 7676 DAG.getConstant(1, DL, MVT::i32), Carry); 7677 // This converts the boolean value carry into the carry flag. 7678 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 7679 7680 // Do the subtraction proper using the carry flag we wanted. 7681 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 7682 Op.getOperand(1), Carry); 7683 7684 // Now convert the carry flag into a boolean value. 7685 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 7686 // But the carry returned by ARMISD::SUBE is not a borrow as expected 7687 // by ISD::SUBCARRY, so compute 1 - C. 7688 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 7689 DAG.getConstant(1, DL, MVT::i32), Carry); 7690 } 7691 7692 // Return both values. 7693 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 7694 } 7695 7696 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 7697 assert(Subtarget->isTargetDarwin()); 7698 7699 // For iOS, we want to call an alternative entry point: __sincos_stret, 7700 // return values are passed via sret. 
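// Roughly: the call built below computes sin and cos in a single libcall.
// With AAPCS-VFP the {sin, cos} pair comes back directly in registers; with
// APCS (ShouldUseSRet below) we instead pass a pointer to a stack slot as an
// sret argument and load the two fields back out of it after the call.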
7701 SDLoc dl(Op); 7702 SDValue Arg = Op.getOperand(0); 7703 EVT ArgVT = Arg.getValueType(); 7704 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7705 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7706 7707 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7708 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 7709 7710 // Pair of floats / doubles used to pass the result. 7711 Type *RetTy = StructType::get(ArgTy, ArgTy); 7712 auto &DL = DAG.getDataLayout(); 7713 7714 ArgListTy Args; 7715 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 7716 SDValue SRet; 7717 if (ShouldUseSRet) { 7718 // Create stack object for sret. 7719 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 7720 const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); 7721 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 7722 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 7723 7724 ArgListEntry Entry; 7725 Entry.Node = SRet; 7726 Entry.Ty = RetTy->getPointerTo(); 7727 Entry.IsSExt = false; 7728 Entry.IsZExt = false; 7729 Entry.IsSRet = true; 7730 Args.push_back(Entry); 7731 RetTy = Type::getVoidTy(*DAG.getContext()); 7732 } 7733 7734 ArgListEntry Entry; 7735 Entry.Node = Arg; 7736 Entry.Ty = ArgTy; 7737 Entry.IsSExt = false; 7738 Entry.IsZExt = false; 7739 Args.push_back(Entry); 7740 7741 RTLIB::Libcall LC = 7742 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 7743 const char *LibcallName = getLibcallName(LC); 7744 CallingConv::ID CC = getLibcallCallingConv(LC); 7745 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 7746 7747 TargetLowering::CallLoweringInfo CLI(DAG); 7748 CLI.setDebugLoc(dl) 7749 .setChain(DAG.getEntryNode()) 7750 .setCallee(CC, RetTy, Callee, std::move(Args)) 7751 .setDiscardResult(ShouldUseSRet); 7752 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 7753 7754 if (!ShouldUseSRet) 7755 return CallResult.first; 7756 7757 SDValue LoadSin = 7758 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 7759 7760 // Address of cos field. 7761 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 7762 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 7763 SDValue LoadCos = 7764 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 7765 7766 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 7767 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 7768 LoadSin.getValue(0), LoadCos.getValue(0)); 7769 } 7770 7771 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 7772 bool Signed, 7773 SDValue &Chain) const { 7774 EVT VT = Op.getValueType(); 7775 assert((VT == MVT::i32 || VT == MVT::i64) && 7776 "unexpected type for custom lowering DIV"); 7777 SDLoc dl(Op); 7778 7779 const auto &DL = DAG.getDataLayout(); 7780 const auto &TLI = DAG.getTargetLoweringInfo(); 7781 7782 const char *Name = nullptr; 7783 if (Signed) 7784 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 7785 else 7786 Name = (VT == MVT::i32) ? 
"__rt_udiv" : "__rt_udiv64"; 7787 7788 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 7789 7790 ARMTargetLowering::ArgListTy Args; 7791 7792 for (auto AI : {1, 0}) { 7793 ArgListEntry Arg; 7794 Arg.Node = Op.getOperand(AI); 7795 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 7796 Args.push_back(Arg); 7797 } 7798 7799 CallLoweringInfo CLI(DAG); 7800 CLI.setDebugLoc(dl) 7801 .setChain(Chain) 7802 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 7803 ES, std::move(Args)); 7804 7805 return LowerCallTo(CLI).first; 7806 } 7807 7808 // This is a code size optimisation: return the original SDIV node to 7809 // DAGCombiner when we don't want to expand SDIV into a sequence of 7810 // instructions, and an empty node otherwise which will cause the 7811 // SDIV to be expanded in DAGCombine. 7812 SDValue 7813 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 7814 SelectionDAG &DAG, 7815 SmallVectorImpl<SDNode *> &Created) const { 7816 // TODO: Support SREM 7817 if (N->getOpcode() != ISD::SDIV) 7818 return SDValue(); 7819 7820 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 7821 const bool MinSize = ST.hasMinSize(); 7822 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 7823 : ST.hasDivideInARMMode(); 7824 7825 // Don't touch vector types; rewriting this may lead to scalarizing 7826 // the int divs. 7827 if (N->getOperand(0).getValueType().isVector()) 7828 return SDValue(); 7829 7830 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 7831 // hwdiv support for this to be really profitable. 7832 if (!(MinSize && HasDivide)) 7833 return SDValue(); 7834 7835 // ARM mode is a bit simpler than Thumb: we can handle large power 7836 // of 2 immediates with 1 mov instruction; no further checks required, 7837 // just return the sdiv node. 7838 if (!ST.isThumb()) 7839 return SDValue(N, 0); 7840 7841 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 7842 // and thus lose the code size benefits of a MOVS that requires only 2. 7843 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 7844 // but as it's doing exactly this, it's not worth the trouble to get TTI. 
7845 if (Divisor.sgt(128)) 7846 return SDValue(); 7847 7848 return SDValue(N, 0); 7849 } 7850 7851 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 7852 bool Signed) const { 7853 assert(Op.getValueType() == MVT::i32 && 7854 "unexpected type for custom lowering DIV"); 7855 SDLoc dl(Op); 7856 7857 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 7858 DAG.getEntryNode(), Op.getOperand(1)); 7859 7860 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7861 } 7862 7863 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 7864 SDLoc DL(N); 7865 SDValue Op = N->getOperand(1); 7866 if (N->getValueType(0) == MVT::i32) 7867 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 7868 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7869 DAG.getConstant(0, DL, MVT::i32)); 7870 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 7871 DAG.getConstant(1, DL, MVT::i32)); 7872 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 7873 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 7874 } 7875 7876 void ARMTargetLowering::ExpandDIV_Windows( 7877 SDValue Op, SelectionDAG &DAG, bool Signed, 7878 SmallVectorImpl<SDValue> &Results) const { 7879 const auto &DL = DAG.getDataLayout(); 7880 const auto &TLI = DAG.getTargetLoweringInfo(); 7881 7882 assert(Op.getValueType() == MVT::i64 && 7883 "unexpected type for custom lowering DIV"); 7884 SDLoc dl(Op); 7885 7886 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 7887 7888 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 7889 7890 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 7891 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 7892 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 7893 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 7894 7895 Results.push_back(Lower); 7896 Results.push_back(Upper); 7897 } 7898 7899 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 7900 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 7901 // Acquire/Release load/store is not legal for targets without a dmb or 7902 // equivalent available. 7903 return SDValue(); 7904 7905 // Monotonic load/store is legal for all targets. 
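// A monotonic (relaxed) access needs no memory barrier, so returning the
// operation unchanged marks it as already legal and it can be selected like
// an ordinary load or store.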
7906 return Op; 7907 } 7908 7909 static void ReplaceREADCYCLECOUNTER(SDNode *N, 7910 SmallVectorImpl<SDValue> &Results, 7911 SelectionDAG &DAG, 7912 const ARMSubtarget *Subtarget) { 7913 SDLoc DL(N); 7914 // Under Power Management extensions, the cycle-count is: 7915 // mrc p15, #0, <Rt>, c9, c13, #0 7916 SDValue Ops[] = { N->getOperand(0), // Chain 7917 DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), 7918 DAG.getConstant(15, DL, MVT::i32), 7919 DAG.getConstant(0, DL, MVT::i32), 7920 DAG.getConstant(9, DL, MVT::i32), 7921 DAG.getConstant(13, DL, MVT::i32), 7922 DAG.getConstant(0, DL, MVT::i32) 7923 }; 7924 7925 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 7926 DAG.getVTList(MVT::i32, MVT::Other), Ops); 7927 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 7928 DAG.getConstant(0, DL, MVT::i32))); 7929 Results.push_back(Cycles32.getValue(1)); 7930 } 7931 7932 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 7933 SDLoc dl(V.getNode()); 7934 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 7935 SDValue VHi = DAG.getAnyExtOrTrunc( 7936 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 7937 dl, MVT::i32); 7938 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 7939 if (isBigEndian) 7940 std::swap (VLo, VHi); 7941 SDValue RegClass = 7942 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 7943 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 7944 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 7945 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 7946 return SDValue( 7947 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 7948 } 7949 7950 static void ReplaceCMP_SWAP_64Results(SDNode *N, 7951 SmallVectorImpl<SDValue> & Results, 7952 SelectionDAG &DAG) { 7953 assert(N->getValueType(0) == MVT::i64 && 7954 "AtomicCmpSwap on types less than 64 should be legal"); 7955 SDValue Ops[] = {N->getOperand(1), 7956 createGPRPairNode(DAG, N->getOperand(2)), 7957 createGPRPairNode(DAG, N->getOperand(3)), 7958 N->getOperand(0)}; 7959 SDNode *CmpSwap = DAG.getMachineNode( 7960 ARM::CMP_SWAP_64, SDLoc(N), 7961 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 7962 7963 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 7964 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 7965 7966 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 7967 7968 Results.push_back( 7969 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 7970 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 7971 Results.push_back( 7972 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, 7973 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); 7974 Results.push_back(SDValue(CmpSwap, 2)); 7975 } 7976 7977 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, 7978 SelectionDAG &DAG) { 7979 const auto &TLI = DAG.getTargetLoweringInfo(); 7980 7981 assert(Subtarget.getTargetTriple().isOSMSVCRT() && 7982 "Custom lowering is MSVCRT specific!"); 7983 7984 SDLoc dl(Op); 7985 SDValue Val = Op.getOperand(0); 7986 MVT Ty = Val->getSimpleValueType(0); 7987 SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); 7988 SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? 
"powf" : "pow", 7989 TLI.getPointerTy(DAG.getDataLayout())); 7990 7991 TargetLowering::ArgListTy Args; 7992 TargetLowering::ArgListEntry Entry; 7993 7994 Entry.Node = Val; 7995 Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); 7996 Entry.IsZExt = true; 7997 Args.push_back(Entry); 7998 7999 Entry.Node = Exponent; 8000 Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); 8001 Entry.IsZExt = true; 8002 Args.push_back(Entry); 8003 8004 Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); 8005 8006 // In the in-chain to the call is the entry node If we are emitting a 8007 // tailcall, the chain will be mutated if the node has a non-entry input 8008 // chain. 8009 SDValue InChain = DAG.getEntryNode(); 8010 SDValue TCChain = InChain; 8011 8012 const Function &F = DAG.getMachineFunction().getFunction(); 8013 bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && 8014 F.getReturnType() == LCRTy; 8015 if (IsTC) 8016 InChain = TCChain; 8017 8018 TargetLowering::CallLoweringInfo CLI(DAG); 8019 CLI.setDebugLoc(dl) 8020 .setChain(InChain) 8021 .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) 8022 .setTailCall(IsTC); 8023 std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI); 8024 8025 // Return the chain (the DAG root) if it is a tail call 8026 return !CI.second.getNode() ? DAG.getRoot() : CI.first; 8027 } 8028 8029 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8030 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 8031 switch (Op.getOpcode()) { 8032 default: llvm_unreachable("Don't know how to custom lower this!"); 8033 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 8034 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8035 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8036 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8037 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8038 case ISD::SELECT: return LowerSELECT(Op, DAG); 8039 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 8040 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8041 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 8042 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 8043 case ISD::VASTART: return LowerVASTART(Op, DAG); 8044 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 8045 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 8046 case ISD::SINT_TO_FP: 8047 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 8048 case ISD::FP_TO_SINT: 8049 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 8050 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8051 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8052 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8053 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 8054 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 8055 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 8056 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 8057 Subtarget); 8058 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 8059 case ISD::SHL: 8060 case ISD::SRL: 8061 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 8062 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 8063 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 8064 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 8065 case ISD::SRL_PARTS: 8066 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 8067 
case ISD::CTTZ: 8068 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 8069 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 8070 case ISD::SETCC: return LowerVSETCC(Op, DAG); 8071 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 8072 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 8073 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 8074 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8075 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8076 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8077 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8078 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8079 case ISD::MUL: return LowerMUL(Op, DAG); 8080 case ISD::SDIV: 8081 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 8082 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 8083 return LowerSDIV(Op, DAG); 8084 case ISD::UDIV: 8085 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 8086 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 8087 return LowerUDIV(Op, DAG); 8088 case ISD::ADDCARRY: 8089 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 8090 case ISD::SADDO: 8091 case ISD::SSUBO: 8092 return LowerSignedALUO(Op, DAG); 8093 case ISD::UADDO: 8094 case ISD::USUBO: 8095 return LowerUnsignedALUO(Op, DAG); 8096 case ISD::ATOMIC_LOAD: 8097 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 8098 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 8099 case ISD::SDIVREM: 8100 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 8101 case ISD::DYNAMIC_STACKALLOC: 8102 if (Subtarget->isTargetWindows()) 8103 return LowerDYNAMIC_STACKALLOC(Op, DAG); 8104 llvm_unreachable("Don't know how to custom lower this!"); 8105 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 8106 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 8107 case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); 8108 case ARMISD::WIN__DBZCHK: return SDValue(); 8109 } 8110 } 8111 8112 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 8113 SelectionDAG &DAG) { 8114 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 8115 unsigned Opc = 0; 8116 if (IntNo == Intrinsic::arm_smlald) 8117 Opc = ARMISD::SMLALD; 8118 else if (IntNo == Intrinsic::arm_smlaldx) 8119 Opc = ARMISD::SMLALDX; 8120 else if (IntNo == Intrinsic::arm_smlsld) 8121 Opc = ARMISD::SMLSLD; 8122 else if (IntNo == Intrinsic::arm_smlsldx) 8123 Opc = ARMISD::SMLSLDX; 8124 else 8125 return; 8126 8127 SDLoc dl(N); 8128 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8129 N->getOperand(3), 8130 DAG.getConstant(0, dl, MVT::i32)); 8131 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8132 N->getOperand(3), 8133 DAG.getConstant(1, dl, MVT::i32)); 8134 8135 SDValue LongMul = DAG.getNode(Opc, dl, 8136 DAG.getVTList(MVT::i32, MVT::i32), 8137 N->getOperand(1), N->getOperand(2), 8138 Lo, Hi); 8139 Results.push_back(LongMul.getValue(0)); 8140 Results.push_back(LongMul.getValue(1)); 8141 } 8142 8143 /// ReplaceNodeResults - Replace the results of node with an illegal result 8144 /// type with new values built out of custom code. 
8145 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 8146 SmallVectorImpl<SDValue> &Results, 8147 SelectionDAG &DAG) const { 8148 SDValue Res; 8149 switch (N->getOpcode()) { 8150 default: 8151 llvm_unreachable("Don't know how to custom expand this!"); 8152 case ISD::READ_REGISTER: 8153 ExpandREAD_REGISTER(N, Results, DAG); 8154 break; 8155 case ISD::BITCAST: 8156 Res = ExpandBITCAST(N, DAG, Subtarget); 8157 break; 8158 case ISD::SRL: 8159 case ISD::SRA: 8160 Res = Expand64BitShift(N, DAG, Subtarget); 8161 break; 8162 case ISD::SREM: 8163 case ISD::UREM: 8164 Res = LowerREM(N, DAG); 8165 break; 8166 case ISD::SDIVREM: 8167 case ISD::UDIVREM: 8168 Res = LowerDivRem(SDValue(N, 0), DAG); 8169 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 8170 Results.push_back(Res.getValue(0)); 8171 Results.push_back(Res.getValue(1)); 8172 return; 8173 case ISD::READCYCLECOUNTER: 8174 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 8175 return; 8176 case ISD::UDIV: 8177 case ISD::SDIV: 8178 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 8179 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 8180 Results); 8181 case ISD::ATOMIC_CMP_SWAP: 8182 ReplaceCMP_SWAP_64Results(N, Results, DAG); 8183 return; 8184 case ISD::INTRINSIC_WO_CHAIN: 8185 return ReplaceLongIntrinsic(N, Results, DAG); 8186 case ISD::ABS: 8187 lowerABS(N, Results, DAG); 8188 return ; 8189 8190 } 8191 if (Res.getNode()) 8192 Results.push_back(Res); 8193 } 8194 8195 //===----------------------------------------------------------------------===// 8196 // ARM Scheduler Hooks 8197 //===----------------------------------------------------------------------===// 8198 8199 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and 8200 /// registers the function context. 8201 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 8202 MachineBasicBlock *MBB, 8203 MachineBasicBlock *DispatchBB, 8204 int FI) const { 8205 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 8206 "ROPI/RWPI not currently supported with SjLj"); 8207 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8208 DebugLoc dl = MI.getDebugLoc(); 8209 MachineFunction *MF = MBB->getParent(); 8210 MachineRegisterInfo *MRI = &MF->getRegInfo(); 8211 MachineConstantPool *MCP = MF->getConstantPool(); 8212 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 8213 const Function &F = MF->getFunction(); 8214 8215 bool isThumb = Subtarget->isThumb(); 8216 bool isThumb2 = Subtarget->isThumb2(); 8217 8218 unsigned PCLabelId = AFI->createPICLabelUId(); 8219 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 8220 ARMConstantPoolValue *CPV = 8221 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 8222 unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); 8223 8224 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 8225 : &ARM::GPRRegClass; 8226 8227 // Grab constant pool and fixed stack memory operands. 8228 MachineMemOperand *CPMMO = 8229 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 8230 MachineMemOperand::MOLoad, 4, 4); 8231 8232 MachineMemOperand *FIMMOSt = 8233 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 8234 MachineMemOperand::MOStore, 4, 4); 8235 8236 // Load the address of the dispatch MBB into the jump buffer. 
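// All three variants below materialize DispatchBB's address PC-relatively via
// the constant-pool entry created above and store it at offset 36 of the
// function context object, i.e. into the jbuf PC slot that the SjLj runtime
// essentially longjmps through when unwinding into this function.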
8237 if (isThumb2) { 8238 // Incoming value: jbuf 8239 // ldr.n r5, LCPI1_1 8240 // orr r5, r5, #1 8241 // add r5, pc 8242 // str r5, [$jbuf, #+4] ; &jbuf[1] 8243 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8244 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 8245 .addConstantPoolIndex(CPI) 8246 .addMemOperand(CPMMO) 8247 .add(predOps(ARMCC::AL)); 8248 // Set the low bit because of thumb mode. 8249 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8250 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 8251 .addReg(NewVReg1, RegState::Kill) 8252 .addImm(0x01) 8253 .add(predOps(ARMCC::AL)) 8254 .add(condCodeOp()); 8255 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8256 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 8257 .addReg(NewVReg2, RegState::Kill) 8258 .addImm(PCLabelId); 8259 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 8260 .addReg(NewVReg3, RegState::Kill) 8261 .addFrameIndex(FI) 8262 .addImm(36) // &jbuf[1] :: pc 8263 .addMemOperand(FIMMOSt) 8264 .add(predOps(ARMCC::AL)); 8265 } else if (isThumb) { 8266 // Incoming value: jbuf 8267 // ldr.n r1, LCPI1_4 8268 // add r1, pc 8269 // mov r2, #1 8270 // orrs r1, r2 8271 // add r2, $jbuf, #+4 ; &jbuf[1] 8272 // str r1, [r2] 8273 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8274 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 8275 .addConstantPoolIndex(CPI) 8276 .addMemOperand(CPMMO) 8277 .add(predOps(ARMCC::AL)); 8278 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8279 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 8280 .addReg(NewVReg1, RegState::Kill) 8281 .addImm(PCLabelId); 8282 // Set the low bit because of thumb mode. 8283 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8284 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 8285 .addReg(ARM::CPSR, RegState::Define) 8286 .addImm(1) 8287 .add(predOps(ARMCC::AL)); 8288 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8289 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 8290 .addReg(ARM::CPSR, RegState::Define) 8291 .addReg(NewVReg2, RegState::Kill) 8292 .addReg(NewVReg3, RegState::Kill) 8293 .add(predOps(ARMCC::AL)); 8294 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8295 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 8296 .addFrameIndex(FI) 8297 .addImm(36); // &jbuf[1] :: pc 8298 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 8299 .addReg(NewVReg4, RegState::Kill) 8300 .addReg(NewVReg5, RegState::Kill) 8301 .addImm(0) 8302 .addMemOperand(FIMMOSt) 8303 .add(predOps(ARMCC::AL)); 8304 } else { 8305 // Incoming value: jbuf 8306 // ldr r1, LCPI1_1 8307 // add r1, pc, r1 8308 // str r1, [$jbuf, #+4] ; &jbuf[1] 8309 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8310 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 8311 .addConstantPoolIndex(CPI) 8312 .addImm(0) 8313 .addMemOperand(CPMMO) 8314 .add(predOps(ARMCC::AL)); 8315 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8316 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 8317 .addReg(NewVReg1, RegState::Kill) 8318 .addImm(PCLabelId) 8319 .add(predOps(ARMCC::AL)); 8320 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 8321 .addReg(NewVReg2, RegState::Kill) 8322 .addFrameIndex(FI) 8323 .addImm(36) // &jbuf[1] :: pc 8324 .addMemOperand(FIMMOSt) 8325 .add(predOps(ARMCC::AL)); 8326 } 8327 } 8328 8329 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 8330 MachineBasicBlock *MBB) const { 8331 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8332 DebugLoc dl = MI.getDebugLoc(); 8333 MachineFunction 
*MF = MBB->getParent();
8334 MachineRegisterInfo *MRI = &MF->getRegInfo();
8335 MachineFrameInfo &MFI = MF->getFrameInfo();
8336 int FI = MFI.getFunctionContextIndex();
8337
8338 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
8339 : &ARM::GPRnopcRegClass;
8340
8341 // Get a mapping of the call site numbers to all of the landing pads they're
8342 // associated with.
8343 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
8344 unsigned MaxCSNum = 0;
8345 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
8346 ++BB) {
8347 if (!BB->isEHPad()) continue;
8348
8349 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
8350 // pad.
8351 for (MachineBasicBlock::iterator
8352 II = BB->begin(), IE = BB->end(); II != IE; ++II) {
8353 if (!II->isEHLabel()) continue;
8354
8355 MCSymbol *Sym = II->getOperand(0).getMCSymbol();
8356 if (!MF->hasCallSiteLandingPad(Sym)) continue;
8357
8358 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
8359 for (SmallVectorImpl<unsigned>::iterator
8360 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
8361 CSI != CSE; ++CSI) {
8362 CallSiteNumToLPad[*CSI].push_back(&*BB);
8363 MaxCSNum = std::max(MaxCSNum, *CSI);
8364 }
8365 break;
8366 }
8367 }
8368
8369 // Get an ordered list of the machine basic blocks for the jump table.
8370 std::vector<MachineBasicBlock*> LPadList;
8371 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
8372 LPadList.reserve(CallSiteNumToLPad.size());
8373 for (unsigned I = 1; I <= MaxCSNum; ++I) {
8374 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
8375 for (SmallVectorImpl<MachineBasicBlock*>::iterator
8376 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
8377 LPadList.push_back(*II);
8378 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
8379 }
8380 }
8381
8382 assert(!LPadList.empty() &&
8383 "No landing pad destinations for the dispatch jump table!");
8384
8385 // Create the jump table and associated information.
8386 MachineJumpTableInfo *JTI =
8387 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
8388 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
8389
8390 // Create the MBBs for the dispatch code.
8391
8392 // Shove the dispatch's address into the return slot in the function context.
8393 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
8394 DispatchBB->setIsEHPad();
8395
8396 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
8397 unsigned trap_opcode;
8398 if (Subtarget->isThumb())
8399 trap_opcode = ARM::tTRAP;
8400 else
8401 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
8402
8403 BuildMI(TrapBB, dl, TII->get(trap_opcode));
8404 DispatchBB->addSuccessor(TrapBB);
8405
8406 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
8407 DispatchBB->addSuccessor(DispContBB);
8408
8409 // Insert the MBBs.
8410 MF->insert(MF->end(), DispatchBB);
8411 MF->insert(MF->end(), DispContBB);
8412 MF->insert(MF->end(), TrapBB);
8413
8414 // Insert code into the entry block that creates and registers the function
8415 // context.
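// The dispatch code emitted below then mirrors that setup: it reloads the
// call-site index the runtime stored into the function context (at FI+4),
// compares it against the number of landing pads (branching to TrapBB if it
// is out of range), and indexes the inline jump table created above to branch
// to the corresponding landing pad.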
8416 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 8417 8418 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 8419 MachinePointerInfo::getFixedStack(*MF, FI), 8420 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); 8421 8422 MachineInstrBuilder MIB; 8423 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 8424 8425 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 8426 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 8427 8428 // Add a register mask with no preserved registers. This results in all 8429 // registers being marked as clobbered. This can't work if the dispatch block 8430 // is in a Thumb1 function and is linked with ARM code which uses the FP 8431 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 8432 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 8433 8434 bool IsPositionIndependent = isPositionIndependent(); 8435 unsigned NumLPads = LPadList.size(); 8436 if (Subtarget->isThumb2()) { 8437 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8438 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 8439 .addFrameIndex(FI) 8440 .addImm(4) 8441 .addMemOperand(FIMMOLd) 8442 .add(predOps(ARMCC::AL)); 8443 8444 if (NumLPads < 256) { 8445 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 8446 .addReg(NewVReg1) 8447 .addImm(LPadList.size()) 8448 .add(predOps(ARMCC::AL)); 8449 } else { 8450 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8451 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 8452 .addImm(NumLPads & 0xFFFF) 8453 .add(predOps(ARMCC::AL)); 8454 8455 unsigned VReg2 = VReg1; 8456 if ((NumLPads & 0xFFFF0000) != 0) { 8457 VReg2 = MRI->createVirtualRegister(TRC); 8458 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 8459 .addReg(VReg1) 8460 .addImm(NumLPads >> 16) 8461 .add(predOps(ARMCC::AL)); 8462 } 8463 8464 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 8465 .addReg(NewVReg1) 8466 .addReg(VReg2) 8467 .add(predOps(ARMCC::AL)); 8468 } 8469 8470 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 8471 .addMBB(TrapBB) 8472 .addImm(ARMCC::HI) 8473 .addReg(ARM::CPSR); 8474 8475 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8476 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 8477 .addJumpTableIndex(MJTI) 8478 .add(predOps(ARMCC::AL)); 8479 8480 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8481 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 8482 .addReg(NewVReg3, RegState::Kill) 8483 .addReg(NewVReg1) 8484 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 8485 .add(predOps(ARMCC::AL)) 8486 .add(condCodeOp()); 8487 8488 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 8489 .addReg(NewVReg4, RegState::Kill) 8490 .addReg(NewVReg1) 8491 .addJumpTableIndex(MJTI); 8492 } else if (Subtarget->isThumb()) { 8493 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8494 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 8495 .addFrameIndex(FI) 8496 .addImm(1) 8497 .addMemOperand(FIMMOLd) 8498 .add(predOps(ARMCC::AL)); 8499 8500 if (NumLPads < 256) { 8501 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 8502 .addReg(NewVReg1) 8503 .addImm(NumLPads) 8504 .add(predOps(ARMCC::AL)); 8505 } else { 8506 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8507 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 8508 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 8509 8510 // MachineConstantPool wants an explicit alignment. 
8511 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8512 if (Align == 0) 8513 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8514 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8515 8516 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8517 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 8518 .addReg(VReg1, RegState::Define) 8519 .addConstantPoolIndex(Idx) 8520 .add(predOps(ARMCC::AL)); 8521 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 8522 .addReg(NewVReg1) 8523 .addReg(VReg1) 8524 .add(predOps(ARMCC::AL)); 8525 } 8526 8527 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 8528 .addMBB(TrapBB) 8529 .addImm(ARMCC::HI) 8530 .addReg(ARM::CPSR); 8531 8532 unsigned NewVReg2 = MRI->createVirtualRegister(TRC); 8533 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 8534 .addReg(ARM::CPSR, RegState::Define) 8535 .addReg(NewVReg1) 8536 .addImm(2) 8537 .add(predOps(ARMCC::AL)); 8538 8539 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8540 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 8541 .addJumpTableIndex(MJTI) 8542 .add(predOps(ARMCC::AL)); 8543 8544 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8545 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 8546 .addReg(ARM::CPSR, RegState::Define) 8547 .addReg(NewVReg2, RegState::Kill) 8548 .addReg(NewVReg3) 8549 .add(predOps(ARMCC::AL)); 8550 8551 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8552 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8553 8554 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8555 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 8556 .addReg(NewVReg4, RegState::Kill) 8557 .addImm(0) 8558 .addMemOperand(JTMMOLd) 8559 .add(predOps(ARMCC::AL)); 8560 8561 unsigned NewVReg6 = NewVReg5; 8562 if (IsPositionIndependent) { 8563 NewVReg6 = MRI->createVirtualRegister(TRC); 8564 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 8565 .addReg(ARM::CPSR, RegState::Define) 8566 .addReg(NewVReg5, RegState::Kill) 8567 .addReg(NewVReg3) 8568 .add(predOps(ARMCC::AL)); 8569 } 8570 8571 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 8572 .addReg(NewVReg6, RegState::Kill) 8573 .addJumpTableIndex(MJTI); 8574 } else { 8575 unsigned NewVReg1 = MRI->createVirtualRegister(TRC); 8576 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 8577 .addFrameIndex(FI) 8578 .addImm(4) 8579 .addMemOperand(FIMMOLd) 8580 .add(predOps(ARMCC::AL)); 8581 8582 if (NumLPads < 256) { 8583 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 8584 .addReg(NewVReg1) 8585 .addImm(NumLPads) 8586 .add(predOps(ARMCC::AL)); 8587 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 8588 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8589 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 8590 .addImm(NumLPads & 0xFFFF) 8591 .add(predOps(ARMCC::AL)); 8592 8593 unsigned VReg2 = VReg1; 8594 if ((NumLPads & 0xFFFF0000) != 0) { 8595 VReg2 = MRI->createVirtualRegister(TRC); 8596 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 8597 .addReg(VReg1) 8598 .addImm(NumLPads >> 16) 8599 .add(predOps(ARMCC::AL)); 8600 } 8601 8602 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8603 .addReg(NewVReg1) 8604 .addReg(VReg2) 8605 .add(predOps(ARMCC::AL)); 8606 } else { 8607 MachineConstantPool *ConstantPool = MF->getConstantPool(); 8608 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 8609 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 8610 8611 // MachineConstantPool wants an explicit alignment. 
8612 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 8613 if (Align == 0) 8614 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 8615 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 8616 8617 unsigned VReg1 = MRI->createVirtualRegister(TRC); 8618 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 8619 .addReg(VReg1, RegState::Define) 8620 .addConstantPoolIndex(Idx) 8621 .addImm(0) 8622 .add(predOps(ARMCC::AL)); 8623 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 8624 .addReg(NewVReg1) 8625 .addReg(VReg1, RegState::Kill) 8626 .add(predOps(ARMCC::AL)); 8627 } 8628 8629 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 8630 .addMBB(TrapBB) 8631 .addImm(ARMCC::HI) 8632 .addReg(ARM::CPSR); 8633 8634 unsigned NewVReg3 = MRI->createVirtualRegister(TRC); 8635 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 8636 .addReg(NewVReg1) 8637 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 8638 .add(predOps(ARMCC::AL)) 8639 .add(condCodeOp()); 8640 unsigned NewVReg4 = MRI->createVirtualRegister(TRC); 8641 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 8642 .addJumpTableIndex(MJTI) 8643 .add(predOps(ARMCC::AL)); 8644 8645 MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( 8646 MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); 8647 unsigned NewVReg5 = MRI->createVirtualRegister(TRC); 8648 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 8649 .addReg(NewVReg3, RegState::Kill) 8650 .addReg(NewVReg4) 8651 .addImm(0) 8652 .addMemOperand(JTMMOLd) 8653 .add(predOps(ARMCC::AL)); 8654 8655 if (IsPositionIndependent) { 8656 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 8657 .addReg(NewVReg5, RegState::Kill) 8658 .addReg(NewVReg4) 8659 .addJumpTableIndex(MJTI); 8660 } else { 8661 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 8662 .addReg(NewVReg5, RegState::Kill) 8663 .addJumpTableIndex(MJTI); 8664 } 8665 } 8666 8667 // Add the jump table entries as successors to the MBB. 8668 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 8669 for (std::vector<MachineBasicBlock*>::iterator 8670 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 8671 MachineBasicBlock *CurMBB = *I; 8672 if (SeenMBBs.insert(CurMBB).second) 8673 DispContBB->addSuccessor(CurMBB); 8674 } 8675 8676 // N.B. the order the invoke BBs are processed in doesn't matter here. 8677 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 8678 SmallVector<MachineBasicBlock*, 64> MBBLPads; 8679 for (MachineBasicBlock *BB : InvokeBBs) { 8680 8681 // Remove the landing pad successor from the invoke block and replace it 8682 // with the new dispatch block. 8683 SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(), 8684 BB->succ_end()); 8685 while (!Successors.empty()) { 8686 MachineBasicBlock *SMBB = Successors.pop_back_val(); 8687 if (SMBB->isEHPad()) { 8688 BB->removeSuccessor(SMBB); 8689 MBBLPads.push_back(SMBB); 8690 } 8691 } 8692 8693 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 8694 BB->normalizeSuccProbs(); 8695 8696 // Find the invoke call and mark all of the callee-saved registers as 8697 // 'implicit defined' so that they're spilled. This prevents code from 8698 // moving instructions to before the EH block, where they will never be 8699 // executed. 
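// The walk below is backwards because the invoke's call is at (or very near)
// the end of the block; only the first call found is annotated, hence the
// break at the bottom of the loop.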
8700 for (MachineBasicBlock::reverse_iterator 8701 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 8702 if (!II->isCall()) continue; 8703 8704 DenseMap<unsigned, bool> DefRegs; 8705 for (MachineInstr::mop_iterator 8706 OI = II->operands_begin(), OE = II->operands_end(); 8707 OI != OE; ++OI) { 8708 if (!OI->isReg()) continue; 8709 DefRegs[OI->getReg()] = true; 8710 } 8711 8712 MachineInstrBuilder MIB(*MF, &*II); 8713 8714 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 8715 unsigned Reg = SavedRegs[i]; 8716 if (Subtarget->isThumb2() && 8717 !ARM::tGPRRegClass.contains(Reg) && 8718 !ARM::hGPRRegClass.contains(Reg)) 8719 continue; 8720 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 8721 continue; 8722 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 8723 continue; 8724 if (!DefRegs[Reg]) 8725 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 8726 } 8727 8728 break; 8729 } 8730 } 8731 8732 // Mark all former landing pads as non-landing pads. The dispatch is the only 8733 // landing pad now. 8734 for (SmallVectorImpl<MachineBasicBlock*>::iterator 8735 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 8736 (*I)->setIsEHPad(false); 8737 8738 // The instruction is gone now. 8739 MI.eraseFromParent(); 8740 } 8741 8742 static 8743 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 8744 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 8745 E = MBB->succ_end(); I != E; ++I) 8746 if (*I != Succ) 8747 return *I; 8748 llvm_unreachable("Expecting a BB with two successors!"); 8749 } 8750 8751 /// Return the load opcode for a given load size. If load size >= 8, 8752 /// neon opcode will be returned. 8753 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 8754 if (LdSize >= 8) 8755 return LdSize == 16 ? ARM::VLD1q32wb_fixed 8756 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; 8757 if (IsThumb1) 8758 return LdSize == 4 ? ARM::tLDRi 8759 : LdSize == 2 ? ARM::tLDRHi 8760 : LdSize == 1 ? ARM::tLDRBi : 0; 8761 if (IsThumb2) 8762 return LdSize == 4 ? ARM::t2LDR_POST 8763 : LdSize == 2 ? ARM::t2LDRH_POST 8764 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 8765 return LdSize == 4 ? ARM::LDR_POST_IMM 8766 : LdSize == 2 ? ARM::LDRH_POST 8767 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 8768 } 8769 8770 /// Return the store opcode for a given store size. If store size >= 8, 8771 /// neon opcode will be returned. 8772 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 8773 if (StSize >= 8) 8774 return StSize == 16 ? ARM::VST1q32wb_fixed 8775 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 8776 if (IsThumb1) 8777 return StSize == 4 ? ARM::tSTRi 8778 : StSize == 2 ? ARM::tSTRHi 8779 : StSize == 1 ? ARM::tSTRBi : 0; 8780 if (IsThumb2) 8781 return StSize == 4 ? ARM::t2STR_POST 8782 : StSize == 2 ? ARM::t2STRH_POST 8783 : StSize == 1 ? ARM::t2STRB_POST : 0; 8784 return StSize == 4 ? ARM::STR_POST_IMM 8785 : StSize == 2 ? ARM::STRH_POST 8786 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 8787 } 8788 8789 /// Emit a post-increment load operation with given size. The instructions 8790 /// will be added to BB at Pos. 
8791 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8792 const TargetInstrInfo *TII, const DebugLoc &dl, 8793 unsigned LdSize, unsigned Data, unsigned AddrIn, 8794 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8795 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 8796 assert(LdOpc != 0 && "Should have a load opcode"); 8797 if (LdSize >= 8) { 8798 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8799 .addReg(AddrOut, RegState::Define) 8800 .addReg(AddrIn) 8801 .addImm(0) 8802 .add(predOps(ARMCC::AL)); 8803 } else if (IsThumb1) { 8804 // load + update AddrIn 8805 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8806 .addReg(AddrIn) 8807 .addImm(0) 8808 .add(predOps(ARMCC::AL)); 8809 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8810 .add(t1CondCodeOp()) 8811 .addReg(AddrIn) 8812 .addImm(LdSize) 8813 .add(predOps(ARMCC::AL)); 8814 } else if (IsThumb2) { 8815 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8816 .addReg(AddrOut, RegState::Define) 8817 .addReg(AddrIn) 8818 .addImm(LdSize) 8819 .add(predOps(ARMCC::AL)); 8820 } else { // arm 8821 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 8822 .addReg(AddrOut, RegState::Define) 8823 .addReg(AddrIn) 8824 .addReg(0) 8825 .addImm(LdSize) 8826 .add(predOps(ARMCC::AL)); 8827 } 8828 } 8829 8830 /// Emit a post-increment store operation with given size. The instructions 8831 /// will be added to BB at Pos. 8832 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 8833 const TargetInstrInfo *TII, const DebugLoc &dl, 8834 unsigned StSize, unsigned Data, unsigned AddrIn, 8835 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 8836 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 8837 assert(StOpc != 0 && "Should have a store opcode"); 8838 if (StSize >= 8) { 8839 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8840 .addReg(AddrIn) 8841 .addImm(0) 8842 .addReg(Data) 8843 .add(predOps(ARMCC::AL)); 8844 } else if (IsThumb1) { 8845 // store + update AddrIn 8846 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 8847 .addReg(Data) 8848 .addReg(AddrIn) 8849 .addImm(0) 8850 .add(predOps(ARMCC::AL)); 8851 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 8852 .add(t1CondCodeOp()) 8853 .addReg(AddrIn) 8854 .addImm(StSize) 8855 .add(predOps(ARMCC::AL)); 8856 } else if (IsThumb2) { 8857 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8858 .addReg(Data) 8859 .addReg(AddrIn) 8860 .addImm(StSize) 8861 .add(predOps(ARMCC::AL)); 8862 } else { // arm 8863 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 8864 .addReg(Data) 8865 .addReg(AddrIn) 8866 .addReg(0) 8867 .addImm(StSize) 8868 .add(predOps(ARMCC::AL)); 8869 } 8870 } 8871 8872 MachineBasicBlock * 8873 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 8874 MachineBasicBlock *BB) const { 8875 // This pseudo instruction has 3 operands: dst, src, size 8876 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 8877 // Otherwise, we will generate unrolled scalar copies. 
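  // Worked example (illustrative only): a 10-byte copy with 4-byte alignment
  // and no usable NEON gives UnitSize = 4, LoopSize = 8 and BytesLeft = 2, so
  // the small-size path below emits two word-sized post-increment load/store
  // pairs followed by two byte-sized ones; above the inline threshold the same
  // decomposition drives the loop expansion instead.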
8878 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 8879 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8880 MachineFunction::iterator It = ++BB->getIterator(); 8881 8882 unsigned dest = MI.getOperand(0).getReg(); 8883 unsigned src = MI.getOperand(1).getReg(); 8884 unsigned SizeVal = MI.getOperand(2).getImm(); 8885 unsigned Align = MI.getOperand(3).getImm(); 8886 DebugLoc dl = MI.getDebugLoc(); 8887 8888 MachineFunction *MF = BB->getParent(); 8889 MachineRegisterInfo &MRI = MF->getRegInfo(); 8890 unsigned UnitSize = 0; 8891 const TargetRegisterClass *TRC = nullptr; 8892 const TargetRegisterClass *VecTRC = nullptr; 8893 8894 bool IsThumb1 = Subtarget->isThumb1Only(); 8895 bool IsThumb2 = Subtarget->isThumb2(); 8896 bool IsThumb = Subtarget->isThumb(); 8897 8898 if (Align & 1) { 8899 UnitSize = 1; 8900 } else if (Align & 2) { 8901 UnitSize = 2; 8902 } else { 8903 // Check whether we can use NEON instructions. 8904 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 8905 Subtarget->hasNEON()) { 8906 if ((Align % 16 == 0) && SizeVal >= 16) 8907 UnitSize = 16; 8908 else if ((Align % 8 == 0) && SizeVal >= 8) 8909 UnitSize = 8; 8910 } 8911 // Can't use NEON instructions. 8912 if (UnitSize == 0) 8913 UnitSize = 4; 8914 } 8915 8916 // Select the correct opcode and register class for unit size load/store 8917 bool IsNeon = UnitSize >= 8; 8918 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 8919 if (IsNeon) 8920 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 8921 : UnitSize == 8 ? &ARM::DPRRegClass 8922 : nullptr; 8923 8924 unsigned BytesLeft = SizeVal % UnitSize; 8925 unsigned LoopSize = SizeVal - BytesLeft; 8926 8927 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 8928 // Use LDR and STR to copy. 8929 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 8930 // [destOut] = STR_POST(scratch, destIn, UnitSize) 8931 unsigned srcIn = src; 8932 unsigned destIn = dest; 8933 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 8934 unsigned srcOut = MRI.createVirtualRegister(TRC); 8935 unsigned destOut = MRI.createVirtualRegister(TRC); 8936 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 8937 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 8938 IsThumb1, IsThumb2); 8939 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 8940 IsThumb1, IsThumb2); 8941 srcIn = srcOut; 8942 destIn = destOut; 8943 } 8944 8945 // Handle the leftover bytes with LDRB and STRB. 8946 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 8947 // [destOut] = STRB_POST(scratch, destIn, 1) 8948 for (unsigned i = 0; i < BytesLeft; i++) { 8949 unsigned srcOut = MRI.createVirtualRegister(TRC); 8950 unsigned destOut = MRI.createVirtualRegister(TRC); 8951 unsigned scratch = MRI.createVirtualRegister(TRC); 8952 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 8953 IsThumb1, IsThumb2); 8954 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 8955 IsThumb1, IsThumb2); 8956 srcIn = srcOut; 8957 destIn = destOut; 8958 } 8959 MI.eraseFromParent(); // The instruction is gone now. 8960 return BB; 8961 } 8962 8963 // Expand the pseudo op to a loop. 8964 // thisMBB: 8965 // ... 
8966 // movw varEnd, # --> with thumb2 8967 // movt varEnd, # 8968 // ldrcp varEnd, idx --> without thumb2 8969 // fallthrough --> loopMBB 8970 // loopMBB: 8971 // PHI varPhi, varEnd, varLoop 8972 // PHI srcPhi, src, srcLoop 8973 // PHI destPhi, dst, destLoop 8974 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 8975 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 8976 // subs varLoop, varPhi, #UnitSize 8977 // bne loopMBB 8978 // fallthrough --> exitMBB 8979 // exitMBB: 8980 // epilogue to handle left-over bytes 8981 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 8982 // [destOut] = STRB_POST(scratch, destLoop, 1) 8983 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8984 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 8985 MF->insert(It, loopMBB); 8986 MF->insert(It, exitMBB); 8987 8988 // Transfer the remainder of BB and its successor edges to exitMBB. 8989 exitMBB->splice(exitMBB->begin(), BB, 8990 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 8991 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 8992 8993 // Load an immediate to varEnd. 8994 unsigned varEnd = MRI.createVirtualRegister(TRC); 8995 if (Subtarget->useMovt()) { 8996 unsigned Vtmp = varEnd; 8997 if ((LoopSize & 0xFFFF0000) != 0) 8998 Vtmp = MRI.createVirtualRegister(TRC); 8999 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 9000 .addImm(LoopSize & 0xFFFF) 9001 .add(predOps(ARMCC::AL)); 9002 9003 if ((LoopSize & 0xFFFF0000) != 0) 9004 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 9005 .addReg(Vtmp) 9006 .addImm(LoopSize >> 16) 9007 .add(predOps(ARMCC::AL)); 9008 } else { 9009 MachineConstantPool *ConstantPool = MF->getConstantPool(); 9010 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 9011 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 9012 9013 // MachineConstantPool wants an explicit alignment. 
9014 unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); 9015 if (Align == 0) 9016 Align = MF->getDataLayout().getTypeAllocSize(C->getType()); 9017 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); 9018 MachineMemOperand *CPMMO = 9019 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 9020 MachineMemOperand::MOLoad, 4, 4); 9021 9022 if (IsThumb) 9023 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 9024 .addReg(varEnd, RegState::Define) 9025 .addConstantPoolIndex(Idx) 9026 .add(predOps(ARMCC::AL)) 9027 .addMemOperand(CPMMO); 9028 else 9029 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 9030 .addReg(varEnd, RegState::Define) 9031 .addConstantPoolIndex(Idx) 9032 .addImm(0) 9033 .add(predOps(ARMCC::AL)) 9034 .addMemOperand(CPMMO); 9035 } 9036 BB->addSuccessor(loopMBB); 9037 9038 // Generate the loop body: 9039 // varPhi = PHI(varLoop, varEnd) 9040 // srcPhi = PHI(srcLoop, src) 9041 // destPhi = PHI(destLoop, dst) 9042 MachineBasicBlock *entryBB = BB; 9043 BB = loopMBB; 9044 unsigned varLoop = MRI.createVirtualRegister(TRC); 9045 unsigned varPhi = MRI.createVirtualRegister(TRC); 9046 unsigned srcLoop = MRI.createVirtualRegister(TRC); 9047 unsigned srcPhi = MRI.createVirtualRegister(TRC); 9048 unsigned destLoop = MRI.createVirtualRegister(TRC); 9049 unsigned destPhi = MRI.createVirtualRegister(TRC); 9050 9051 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 9052 .addReg(varLoop).addMBB(loopMBB) 9053 .addReg(varEnd).addMBB(entryBB); 9054 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 9055 .addReg(srcLoop).addMBB(loopMBB) 9056 .addReg(src).addMBB(entryBB); 9057 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 9058 .addReg(destLoop).addMBB(loopMBB) 9059 .addReg(dest).addMBB(entryBB); 9060 9061 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 9062 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 9063 unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 9064 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 9065 IsThumb1, IsThumb2); 9066 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 9067 IsThumb1, IsThumb2); 9068 9069 // Decrement loop variable by UnitSize. 9070 if (IsThumb1) { 9071 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 9072 .add(t1CondCodeOp()) 9073 .addReg(varPhi) 9074 .addImm(UnitSize) 9075 .add(predOps(ARMCC::AL)); 9076 } else { 9077 MachineInstrBuilder MIB = 9078 BuildMI(*BB, BB->end(), dl, 9079 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 9080 MIB.addReg(varPhi) 9081 .addImm(UnitSize) 9082 .add(predOps(ARMCC::AL)) 9083 .add(condCodeOp()); 9084 MIB->getOperand(5).setReg(ARM::CPSR); 9085 MIB->getOperand(5).setIsDef(true); 9086 } 9087 BuildMI(*BB, BB->end(), dl, 9088 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 9089 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 9090 9091 // loopMBB can loop back to loopMBB or fall through to exitMBB. 9092 BB->addSuccessor(loopMBB); 9093 BB->addSuccessor(exitMBB); 9094 9095 // Add epilogue to handle BytesLeft. 
9096   BB = exitMBB;
9097   auto StartOfExit = exitMBB->begin();
9098
9099   // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
9100   // [destOut] = STRB_POST(scratch, destLoop, 1)
9101   unsigned srcIn = srcLoop;
9102   unsigned destIn = destLoop;
9103   for (unsigned i = 0; i < BytesLeft; i++) {
9104     unsigned srcOut = MRI.createVirtualRegister(TRC);
9105     unsigned destOut = MRI.createVirtualRegister(TRC);
9106     unsigned scratch = MRI.createVirtualRegister(TRC);
9107     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
9108                IsThumb1, IsThumb2);
9109     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
9110                IsThumb1, IsThumb2);
9111     srcIn = srcOut;
9112     destIn = destOut;
9113   }
9114
9115   MI.eraseFromParent();   // The instruction is gone now.
9116   return BB;
9117 }
9118
9119 MachineBasicBlock *
9120 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
9121                                        MachineBasicBlock *MBB) const {
9122   const TargetMachine &TM = getTargetMachine();
9123   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
9124   DebugLoc DL = MI.getDebugLoc();
9125
9126   assert(Subtarget->isTargetWindows() &&
9127          "__chkstk is only supported on Windows");
9128   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
9129
9130   // __chkstk takes the number of words to allocate on the stack in R4, and
9131   // returns the stack adjustment in number of bytes in R4. This will not
9132   // clobber any other registers (other than the obvious lr).
9133   //
9134   // Although, technically, IP should be considered a register which may be
9135   // clobbered, the call itself will not touch it. Windows on ARM is a pure
9136   // Thumb-2 environment, so there is no interworking required. As a result,
9137   // we do not expect a veneer to be emitted by the linker, clobbering IP.
9138   //
9139   // Each module receives its own copy of __chkstk, so no import thunk is
9140   // required, again ensuring that IP is not clobbered.
9141   //
9142   // Finally, although some linkers may theoretically provide a trampoline for
9143   // out-of-range calls (which is quite common due to the 32M range limitation
9144   // of Thumb branches), we can generate the long-call version via
9145   // -mcmodel=large, alleviating the need for the trampoline, which may
9146   // clobber IP.
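  // For reference, the sequence built below corresponds roughly to the
  // following assembly (illustrative only; register choices and fixups are
  // left to later passes):
  //   ; small/medium/kernel code models
  //   bl    __chkstk           ; r4 = requested words in, byte count out
  //   sub.w sp, sp, r4
  //   ; large code model
  //   movw  <scratch>, :lower16:__chkstk   ; t2MOVi32imm into an rGPR vreg
  //   movt  <scratch>, :upper16:__chkstk
  //   blx   <scratch>
  //   sub.w sp, sp, r4
  // where <scratch> is a placeholder for whatever register the allocator
  // eventually assigns.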
9147 9148 switch (TM.getCodeModel()) { 9149 case CodeModel::Tiny: 9150 llvm_unreachable("Tiny code model not available on ARM."); 9151 case CodeModel::Small: 9152 case CodeModel::Medium: 9153 case CodeModel::Kernel: 9154 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 9155 .add(predOps(ARMCC::AL)) 9156 .addExternalSymbol("__chkstk") 9157 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 9158 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 9159 .addReg(ARM::R12, 9160 RegState::Implicit | RegState::Define | RegState::Dead) 9161 .addReg(ARM::CPSR, 9162 RegState::Implicit | RegState::Define | RegState::Dead); 9163 break; 9164 case CodeModel::Large: { 9165 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9166 unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 9167 9168 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 9169 .addExternalSymbol("__chkstk"); 9170 BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) 9171 .add(predOps(ARMCC::AL)) 9172 .addReg(Reg, RegState::Kill) 9173 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 9174 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 9175 .addReg(ARM::R12, 9176 RegState::Implicit | RegState::Define | RegState::Dead) 9177 .addReg(ARM::CPSR, 9178 RegState::Implicit | RegState::Define | RegState::Dead); 9179 break; 9180 } 9181 } 9182 9183 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 9184 .addReg(ARM::SP, RegState::Kill) 9185 .addReg(ARM::R4, RegState::Kill) 9186 .setMIFlags(MachineInstr::FrameSetup) 9187 .add(predOps(ARMCC::AL)) 9188 .add(condCodeOp()); 9189 9190 MI.eraseFromParent(); 9191 return MBB; 9192 } 9193 9194 MachineBasicBlock * 9195 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 9196 MachineBasicBlock *MBB) const { 9197 DebugLoc DL = MI.getDebugLoc(); 9198 MachineFunction *MF = MBB->getParent(); 9199 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9200 9201 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); 9202 MF->insert(++MBB->getIterator(), ContBB); 9203 ContBB->splice(ContBB->begin(), MBB, 9204 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 9205 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 9206 MBB->addSuccessor(ContBB); 9207 9208 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 9209 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 9210 MF->push_back(TrapBB); 9211 MBB->addSuccessor(TrapBB); 9212 9213 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 9214 .addReg(MI.getOperand(0).getReg()) 9215 .addImm(0) 9216 .add(predOps(ARMCC::AL)); 9217 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 9218 .addMBB(TrapBB) 9219 .addImm(ARMCC::EQ) 9220 .addReg(ARM::CPSR); 9221 9222 MI.eraseFromParent(); 9223 return ContBB; 9224 } 9225 9226 // The CPSR operand of SelectItr might be missing a kill marker 9227 // because there were multiple uses of CPSR, and ISel didn't know 9228 // which to mark. Figure out whether SelectItr should have had a 9229 // kill marker, and set it if it should. Returns the correct kill 9230 // marker value. 9231 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 9232 MachineBasicBlock* BB, 9233 const TargetRegisterInfo* TRI) { 9234 // Scan forward through BB for a use/def of CPSR. 9235 MachineBasicBlock::iterator miI(std::next(SelectItr)); 9236 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 9237 const MachineInstr& mi = *miI; 9238 if (mi.readsRegister(ARM::CPSR)) 9239 return false; 9240 if (mi.definesRegister(ARM::CPSR)) 9241 break; // Should have kill-flag - update below. 
9242 } 9243 9244 // If we hit the end of the block, check whether CPSR is live into a 9245 // successor. 9246 if (miI == BB->end()) { 9247 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 9248 sEnd = BB->succ_end(); 9249 sItr != sEnd; ++sItr) { 9250 MachineBasicBlock* succ = *sItr; 9251 if (succ->isLiveIn(ARM::CPSR)) 9252 return false; 9253 } 9254 } 9255 9256 // We found a def, or hit the end of the basic block and CPSR wasn't live 9257 // out. SelectMI should have a kill flag on CPSR. 9258 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 9259 return true; 9260 } 9261 9262 MachineBasicBlock * 9263 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 9264 MachineBasicBlock *BB) const { 9265 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 9266 DebugLoc dl = MI.getDebugLoc(); 9267 bool isThumb2 = Subtarget->isThumb2(); 9268 switch (MI.getOpcode()) { 9269 default: { 9270 MI.print(errs()); 9271 llvm_unreachable("Unexpected instr type to insert"); 9272 } 9273 9274 // Thumb1 post-indexed loads are really just single-register LDMs. 9275 case ARM::tLDR_postidx: { 9276 MachineOperand Def(MI.getOperand(1)); 9277 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 9278 .add(Def) // Rn_wb 9279 .add(MI.getOperand(2)) // Rn 9280 .add(MI.getOperand(3)) // PredImm 9281 .add(MI.getOperand(4)) // PredReg 9282 .add(MI.getOperand(0)) // Rt 9283 .cloneMemRefs(MI); 9284 MI.eraseFromParent(); 9285 return BB; 9286 } 9287 9288 // The Thumb2 pre-indexed stores have the same MI operands, they just 9289 // define them differently in the .td files from the isel patterns, so 9290 // they need pseudos. 9291 case ARM::t2STR_preidx: 9292 MI.setDesc(TII->get(ARM::t2STR_PRE)); 9293 return BB; 9294 case ARM::t2STRB_preidx: 9295 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 9296 return BB; 9297 case ARM::t2STRH_preidx: 9298 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 9299 return BB; 9300 9301 case ARM::STRi_preidx: 9302 case ARM::STRBi_preidx: { 9303 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 9304 : ARM::STRB_PRE_IMM; 9305 // Decode the offset. 9306 unsigned Offset = MI.getOperand(4).getImm(); 9307 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 9308 Offset = ARM_AM::getAM2Offset(Offset); 9309 if (isSub) 9310 Offset = -Offset; 9311 9312 MachineMemOperand *MMO = *MI.memoperands_begin(); 9313 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 9314 .add(MI.getOperand(0)) // Rn_wb 9315 .add(MI.getOperand(1)) // Rt 9316 .add(MI.getOperand(2)) // Rn 9317 .addImm(Offset) // offset (skip GPR==zero_reg) 9318 .add(MI.getOperand(5)) // pred 9319 .add(MI.getOperand(6)) 9320 .addMemOperand(MMO); 9321 MI.eraseFromParent(); 9322 return BB; 9323 } 9324 case ARM::STRr_preidx: 9325 case ARM::STRBr_preidx: 9326 case ARM::STRH_preidx: { 9327 unsigned NewOpc; 9328 switch (MI.getOpcode()) { 9329 default: llvm_unreachable("unexpected opcode!"); 9330 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 9331 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 9332 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 9333 } 9334 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 9335 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 9336 MIB.add(MI.getOperand(i)); 9337 MI.eraseFromParent(); 9338 return BB; 9339 } 9340 9341 case ARM::tMOVCCr_pseudo: { 9342 // To "insert" a SELECT_CC instruction, we actually have to insert the 9343 // diamond control-flow pattern. 
The incoming instruction knows the 9344 // destination vreg to set, the condition code register to branch on, the 9345 // true/false values to select between, and a branch opcode to use. 9346 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9347 MachineFunction::iterator It = ++BB->getIterator(); 9348 9349 // thisMBB: 9350 // ... 9351 // TrueVal = ... 9352 // cmpTY ccX, r1, r2 9353 // bCC copy1MBB 9354 // fallthrough --> copy0MBB 9355 MachineBasicBlock *thisMBB = BB; 9356 MachineFunction *F = BB->getParent(); 9357 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9358 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9359 F->insert(It, copy0MBB); 9360 F->insert(It, sinkMBB); 9361 9362 // Check whether CPSR is live past the tMOVCCr_pseudo. 9363 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 9364 if (!MI.killsRegister(ARM::CPSR) && 9365 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 9366 copy0MBB->addLiveIn(ARM::CPSR); 9367 sinkMBB->addLiveIn(ARM::CPSR); 9368 } 9369 9370 // Transfer the remainder of BB and its successor edges to sinkMBB. 9371 sinkMBB->splice(sinkMBB->begin(), BB, 9372 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9373 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9374 9375 BB->addSuccessor(copy0MBB); 9376 BB->addSuccessor(sinkMBB); 9377 9378 BuildMI(BB, dl, TII->get(ARM::tBcc)) 9379 .addMBB(sinkMBB) 9380 .addImm(MI.getOperand(3).getImm()) 9381 .addReg(MI.getOperand(4).getReg()); 9382 9383 // copy0MBB: 9384 // %FalseValue = ... 9385 // # fallthrough to sinkMBB 9386 BB = copy0MBB; 9387 9388 // Update machine-CFG edges 9389 BB->addSuccessor(sinkMBB); 9390 9391 // sinkMBB: 9392 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9393 // ... 9394 BB = sinkMBB; 9395 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 9396 .addReg(MI.getOperand(1).getReg()) 9397 .addMBB(copy0MBB) 9398 .addReg(MI.getOperand(2).getReg()) 9399 .addMBB(thisMBB); 9400 9401 MI.eraseFromParent(); // The pseudo instruction is gone now. 9402 return BB; 9403 } 9404 9405 case ARM::BCCi64: 9406 case ARM::BCCZi64: { 9407 // If there is an unconditional branch to the other successor, remove it. 9408 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9409 9410 // Compare both parts that make up the double comparison separately for 9411 // equality. 9412 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 9413 9414 unsigned LHS1 = MI.getOperand(1).getReg(); 9415 unsigned LHS2 = MI.getOperand(2).getReg(); 9416 if (RHSisZero) { 9417 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9418 .addReg(LHS1) 9419 .addImm(0) 9420 .add(predOps(ARMCC::AL)); 9421 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9422 .addReg(LHS2).addImm(0) 9423 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 9424 } else { 9425 unsigned RHS1 = MI.getOperand(3).getReg(); 9426 unsigned RHS2 = MI.getOperand(4).getReg(); 9427 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 9428 .addReg(LHS1) 9429 .addReg(RHS1) 9430 .add(predOps(ARMCC::AL)); 9431 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 9432 .addReg(LHS2).addReg(RHS2) 9433 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 9434 } 9435 9436 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 9437 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 9438 if (MI.getOperand(0).getImm() == ARMCC::NE) 9439 std::swap(destMBB, exitMBB); 9440 9441 BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2Bcc : ARM::Bcc)) 9442 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 9443 if (isThumb2) 9444 BuildMI(BB, dl, TII->get(ARM::t2B)) 9445 .addMBB(exitMBB) 9446 .add(predOps(ARMCC::AL)); 9447 else 9448 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 9449 9450 MI.eraseFromParent(); // The pseudo instruction is gone now. 9451 return BB; 9452 } 9453 9454 case ARM::Int_eh_sjlj_setjmp: 9455 case ARM::Int_eh_sjlj_setjmp_nofp: 9456 case ARM::tInt_eh_sjlj_setjmp: 9457 case ARM::t2Int_eh_sjlj_setjmp: 9458 case ARM::t2Int_eh_sjlj_setjmp_nofp: 9459 return BB; 9460 9461 case ARM::Int_eh_sjlj_setup_dispatch: 9462 EmitSjLjDispatchBlock(MI, BB); 9463 return BB; 9464 9465 case ARM::ABS: 9466 case ARM::t2ABS: { 9467 // To insert an ABS instruction, we have to insert the 9468 // diamond control-flow pattern. The incoming instruction knows the 9469 // source vreg to test against 0, the destination vreg to set, 9470 // the condition code register to branch on, the 9471 // true/false values to select between, and a branch opcode to use. 9472 // It transforms 9473 // V1 = ABS V0 9474 // into 9475 // V2 = MOVS V0 9476 // BCC (branch to SinkBB if V0 >= 0) 9477 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 9478 // SinkBB: V1 = PHI(V2, V3) 9479 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9480 MachineFunction::iterator BBI = ++BB->getIterator(); 9481 MachineFunction *Fn = BB->getParent(); 9482 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 9483 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 9484 Fn->insert(BBI, RSBBB); 9485 Fn->insert(BBI, SinkBB); 9486 9487 unsigned int ABSSrcReg = MI.getOperand(1).getReg(); 9488 unsigned int ABSDstReg = MI.getOperand(0).getReg(); 9489 bool ABSSrcKIll = MI.getOperand(1).isKill(); 9490 bool isThumb2 = Subtarget->isThumb2(); 9491 MachineRegisterInfo &MRI = Fn->getRegInfo(); 9492 // In Thumb mode S must not be specified if source register is the SP or 9493 // PC and if destination register is the SP, so restrict register class 9494 unsigned NewRsbDstReg = 9495 MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 9496 9497 // Transfer the remainder of BB and its successor edges to sinkMBB. 9498 SinkBB->splice(SinkBB->begin(), BB, 9499 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9500 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 9501 9502 BB->addSuccessor(RSBBB); 9503 BB->addSuccessor(SinkBB); 9504 9505 // fall through to SinkMBB 9506 RSBBB->addSuccessor(SinkBB); 9507 9508 // insert a cmp at the end of BB 9509 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 9510 .addReg(ABSSrcReg) 9511 .addImm(0) 9512 .add(predOps(ARMCC::AL)); 9513 9514 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 9515 BuildMI(BB, dl, 9516 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 9517 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 9518 9519 // insert rsbri in RSBBB 9520 // Note: BCC and rsbri will be converted into predicated rsbmi 9521 // by if-conversion pass 9522 BuildMI(*RSBBB, RSBBB->begin(), dl, 9523 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 9524 .addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0) 9525 .addImm(0) 9526 .add(predOps(ARMCC::AL)) 9527 .add(condCodeOp()); 9528 9529 // insert PHI in SinkBB, 9530 // reuse ABSDstReg to not change uses of ABS instruction 9531 BuildMI(*SinkBB, SinkBB->begin(), dl, 9532 TII->get(ARM::PHI), ABSDstReg) 9533 .addReg(NewRsbDstReg).addMBB(RSBBB) 9534 .addReg(ABSSrcReg).addMBB(BB); 9535 9536 // remove ABS instruction 9537 MI.eraseFromParent(); 9538 9539 // return last added BB 9540 return SinkBB; 9541 } 9542 case ARM::COPY_STRUCT_BYVAL_I32: 9543 ++NumLoopByVals; 9544 return EmitStructByval(MI, BB); 9545 case ARM::WIN__CHKSTK: 9546 return EmitLowered__chkstk(MI, BB); 9547 case ARM::WIN__DBZCHK: 9548 return EmitLowered__dbzchk(MI, BB); 9549 } 9550 } 9551 9552 /// Attaches vregs to MEMCPY that it will use as scratch registers 9553 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 9554 /// instead of as a custom inserter because we need the use list from the SDNode. 9555 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 9556 MachineInstr &MI, const SDNode *Node) { 9557 bool isThumb1 = Subtarget->isThumb1Only(); 9558 9559 DebugLoc DL = MI.getDebugLoc(); 9560 MachineFunction *MF = MI.getParent()->getParent(); 9561 MachineRegisterInfo &MRI = MF->getRegInfo(); 9562 MachineInstrBuilder MIB(*MF, MI); 9563 9564 // If the new dst/src is unused mark it as dead. 9565 if (!Node->hasAnyUseOfValue(0)) { 9566 MI.getOperand(0).setIsDead(true); 9567 } 9568 if (!Node->hasAnyUseOfValue(1)) { 9569 MI.getOperand(1).setIsDead(true); 9570 } 9571 9572 // The MEMCPY both defines and kills the scratch registers. 9573 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 9574 unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 9575 : &ARM::GPRRegClass); 9576 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 9577 } 9578 } 9579 9580 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 9581 SDNode *Node) const { 9582 if (MI.getOpcode() == ARM::MEMCPY) { 9583 attachMEMCPYScratchRegs(Subtarget, MI, Node); 9584 return; 9585 } 9586 9587 const MCInstrDesc *MCID = &MI.getDesc(); 9588 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 9589 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 9590 // operand is still set to noreg. If needed, set the optional operand's 9591 // register to CPSR, and remove the redundant implicit def. 9592 // 9593 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 9594 9595 // Rename pseudo opcodes. 
9596 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 9597 unsigned ccOutIdx; 9598 if (NewOpc) { 9599 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 9600 MCID = &TII->get(NewOpc); 9601 9602 assert(MCID->getNumOperands() == 9603 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 9604 && "converted opcode should be the same except for cc_out" 9605 " (and, on Thumb1, pred)"); 9606 9607 MI.setDesc(*MCID); 9608 9609 // Add the optional cc_out operand 9610 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 9611 9612 // On Thumb1, move all input operands to the end, then add the predicate 9613 if (Subtarget->isThumb1Only()) { 9614 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 9615 MI.addOperand(MI.getOperand(1)); 9616 MI.RemoveOperand(1); 9617 } 9618 9619 // Restore the ties 9620 for (unsigned i = MI.getNumOperands(); i--;) { 9621 const MachineOperand& op = MI.getOperand(i); 9622 if (op.isReg() && op.isUse()) { 9623 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 9624 if (DefIdx != -1) 9625 MI.tieOperands(DefIdx, i); 9626 } 9627 } 9628 9629 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 9630 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 9631 ccOutIdx = 1; 9632 } else 9633 ccOutIdx = MCID->getNumOperands() - 1; 9634 } else 9635 ccOutIdx = MCID->getNumOperands() - 1; 9636 9637 // Any ARM instruction that sets the 's' bit should specify an optional 9638 // "cc_out" operand in the last operand position. 9639 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 9640 assert(!NewOpc && "Optional cc_out operand required"); 9641 return; 9642 } 9643 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 9644 // since we already have an optional CPSR def. 9645 bool definesCPSR = false; 9646 bool deadCPSR = false; 9647 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 9648 ++i) { 9649 const MachineOperand &MO = MI.getOperand(i); 9650 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 9651 definesCPSR = true; 9652 if (MO.isDead()) 9653 deadCPSR = true; 9654 MI.RemoveOperand(i); 9655 break; 9656 } 9657 } 9658 if (!definesCPSR) { 9659 assert(!NewOpc && "Optional cc_out operand required"); 9660 return; 9661 } 9662 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 9663 if (deadCPSR) { 9664 assert(!MI.getOperand(ccOutIdx).getReg() && 9665 "expect uninitialized optional cc_out operand"); 9666 // Thumb1 instructions must have the S bit even if the CPSR is dead. 9667 if (!Subtarget->isThumb1Only()) 9668 return; 9669 } 9670 9671 // If this instruction was defined with an optional CPSR def and its dag node 9672 // had a live implicit CPSR def, then activate the optional CPSR def. 9673 MachineOperand &MO = MI.getOperand(ccOutIdx); 9674 MO.setReg(ARM::CPSR); 9675 MO.setIsDef(true); 9676 } 9677 9678 //===----------------------------------------------------------------------===// 9679 // ARM Optimization Hooks 9680 //===----------------------------------------------------------------------===// 9681 9682 // Helper function that checks if N is a null or all ones constant. 9683 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 9684 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 9685 } 9686 9687 // Return true if N is conditionally 0 or all ones. 
9688 // Detects these expressions where cc is an i1 value: 9689 // 9690 // (select cc 0, y) [AllOnes=0] 9691 // (select cc y, 0) [AllOnes=0] 9692 // (zext cc) [AllOnes=0] 9693 // (sext cc) [AllOnes=0/1] 9694 // (select cc -1, y) [AllOnes=1] 9695 // (select cc y, -1) [AllOnes=1] 9696 // 9697 // Invert is set when N is the null/all ones constant when CC is false. 9698 // OtherOp is set to the alternative value of N. 9699 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 9700 SDValue &CC, bool &Invert, 9701 SDValue &OtherOp, 9702 SelectionDAG &DAG) { 9703 switch (N->getOpcode()) { 9704 default: return false; 9705 case ISD::SELECT: { 9706 CC = N->getOperand(0); 9707 SDValue N1 = N->getOperand(1); 9708 SDValue N2 = N->getOperand(2); 9709 if (isZeroOrAllOnes(N1, AllOnes)) { 9710 Invert = false; 9711 OtherOp = N2; 9712 return true; 9713 } 9714 if (isZeroOrAllOnes(N2, AllOnes)) { 9715 Invert = true; 9716 OtherOp = N1; 9717 return true; 9718 } 9719 return false; 9720 } 9721 case ISD::ZERO_EXTEND: 9722 // (zext cc) can never be the all ones value. 9723 if (AllOnes) 9724 return false; 9725 LLVM_FALLTHROUGH; 9726 case ISD::SIGN_EXTEND: { 9727 SDLoc dl(N); 9728 EVT VT = N->getValueType(0); 9729 CC = N->getOperand(0); 9730 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 9731 return false; 9732 Invert = !AllOnes; 9733 if (AllOnes) 9734 // When looking for an AllOnes constant, N is an sext, and the 'other' 9735 // value is 0. 9736 OtherOp = DAG.getConstant(0, dl, VT); 9737 else if (N->getOpcode() == ISD::ZERO_EXTEND) 9738 // When looking for a 0 constant, N can be zext or sext. 9739 OtherOp = DAG.getConstant(1, dl, VT); 9740 else 9741 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 9742 VT); 9743 return true; 9744 } 9745 } 9746 } 9747 9748 // Combine a constant select operand into its use: 9749 // 9750 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 9751 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 9752 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 9753 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 9754 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 9755 // 9756 // The transform is rejected if the select doesn't have a constant operand that 9757 // is null, or all ones when AllOnes is set. 9758 // 9759 // Also recognize sext/zext from i1: 9760 // 9761 // (add (zext cc), x) -> (select cc (add x, 1), x) 9762 // (add (sext cc), x) -> (select cc (add x, -1), x) 9763 // 9764 // These transformations eventually create predicated instructions. 9765 // 9766 // @param N The node to transform. 9767 // @param Slct The N operand that is a select. 9768 // @param OtherOp The other N operand (x above). 9769 // @param DCI Context. 9770 // @param AllOnes Require the select constant to be all ones instead of null. 9771 // @returns The new node, or SDValue() on failure. 9772 static 9773 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 9774 TargetLowering::DAGCombinerInfo &DCI, 9775 bool AllOnes = false) { 9776 SelectionDAG &DAG = DCI.DAG; 9777 EVT VT = N->getValueType(0); 9778 SDValue NonConstantVal; 9779 SDValue CCOp; 9780 bool SwapSelectOps; 9781 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 9782 NonConstantVal, DAG)) 9783 return SDValue(); 9784 9785 // Slct is now know to be the desired identity constant when CC is true. 
9786 SDValue TrueVal = OtherOp; 9787 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 9788 OtherOp, NonConstantVal); 9789 // Unless SwapSelectOps says CC should be false. 9790 if (SwapSelectOps) 9791 std::swap(TrueVal, FalseVal); 9792 9793 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 9794 CCOp, TrueVal, FalseVal); 9795 } 9796 9797 // Attempt combineSelectAndUse on each operand of a commutative operator N. 9798 static 9799 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 9800 TargetLowering::DAGCombinerInfo &DCI) { 9801 SDValue N0 = N->getOperand(0); 9802 SDValue N1 = N->getOperand(1); 9803 if (N0.getNode()->hasOneUse()) 9804 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 9805 return Result; 9806 if (N1.getNode()->hasOneUse()) 9807 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 9808 return Result; 9809 return SDValue(); 9810 } 9811 9812 static bool IsVUZPShuffleNode(SDNode *N) { 9813 // VUZP shuffle node. 9814 if (N->getOpcode() == ARMISD::VUZP) 9815 return true; 9816 9817 // "VUZP" on i32 is an alias for VTRN. 9818 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 9819 return true; 9820 9821 return false; 9822 } 9823 9824 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 9825 TargetLowering::DAGCombinerInfo &DCI, 9826 const ARMSubtarget *Subtarget) { 9827 // Look for ADD(VUZP.0, VUZP.1). 9828 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 9829 N0 == N1) 9830 return SDValue(); 9831 9832 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 9833 if (!N->getValueType(0).is64BitVector()) 9834 return SDValue(); 9835 9836 // Generate vpadd. 9837 SelectionDAG &DAG = DCI.DAG; 9838 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9839 SDLoc dl(N); 9840 SDNode *Unzip = N0.getNode(); 9841 EVT VT = N->getValueType(0); 9842 9843 SmallVector<SDValue, 8> Ops; 9844 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 9845 TLI.getPointerTy(DAG.getDataLayout()))); 9846 Ops.push_back(Unzip->getOperand(0)); 9847 Ops.push_back(Unzip->getOperand(1)); 9848 9849 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 9850 } 9851 9852 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 9853 TargetLowering::DAGCombinerInfo &DCI, 9854 const ARMSubtarget *Subtarget) { 9855 // Check for two extended operands. 9856 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 9857 N1.getOpcode() == ISD::SIGN_EXTEND) && 9858 !(N0.getOpcode() == ISD::ZERO_EXTEND && 9859 N1.getOpcode() == ISD::ZERO_EXTEND)) 9860 return SDValue(); 9861 9862 SDValue N00 = N0.getOperand(0); 9863 SDValue N10 = N1.getOperand(0); 9864 9865 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 9866 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 9867 N00 == N10) 9868 return SDValue(); 9869 9870 // We only recognize Q register paddl here; this can't be reached until 9871 // after type legalization. 9872 if (!N00.getValueType().is64BitVector() || 9873 !N0.getValueType().is128BitVector()) 9874 return SDValue(); 9875 9876 // Generate vpaddl. 9877 SelectionDAG &DAG = DCI.DAG; 9878 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9879 SDLoc dl(N); 9880 EVT VT = N->getValueType(0); 9881 9882 SmallVector<SDValue, 8> Ops; 9883 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
9884   unsigned Opcode;
9885   if (N0.getOpcode() == ISD::SIGN_EXTEND)
9886     Opcode = Intrinsic::arm_neon_vpaddls;
9887   else
9888     Opcode = Intrinsic::arm_neon_vpaddlu;
9889   Ops.push_back(DAG.getConstant(Opcode, dl,
9890                                 TLI.getPointerTy(DAG.getDataLayout())));
9891   EVT ElemTy = N00.getValueType().getVectorElementType();
9892   unsigned NumElts = VT.getVectorNumElements();
9893   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
9894   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
9895                                N00.getOperand(0), N00.getOperand(1));
9896   Ops.push_back(Concat);
9897
9898   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
9899 }
9900
9901 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
9902 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
9903 // much easier to match.
9904 static SDValue
9905 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9906                                TargetLowering::DAGCombinerInfo &DCI,
9907                                const ARMSubtarget *Subtarget) {
9908   // Only perform the optimization after legalization and if NEON is
9909   // available. We also expect both operands to be BUILD_VECTORs.
9910   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
9911       || N0.getOpcode() != ISD::BUILD_VECTOR
9912       || N1.getOpcode() != ISD::BUILD_VECTOR)
9913     return SDValue();
9914
9915   // Check the output type since VPADDL operand elements can only be 8, 16,
9916   // or 32.
9916   EVT VT = N->getValueType(0);
9917   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
9918     return SDValue();
9919
9920   // Check that the vector operands are of the right form.
9921   // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR
9922   // operands, where N is the size of the formed vector.
9923   // Each EXTRACT_VECTOR should have the same input vector and an odd or even
9924   // index such that we have a pairwise add pattern.
9925
9926   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9927   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9928     return SDValue();
9929   SDValue Vec = N0->getOperand(0)->getOperand(0);
9930   SDNode *V = Vec.getNode();
9931   unsigned nextIndex = 0;
9932
9933   // For each operand of the ADD that is a BUILD_VECTOR, check whether each
9934   // of its operands is an EXTRACT_VECTOR with the same input vector and the
9935   // appropriate index.
9936   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9937     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9938         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9939
9940       SDValue ExtVec0 = N0->getOperand(i);
9941       SDValue ExtVec1 = N1->getOperand(i);
9942
9943       // The first operand is the vector; verify it's the same.
9944       if (V != ExtVec0->getOperand(0).getNode() ||
9945           V != ExtVec1->getOperand(0).getNode())
9946         return SDValue();
9947
9948       // The second is the constant index; verify it's correct.
9949       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9950       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9951
9952       // For the constants, we want to see all the even or all the odd
9953       // indices.
9953       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9954           || C1->getZExtValue() != nextIndex+1)
9955         return SDValue();
9956
9957       // Increment the index.
9958       nextIndex += 2;
9959     } else
9960       return SDValue();
9961   }
9962
9963   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make
9964   // sure we're using the entire input vector, otherwise there's a
9965   // size/legality mismatch somewhere.
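  // Illustrative instance of the pattern being matched (schematic only): with
  // an input vector Vec of type v8i8 and a v4i32 add,
  //   N0 = BUILD_VECTOR (extract Vec, 0), (extract Vec, 2),
  //                     (extract Vec, 4), (extract Vec, 6)
  //   N1 = BUILD_VECTOR (extract Vec, 1), (extract Vec, 3),
  //                     (extract Vec, 5), (extract Vec, 7)
  // so N0 + N1 adds adjacent lanes of Vec, which is what vpaddl computes; the
  // add is then rewritten below as a single NEON vpaddl intrinsic on Vec,
  // extended or truncated if the widened type differs from the add's type.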
9966 if (nextIndex != Vec.getValueType().getVectorNumElements() || 9967 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 9968 return SDValue(); 9969 9970 // Create VPADDL node. 9971 SelectionDAG &DAG = DCI.DAG; 9972 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9973 9974 SDLoc dl(N); 9975 9976 // Build operand list. 9977 SmallVector<SDValue, 8> Ops; 9978 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 9979 TLI.getPointerTy(DAG.getDataLayout()))); 9980 9981 // Input is the vector. 9982 Ops.push_back(Vec); 9983 9984 // Get widened type and narrowed type. 9985 MVT widenType; 9986 unsigned numElem = VT.getVectorNumElements(); 9987 9988 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 9989 switch (inputLaneType.getSimpleVT().SimpleTy) { 9990 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 9991 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 9992 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 9993 default: 9994 llvm_unreachable("Invalid vector element type for padd optimization."); 9995 } 9996 9997 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 9998 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 9999 return DAG.getNode(ExtOp, dl, VT, tmp); 10000 } 10001 10002 static SDValue findMUL_LOHI(SDValue V) { 10003 if (V->getOpcode() == ISD::UMUL_LOHI || 10004 V->getOpcode() == ISD::SMUL_LOHI) 10005 return V; 10006 return SDValue(); 10007 } 10008 10009 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 10010 TargetLowering::DAGCombinerInfo &DCI, 10011 const ARMSubtarget *Subtarget) { 10012 if (Subtarget->isThumb()) { 10013 if (!Subtarget->hasDSP()) 10014 return SDValue(); 10015 } else if (!Subtarget->hasV5TEOps()) 10016 return SDValue(); 10017 10018 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 10019 // accumulates the product into a 64-bit value. 
The 16-bit values will 10020 // be sign extended somehow or SRA'd into 32-bit values 10021 // (addc (adde (mul 16bit, 16bit), lo), hi) 10022 SDValue Mul = AddcNode->getOperand(0); 10023 SDValue Lo = AddcNode->getOperand(1); 10024 if (Mul.getOpcode() != ISD::MUL) { 10025 Lo = AddcNode->getOperand(0); 10026 Mul = AddcNode->getOperand(1); 10027 if (Mul.getOpcode() != ISD::MUL) 10028 return SDValue(); 10029 } 10030 10031 SDValue SRA = AddeNode->getOperand(0); 10032 SDValue Hi = AddeNode->getOperand(1); 10033 if (SRA.getOpcode() != ISD::SRA) { 10034 SRA = AddeNode->getOperand(1); 10035 Hi = AddeNode->getOperand(0); 10036 if (SRA.getOpcode() != ISD::SRA) 10037 return SDValue(); 10038 } 10039 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 10040 if (Const->getZExtValue() != 31) 10041 return SDValue(); 10042 } else 10043 return SDValue(); 10044 10045 if (SRA.getOperand(0) != Mul) 10046 return SDValue(); 10047 10048 SelectionDAG &DAG = DCI.DAG; 10049 SDLoc dl(AddcNode); 10050 unsigned Opcode = 0; 10051 SDValue Op0; 10052 SDValue Op1; 10053 10054 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 10055 Opcode = ARMISD::SMLALBB; 10056 Op0 = Mul.getOperand(0); 10057 Op1 = Mul.getOperand(1); 10058 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 10059 Opcode = ARMISD::SMLALBT; 10060 Op0 = Mul.getOperand(0); 10061 Op1 = Mul.getOperand(1).getOperand(0); 10062 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 10063 Opcode = ARMISD::SMLALTB; 10064 Op0 = Mul.getOperand(0).getOperand(0); 10065 Op1 = Mul.getOperand(1); 10066 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 10067 Opcode = ARMISD::SMLALTT; 10068 Op0 = Mul->getOperand(0).getOperand(0); 10069 Op1 = Mul->getOperand(1).getOperand(0); 10070 } 10071 10072 if (!Op0 || !Op1) 10073 return SDValue(); 10074 10075 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 10076 Op0, Op1, Lo, Hi); 10077 // Replace the ADDs' nodes uses by the MLA node's values. 10078 SDValue HiMLALResult(SMLAL.getNode(), 1); 10079 SDValue LoMLALResult(SMLAL.getNode(), 0); 10080 10081 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 10082 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 10083 10084 // Return original node to notify the driver to stop replacing. 10085 SDValue resNode(AddcNode, 0); 10086 return resNode; 10087 } 10088 10089 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 10090 TargetLowering::DAGCombinerInfo &DCI, 10091 const ARMSubtarget *Subtarget) { 10092 // Look for multiply add opportunities. 10093 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 10094 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 10095 // a glue link from the first add to the second add. 10096 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 10097 // a S/UMLAL instruction. 10098 // UMUL_LOHI 10099 // / :lo \ :hi 10100 // V \ [no multiline comment] 10101 // loAdd -> ADDC | 10102 // \ :carry / 10103 // V V 10104 // ADDE <- hiAdd 10105 // 10106 // In the special case where only the higher part of a signed result is used 10107 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 10108 // a constant with the exact value of 0x80000000, we recognize we are dealing 10109 // with a "rounded multiply and add" (or subtract) and transform it into 10110 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 
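  // Schematic example of the basic replacement performed below (node names
  // are illustrative, not actual identifiers):
  //   lo, hi       = ISD::UMUL_LOHI a, b
  //   loSum, carry = ARMISD::ADDC lo, x
  //   hiSum        = ARMISD::ADDE hi, y, carry
  // becomes
  //   loSum, hiSum = ARMISD::UMLAL a, b, x, y
  // (SMLAL for the signed ISD::SMUL_LOHI case). The SMMLAR/SMMLSR forms are
  // used instead when the multiply is signed, the outgoing carry of the
  // ADDE/SUBE is unused, and the low-half addend is the rounding constant
  // 0x80000000, as checked further down.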
10111 10112 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 10113 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 10114 "Expect an ADDE or SUBE"); 10115 10116 assert(AddeSubeNode->getNumOperands() == 3 && 10117 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 10118 "ADDE node has the wrong inputs"); 10119 10120 // Check that we are chained to the right ADDC or SUBC node. 10121 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 10122 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 10123 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 10124 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 10125 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 10126 return SDValue(); 10127 10128 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 10129 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 10130 10131 // Check if the two operands are from the same mul_lohi node. 10132 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 10133 return SDValue(); 10134 10135 assert(AddcSubcNode->getNumValues() == 2 && 10136 AddcSubcNode->getValueType(0) == MVT::i32 && 10137 "Expect ADDC with two result values. First: i32"); 10138 10139 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 10140 // maybe a SMLAL which multiplies two 16-bit values. 10141 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 10142 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 10143 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 10144 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 10145 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 10146 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 10147 10148 // Check for the triangle shape. 10149 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 10150 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 10151 10152 // Make sure that the ADDE/SUBE operands are not coming from the same node. 10153 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 10154 return SDValue(); 10155 10156 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 10157 bool IsLeftOperandMUL = false; 10158 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 10159 if (MULOp == SDValue()) 10160 MULOp = findMUL_LOHI(AddeSubeOp1); 10161 else 10162 IsLeftOperandMUL = true; 10163 if (MULOp == SDValue()) 10164 return SDValue(); 10165 10166 // Figure out the right opcode. 10167 unsigned Opc = MULOp->getOpcode(); 10168 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 10169 10170 // Figure out the high and low input values to the MLAL node. 10171 SDValue *HiAddSub = nullptr; 10172 SDValue *LoMul = nullptr; 10173 SDValue *LowAddSub = nullptr; 10174 10175 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 10176 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 10177 return SDValue(); 10178 10179 if (IsLeftOperandMUL) 10180 HiAddSub = &AddeSubeOp1; 10181 else 10182 HiAddSub = &AddeSubeOp0; 10183 10184 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 10185 // whose low result is fed to the ADDC/SUBC we are checking. 10186 10187 if (AddcSubcOp0 == MULOp.getValue(0)) { 10188 LoMul = &AddcSubcOp0; 10189 LowAddSub = &AddcSubcOp1; 10190 } 10191 if (AddcSubcOp1 == MULOp.getValue(0)) { 10192 LoMul = &AddcSubcOp1; 10193 LowAddSub = &AddcSubcOp0; 10194 } 10195 10196 if (!LoMul) 10197 return SDValue(); 10198 10199 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 10200 // the replacement below will create a cycle. 
10201   if (AddcSubcNode == HiAddSub->getNode() ||
10202       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
10203     return SDValue();
10204
10205   // Create the merged node.
10206   SelectionDAG &DAG = DCI.DAG;
10207
10208   // Start building the operand list.
10209   SmallVector<SDValue, 8> Ops;
10210   Ops.push_back(LoMul->getOperand(0));
10211   Ops.push_back(LoMul->getOperand(1));
10212
10213   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to
10214   // be the case, we must be doing a signed multiplication and only use the
10215   // higher part of the result of the MLAL; furthermore, the LowAddSub must
10216   // be a constant addition or subtraction with the value 0x80000000.
10217   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
10218       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
10219       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
10220       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
10221           0x80000000) {
10222     Ops.push_back(*HiAddSub);
10223     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
10224       FinalOpc = ARMISD::SMMLSR;
10225     } else {
10226       FinalOpc = ARMISD::SMMLAR;
10227     }
10228     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
10229     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
10230
10231     return SDValue(AddeSubeNode, 0);
10232   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
10233     // SMMLS is generated during instruction selection and the rest of this
10234     // function cannot handle the case where AddcSubcNode is a SUBC.
10235     return SDValue();
10236
10237   // Finish building the operand list for {U/S}MLAL.
10238   Ops.push_back(*LowAddSub);
10239   Ops.push_back(*HiAddSub);
10240
10241   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
10242                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
10243
10244   // Replace the ADD nodes' uses with the MLAL node's values.
10245   SDValue HiMLALResult(MLALNode.getNode(), 1);
10246   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
10247
10248   SDValue LoMLALResult(MLALNode.getNode(), 0);
10249   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
10250
10251   // Return the original node to notify the driver to stop replacing.
10252   return SDValue(AddeSubeNode, 0);
10253 }
10254
10255 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
10256                                       TargetLowering::DAGCombinerInfo &DCI,
10257                                       const ARMSubtarget *Subtarget) {
10258   // UMAAL is similar to UMLAL except that it adds two unsigned values.
10259   // While trying to combine for the other MLAL nodes, first search for the
10260   // chance to use UMAAL. Check if Addc uses a node which has already
10261   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
10262   // as the addend, and it's handled in PerformUMLALCombine.
10263
10264   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
10265     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
10266
10267   // Check that we have a glued ADDC node.
10268   SDNode *AddcNode = AddeNode->getOperand(2).getNode();
10269   if (AddcNode->getOpcode() != ARMISD::ADDC)
10270     return SDValue();
10271
10272   // Find the converted UMAAL or quit if it doesn't exist.
10273 SDNode *UmlalNode = nullptr; 10274 SDValue AddHi; 10275 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 10276 UmlalNode = AddcNode->getOperand(0).getNode(); 10277 AddHi = AddcNode->getOperand(1); 10278 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 10279 UmlalNode = AddcNode->getOperand(1).getNode(); 10280 AddHi = AddcNode->getOperand(0); 10281 } else { 10282 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 10283 } 10284 10285 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 10286 // the ADDC as well as Zero. 10287 if (!isNullConstant(UmlalNode->getOperand(3))) 10288 return SDValue(); 10289 10290 if ((isNullConstant(AddeNode->getOperand(0)) && 10291 AddeNode->getOperand(1).getNode() == UmlalNode) || 10292 (AddeNode->getOperand(0).getNode() == UmlalNode && 10293 isNullConstant(AddeNode->getOperand(1)))) { 10294 SelectionDAG &DAG = DCI.DAG; 10295 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 10296 UmlalNode->getOperand(2), AddHi }; 10297 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 10298 DAG.getVTList(MVT::i32, MVT::i32), Ops); 10299 10300 // Replace the ADDs' nodes uses by the UMAAL node's values. 10301 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 10302 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 10303 10304 // Return original node to notify the driver to stop replacing. 10305 return SDValue(AddeNode, 0); 10306 } 10307 return SDValue(); 10308 } 10309 10310 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 10311 const ARMSubtarget *Subtarget) { 10312 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 10313 return SDValue(); 10314 10315 // Check that we have a pair of ADDC and ADDE as operands. 10316 // Both addends of the ADDE must be zero. 10317 SDNode* AddcNode = N->getOperand(2).getNode(); 10318 SDNode* AddeNode = N->getOperand(3).getNode(); 10319 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 10320 (AddeNode->getOpcode() == ARMISD::ADDE) && 10321 isNullConstant(AddeNode->getOperand(0)) && 10322 isNullConstant(AddeNode->getOperand(1)) && 10323 (AddeNode->getOperand(2).getNode() == AddcNode)) 10324 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 10325 DAG.getVTList(MVT::i32, MVT::i32), 10326 {N->getOperand(0), N->getOperand(1), 10327 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 10328 else 10329 return SDValue(); 10330 } 10331 10332 static SDValue PerformAddcSubcCombine(SDNode *N, 10333 TargetLowering::DAGCombinerInfo &DCI, 10334 const ARMSubtarget *Subtarget) { 10335 SelectionDAG &DAG(DCI.DAG); 10336 10337 if (N->getOpcode() == ARMISD::SUBC) { 10338 // (SUBC (ADDE 0, 0, C), 1) -> C 10339 SDValue LHS = N->getOperand(0); 10340 SDValue RHS = N->getOperand(1); 10341 if (LHS->getOpcode() == ARMISD::ADDE && 10342 isNullConstant(LHS->getOperand(0)) && 10343 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 10344 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 10345 } 10346 } 10347 10348 if (Subtarget->isThumb1Only()) { 10349 SDValue RHS = N->getOperand(1); 10350 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10351 int32_t imm = C->getSExtValue(); 10352 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 10353 SDLoc DL(N); 10354 RHS = DAG.getConstant(-imm, DL, MVT::i32); 10355 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? 
ARMISD::SUBC 10356 : ARMISD::ADDC; 10357 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 10358 } 10359 } 10360 } 10361 10362 return SDValue(); 10363 } 10364 10365 static SDValue PerformAddeSubeCombine(SDNode *N, 10366 TargetLowering::DAGCombinerInfo &DCI, 10367 const ARMSubtarget *Subtarget) { 10368 if (Subtarget->isThumb1Only()) { 10369 SelectionDAG &DAG = DCI.DAG; 10370 SDValue RHS = N->getOperand(1); 10371 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 10372 int64_t imm = C->getSExtValue(); 10373 if (imm < 0) { 10374 SDLoc DL(N); 10375 10376 // The with-carry-in form matches bitwise not instead of the negation. 10377 // Effectively, the inverse interpretation of the carry flag already 10378 // accounts for part of the negation. 10379 RHS = DAG.getConstant(~imm, DL, MVT::i32); 10380 10381 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE 10382 : ARMISD::ADDE; 10383 return DAG.getNode(Opcode, DL, N->getVTList(), 10384 N->getOperand(0), RHS, N->getOperand(2)); 10385 } 10386 } 10387 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 10388 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 10389 } 10390 return SDValue(); 10391 } 10392 10393 static SDValue PerformABSCombine(SDNode *N, 10394 TargetLowering::DAGCombinerInfo &DCI, 10395 const ARMSubtarget *Subtarget) { 10396 SDValue res; 10397 SelectionDAG &DAG = DCI.DAG; 10398 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10399 10400 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 10401 return SDValue(); 10402 10403 if (!TLI.expandABS(N, res, DAG)) 10404 return SDValue(); 10405 10406 return res; 10407 } 10408 10409 /// PerformADDECombine - Target-specific dag combine transform from 10410 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 10411 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 10412 static SDValue PerformADDECombine(SDNode *N, 10413 TargetLowering::DAGCombinerInfo &DCI, 10414 const ARMSubtarget *Subtarget) { 10415 // Only ARM and Thumb2 support UMLAL/SMLAL. 10416 if (Subtarget->isThumb1Only()) 10417 return PerformAddeSubeCombine(N, DCI, Subtarget); 10418 10419 // Only perform the checks after legalize when the pattern is available. 10420 if (DCI.isBeforeLegalize()) return SDValue(); 10421 10422 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 10423 } 10424 10425 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 10426 /// operands N0 and N1. This is a helper for PerformADDCombine that is 10427 /// called with the default operands, and if that fails, with commuted 10428 /// operands. 10429 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 10430 TargetLowering::DAGCombinerInfo &DCI, 10431 const ARMSubtarget *Subtarget){ 10432 // Attempt to create vpadd for this add. 10433 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 10434 return Result; 10435 10436 // Attempt to create vpaddl for this add. 
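  // (VPADDL pairwise-adds adjacent vector elements into lanes of twice the
  // width, e.g. four i16 lanes become two i32 sums in a D register.)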
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}

bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}

bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but that requires both
  // operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl.

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
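  // For example, y + ((x << 2) + 20) can be unfolded back into
  // y + ((x + 5) << 2), provided each user below can fold the shl into a
  // shifted-register operand.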
10524 for (auto U : N->uses()) { 10525 switch(U->getOpcode()) { 10526 default: 10527 return SDValue(); 10528 case ISD::SUB: 10529 case ISD::ADD: 10530 case ISD::AND: 10531 case ISD::OR: 10532 case ISD::XOR: 10533 case ISD::SETCC: 10534 case ARMISD::CMP: 10535 // Check that the user isn't already using a constant because there 10536 // aren't any instructions that support an immediate operand and a 10537 // shifted operand. 10538 if (isa<ConstantSDNode>(U->getOperand(0)) || 10539 isa<ConstantSDNode>(U->getOperand(1))) 10540 return SDValue(); 10541 10542 // Check that it's not already using a shift. 10543 if (U->getOperand(0).getOpcode() == ISD::SHL || 10544 U->getOperand(1).getOpcode() == ISD::SHL) 10545 return SDValue(); 10546 break; 10547 } 10548 } 10549 10550 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 10551 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 10552 return SDValue(); 10553 10554 if (N->getOperand(0).getOpcode() != ISD::SHL) 10555 return SDValue(); 10556 10557 SDValue SHL = N->getOperand(0); 10558 10559 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10560 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 10561 if (!C1ShlC2 || !C2) 10562 return SDValue(); 10563 10564 APInt C2Int = C2->getAPIntValue(); 10565 APInt C1Int = C1ShlC2->getAPIntValue(); 10566 10567 // Check that performing a lshr will not lose any information. 10568 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 10569 C2Int.getBitWidth() - C2->getZExtValue()); 10570 if ((C1Int & Mask) != C1Int) 10571 return SDValue(); 10572 10573 // Shift the first constant. 10574 C1Int.lshrInPlace(C2Int); 10575 10576 // The immediates are encoded as an 8-bit value that can be rotated. 10577 auto LargeImm = [](const APInt &Imm) { 10578 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 10579 return Imm.getBitWidth() - Zeros > 8; 10580 }; 10581 10582 if (LargeImm(C1Int) || LargeImm(C2Int)) 10583 return SDValue(); 10584 10585 SelectionDAG &DAG = DCI.DAG; 10586 SDLoc dl(N); 10587 SDValue X = SHL.getOperand(0); 10588 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 10589 DAG.getConstant(C1Int, dl, MVT::i32)); 10590 // Shift left to compensate for the lshr of C1Int. 10591 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 10592 10593 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 10594 SHL.dump(); N->dump()); 10595 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 10596 return Res; 10597 } 10598 10599 10600 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 10601 /// 10602 static SDValue PerformADDCombine(SDNode *N, 10603 TargetLowering::DAGCombinerInfo &DCI, 10604 const ARMSubtarget *Subtarget) { 10605 SDValue N0 = N->getOperand(0); 10606 SDValue N1 = N->getOperand(1); 10607 10608 // Only works one way, because it needs an immediate operand. 10609 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 10610 return Result; 10611 10612 // First try with the default operand order. 10613 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 10614 return Result; 10615 10616 // If that didn't work, try again with the operands commuted. 10617 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 10618 } 10619 10620 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 
10621 /// 10622 static SDValue PerformSUBCombine(SDNode *N, 10623 TargetLowering::DAGCombinerInfo &DCI) { 10624 SDValue N0 = N->getOperand(0); 10625 SDValue N1 = N->getOperand(1); 10626 10627 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 10628 if (N1.getNode()->hasOneUse()) 10629 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 10630 return Result; 10631 10632 return SDValue(); 10633 } 10634 10635 /// PerformVMULCombine 10636 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 10637 /// special multiplier accumulator forwarding. 10638 /// vmul d3, d0, d2 10639 /// vmla d3, d1, d2 10640 /// is faster than 10641 /// vadd d3, d0, d1 10642 /// vmul d3, d3, d2 10643 // However, for (A + B) * (A + B), 10644 // vadd d2, d0, d1 10645 // vmul d3, d0, d2 10646 // vmla d3, d1, d2 10647 // is slower than 10648 // vadd d2, d0, d1 10649 // vmul d3, d2, d2 10650 static SDValue PerformVMULCombine(SDNode *N, 10651 TargetLowering::DAGCombinerInfo &DCI, 10652 const ARMSubtarget *Subtarget) { 10653 if (!Subtarget->hasVMLxForwarding()) 10654 return SDValue(); 10655 10656 SelectionDAG &DAG = DCI.DAG; 10657 SDValue N0 = N->getOperand(0); 10658 SDValue N1 = N->getOperand(1); 10659 unsigned Opcode = N0.getOpcode(); 10660 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10661 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 10662 Opcode = N1.getOpcode(); 10663 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 10664 Opcode != ISD::FADD && Opcode != ISD::FSUB) 10665 return SDValue(); 10666 std::swap(N0, N1); 10667 } 10668 10669 if (N0 == N1) 10670 return SDValue(); 10671 10672 EVT VT = N->getValueType(0); 10673 SDLoc DL(N); 10674 SDValue N00 = N0->getOperand(0); 10675 SDValue N01 = N0->getOperand(1); 10676 return DAG.getNode(Opcode, DL, VT, 10677 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 10678 DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 10679 } 10680 10681 static SDValue PerformMULCombine(SDNode *N, 10682 TargetLowering::DAGCombinerInfo &DCI, 10683 const ARMSubtarget *Subtarget) { 10684 SelectionDAG &DAG = DCI.DAG; 10685 10686 if (Subtarget->isThumb1Only()) 10687 return SDValue(); 10688 10689 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10690 return SDValue(); 10691 10692 EVT VT = N->getValueType(0); 10693 if (VT.is64BitVector() || VT.is128BitVector()) 10694 return PerformVMULCombine(N, DCI, Subtarget); 10695 if (VT != MVT::i32) 10696 return SDValue(); 10697 10698 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10699 if (!C) 10700 return SDValue(); 10701 10702 int64_t MulAmt = C->getSExtValue(); 10703 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 10704 10705 ShiftAmt = ShiftAmt & (32 - 1); 10706 SDValue V = N->getOperand(0); 10707 SDLoc DL(N); 10708 10709 SDValue Res; 10710 MulAmt >>= ShiftAmt; 10711 10712 if (MulAmt >= 0) { 10713 if (isPowerOf2_32(MulAmt - 1)) { 10714 // (mul x, 2^N + 1) => (add (shl x, N), x) 10715 Res = DAG.getNode(ISD::ADD, DL, VT, 10716 V, 10717 DAG.getNode(ISD::SHL, DL, VT, 10718 V, 10719 DAG.getConstant(Log2_32(MulAmt - 1), DL, 10720 MVT::i32))); 10721 } else if (isPowerOf2_32(MulAmt + 1)) { 10722 // (mul x, 2^N - 1) => (sub (shl x, N), x) 10723 Res = DAG.getNode(ISD::SUB, DL, VT, 10724 DAG.getNode(ISD::SHL, DL, VT, 10725 V, 10726 DAG.getConstant(Log2_32(MulAmt + 1), DL, 10727 MVT::i32)), 10728 V); 10729 } else 10730 return SDValue(); 10731 } else { 10732 uint64_t MulAmtAbs = -MulAmt; 10733 if (isPowerOf2_32(MulAmtAbs + 1)) { 10734 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 10735 Res = DAG.getNode(ISD::SUB, DL, 
VT, 10736 V, 10737 DAG.getNode(ISD::SHL, DL, VT, 10738 V, 10739 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 10740 MVT::i32))); 10741 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 10742 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 10743 Res = DAG.getNode(ISD::ADD, DL, VT, 10744 V, 10745 DAG.getNode(ISD::SHL, DL, VT, 10746 V, 10747 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 10748 MVT::i32))); 10749 Res = DAG.getNode(ISD::SUB, DL, VT, 10750 DAG.getConstant(0, DL, MVT::i32), Res); 10751 } else 10752 return SDValue(); 10753 } 10754 10755 if (ShiftAmt != 0) 10756 Res = DAG.getNode(ISD::SHL, DL, VT, 10757 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 10758 10759 // Do not add new nodes to DAG combiner worklist. 10760 DCI.CombineTo(N, Res, false); 10761 return SDValue(); 10762 } 10763 10764 static SDValue CombineANDShift(SDNode *N, 10765 TargetLowering::DAGCombinerInfo &DCI, 10766 const ARMSubtarget *Subtarget) { 10767 // Allow DAGCombine to pattern-match before we touch the canonical form. 10768 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10769 return SDValue(); 10770 10771 if (N->getValueType(0) != MVT::i32) 10772 return SDValue(); 10773 10774 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10775 if (!N1C) 10776 return SDValue(); 10777 10778 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 10779 // Don't transform uxtb/uxth. 10780 if (C1 == 255 || C1 == 65535) 10781 return SDValue(); 10782 10783 SDNode *N0 = N->getOperand(0).getNode(); 10784 if (!N0->hasOneUse()) 10785 return SDValue(); 10786 10787 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 10788 return SDValue(); 10789 10790 bool LeftShift = N0->getOpcode() == ISD::SHL; 10791 10792 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 10793 if (!N01C) 10794 return SDValue(); 10795 10796 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 10797 if (!C2 || C2 >= 32) 10798 return SDValue(); 10799 10800 // Clear irrelevant bits in the mask. 10801 if (LeftShift) 10802 C1 &= (-1U << C2); 10803 else 10804 C1 &= (-1U >> C2); 10805 10806 SelectionDAG &DAG = DCI.DAG; 10807 SDLoc DL(N); 10808 10809 // We have a pattern of the form "(and (shl x, c2) c1)" or 10810 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 10811 // transform to a pair of shifts, to save materializing c1. 10812 10813 // First pattern: right shift, then mask off leading bits. 10814 // FIXME: Use demanded bits? 10815 if (!LeftShift && isMask_32(C1)) { 10816 uint32_t C3 = countLeadingZeros(C1); 10817 if (C2 < C3) { 10818 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 10819 DAG.getConstant(C3 - C2, DL, MVT::i32)); 10820 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 10821 DAG.getConstant(C3, DL, MVT::i32)); 10822 } 10823 } 10824 10825 // First pattern, reversed: left shift, then mask off trailing bits. 10826 if (LeftShift && isMask_32(~C1)) { 10827 uint32_t C3 = countTrailingZeros(C1); 10828 if (C2 < C3) { 10829 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 10830 DAG.getConstant(C3 - C2, DL, MVT::i32)); 10831 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 10832 DAG.getConstant(C3, DL, MVT::i32)); 10833 } 10834 } 10835 10836 // Second pattern: left shift, then mask off leading bits. 10837 // FIXME: Use demanded bits? 
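  // For example, (and (shl x, 8), 0x00ffff00) becomes (srl (shl x, 16), 8),
  // which avoids materializing the 0x00ffff00 mask in a register.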
10838 if (LeftShift && isShiftedMask_32(C1)) { 10839 uint32_t Trailing = countTrailingZeros(C1); 10840 uint32_t C3 = countLeadingZeros(C1); 10841 if (Trailing == C2 && C2 + C3 < 32) { 10842 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 10843 DAG.getConstant(C2 + C3, DL, MVT::i32)); 10844 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 10845 DAG.getConstant(C3, DL, MVT::i32)); 10846 } 10847 } 10848 10849 // Second pattern, reversed: right shift, then mask off trailing bits. 10850 // FIXME: Handle other patterns of known/demanded bits. 10851 if (!LeftShift && isShiftedMask_32(C1)) { 10852 uint32_t Leading = countLeadingZeros(C1); 10853 uint32_t C3 = countTrailingZeros(C1); 10854 if (Leading == C2 && C2 + C3 < 32) { 10855 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 10856 DAG.getConstant(C2 + C3, DL, MVT::i32)); 10857 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 10858 DAG.getConstant(C3, DL, MVT::i32)); 10859 } 10860 } 10861 10862 // FIXME: Transform "(and (shl x, c2) c1)" -> 10863 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 10864 // c1. 10865 return SDValue(); 10866 } 10867 10868 static SDValue PerformANDCombine(SDNode *N, 10869 TargetLowering::DAGCombinerInfo &DCI, 10870 const ARMSubtarget *Subtarget) { 10871 // Attempt to use immediate-form VBIC 10872 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 10873 SDLoc dl(N); 10874 EVT VT = N->getValueType(0); 10875 SelectionDAG &DAG = DCI.DAG; 10876 10877 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10878 return SDValue(); 10879 10880 APInt SplatBits, SplatUndef; 10881 unsigned SplatBitSize; 10882 bool HasAnyUndefs; 10883 if (BVN && 10884 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 10885 if (SplatBitSize <= 64) { 10886 EVT VbicVT; 10887 SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), 10888 SplatUndef.getZExtValue(), SplatBitSize, 10889 DAG, dl, VbicVT, VT.is128BitVector(), 10890 OtherModImm); 10891 if (Val.getNode()) { 10892 SDValue Input = 10893 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 10894 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 10895 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 10896 } 10897 } 10898 } 10899 10900 if (!Subtarget->isThumb1Only()) { 10901 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 10902 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 10903 return Result; 10904 10905 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 10906 return Result; 10907 } 10908 10909 if (Subtarget->isThumb1Only()) 10910 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 10911 return Result; 10912 10913 return SDValue(); 10914 } 10915 10916 // Try combining OR nodes to SMULWB, SMULWT. 10917 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 10918 TargetLowering::DAGCombinerInfo &DCI, 10919 const ARMSubtarget *Subtarget) { 10920 if (!Subtarget->hasV6Ops() || 10921 (Subtarget->isThumb() && 10922 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 10923 return SDValue(); 10924 10925 SDValue SRL = OR->getOperand(0); 10926 SDValue SHL = OR->getOperand(1); 10927 10928 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 10929 SRL = OR->getOperand(1); 10930 SHL = OR->getOperand(0); 10931 } 10932 if (!isSRL16(SRL) || !isSHL16(SHL)) 10933 return SDValue(); 10934 10935 // The first operands to the shifts need to be the two results from the 10936 // same smul_lohi node. 
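  // SMULWB/SMULWT multiply a 32-bit operand by the bottom/top 16 bits of the
  // other operand and return the top 32 bits of the 48-bit product, which is
  // what the srl/shl recombination of the smul_lohi halves computes when one
  // multiplicand is only 16 bits wide.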
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  // For SMULW[B|T], smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign-extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI.
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+.
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // Case (2): or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or the other way around) must be equivalent for the
    // BFI pattern to match as-is.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
11097 return SDValue(N, 0); 11098 } 11099 11100 return SDValue(); 11101 } 11102 11103 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 11104 static SDValue PerformORCombine(SDNode *N, 11105 TargetLowering::DAGCombinerInfo &DCI, 11106 const ARMSubtarget *Subtarget) { 11107 // Attempt to use immediate-form VORR 11108 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 11109 SDLoc dl(N); 11110 EVT VT = N->getValueType(0); 11111 SelectionDAG &DAG = DCI.DAG; 11112 11113 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11114 return SDValue(); 11115 11116 APInt SplatBits, SplatUndef; 11117 unsigned SplatBitSize; 11118 bool HasAnyUndefs; 11119 if (BVN && Subtarget->hasNEON() && 11120 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 11121 if (SplatBitSize <= 64) { 11122 EVT VorrVT; 11123 SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), 11124 SplatUndef.getZExtValue(), SplatBitSize, 11125 DAG, dl, VorrVT, VT.is128BitVector(), 11126 OtherModImm); 11127 if (Val.getNode()) { 11128 SDValue Input = 11129 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 11130 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 11131 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 11132 } 11133 } 11134 } 11135 11136 if (!Subtarget->isThumb1Only()) { 11137 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 11138 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 11139 return Result; 11140 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 11141 return Result; 11142 } 11143 11144 SDValue N0 = N->getOperand(0); 11145 SDValue N1 = N->getOperand(1); 11146 11147 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 11148 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 11149 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 11150 11151 // The code below optimizes (or (and X, Y), Z). 11152 // The AND operand needs to have a single user to make these optimizations 11153 // profitable. 11154 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 11155 return SDValue(); 11156 11157 APInt SplatUndef; 11158 unsigned SplatBitSize; 11159 bool HasAnyUndefs; 11160 11161 APInt SplatBits0, SplatBits1; 11162 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 11163 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 11164 // Ensure that the second operand of both ands are constants 11165 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 11166 HasAnyUndefs) && !HasAnyUndefs) { 11167 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 11168 HasAnyUndefs) && !HasAnyUndefs) { 11169 // Ensure that the bit width of the constants are the same and that 11170 // the splat arguments are logical inverses as per the pattern we 11171 // are trying to simplify. 11172 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 11173 SplatBits0 == ~SplatBits1) { 11174 // Canonicalize the vector type to make instruction selection 11175 // simpler. 11176 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 11177 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, 11178 N0->getOperand(1), 11179 N0->getOperand(0), 11180 N1->getOperand(0)); 11181 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 11182 } 11183 } 11184 } 11185 } 11186 11187 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 11188 // reasonable. 
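  // (BFI Rd, Rn, #lsb, #width copies the low #width bits of Rn into Rd
  // starting at bit #lsb and leaves the remaining bits of Rd unchanged.)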
11189 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 11190 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 11191 return Res; 11192 } 11193 11194 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11195 return Result; 11196 11197 return SDValue(); 11198 } 11199 11200 static SDValue PerformXORCombine(SDNode *N, 11201 TargetLowering::DAGCombinerInfo &DCI, 11202 const ARMSubtarget *Subtarget) { 11203 EVT VT = N->getValueType(0); 11204 SelectionDAG &DAG = DCI.DAG; 11205 11206 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11207 return SDValue(); 11208 11209 if (!Subtarget->isThumb1Only()) { 11210 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 11211 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 11212 return Result; 11213 11214 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 11215 return Result; 11216 } 11217 11218 return SDValue(); 11219 } 11220 11221 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 11222 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 11223 // their position in "to" (Rd). 11224 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 11225 assert(N->getOpcode() == ARMISD::BFI); 11226 11227 SDValue From = N->getOperand(1); 11228 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 11229 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 11230 11231 // If the Base came from a SHR #C, we can deduce that it is really testing bit 11232 // #C in the base of the SHR. 11233 if (From->getOpcode() == ISD::SRL && 11234 isa<ConstantSDNode>(From->getOperand(1))) { 11235 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 11236 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 11237 FromMask <<= Shift.getLimitedValue(31); 11238 From = From->getOperand(0); 11239 } 11240 11241 return From; 11242 } 11243 11244 // If A and B contain one contiguous set of bits, does A | B == A . B? 11245 // 11246 // Neither A nor B must be zero. 11247 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 11248 unsigned LastActiveBitInA = A.countTrailingZeros(); 11249 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 11250 return LastActiveBitInA - 1 == FirstActiveBitInB; 11251 } 11252 11253 static SDValue FindBFIToCombineWith(SDNode *N) { 11254 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 11255 // if one exists. 11256 APInt ToMask, FromMask; 11257 SDValue From = ParseBFI(N, ToMask, FromMask); 11258 SDValue To = N->getOperand(0); 11259 11260 // Now check for a compatible BFI to merge with. We can pass through BFIs that 11261 // aren't compatible, but not if they set the same bit in their destination as 11262 // we do (or that of any BFI we're going to combine with). 11263 SDValue V = To; 11264 APInt CombinedToMask = ToMask; 11265 while (V.getOpcode() == ARMISD::BFI) { 11266 APInt NewToMask, NewFromMask; 11267 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 11268 if (NewFrom != From) { 11269 // This BFI has a different base. Keep going. 11270 CombinedToMask |= NewToMask; 11271 V = V.getOperand(0); 11272 continue; 11273 } 11274 11275 // Do the written bits conflict with any we've seen so far? 11276 if ((NewToMask & CombinedToMask).getBoolValue()) 11277 // Conflicting bits - bail out because going further is unsafe. 
11278 return SDValue(); 11279 11280 // Are the new bits contiguous when combined with the old bits? 11281 if (BitsProperlyConcatenate(ToMask, NewToMask) && 11282 BitsProperlyConcatenate(FromMask, NewFromMask)) 11283 return V; 11284 if (BitsProperlyConcatenate(NewToMask, ToMask) && 11285 BitsProperlyConcatenate(NewFromMask, FromMask)) 11286 return V; 11287 11288 // We've seen a write to some bits, so track it. 11289 CombinedToMask |= NewToMask; 11290 // Keep going... 11291 V = V.getOperand(0); 11292 } 11293 11294 return SDValue(); 11295 } 11296 11297 static SDValue PerformBFICombine(SDNode *N, 11298 TargetLowering::DAGCombinerInfo &DCI) { 11299 SDValue N1 = N->getOperand(1); 11300 if (N1.getOpcode() == ISD::AND) { 11301 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 11302 // the bits being cleared by the AND are not demanded by the BFI. 11303 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 11304 if (!N11C) 11305 return SDValue(); 11306 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 11307 unsigned LSB = countTrailingZeros(~InvMask); 11308 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 11309 assert(Width < 11310 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 11311 "undefined behavior"); 11312 unsigned Mask = (1u << Width) - 1; 11313 unsigned Mask2 = N11C->getZExtValue(); 11314 if ((Mask & (~Mask2)) == 0) 11315 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 11316 N->getOperand(0), N1.getOperand(0), 11317 N->getOperand(2)); 11318 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 11319 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 11320 // Keep track of any consecutive bits set that all come from the same base 11321 // value. We can combine these together into a single BFI. 11322 SDValue CombineBFI = FindBFIToCombineWith(N); 11323 if (CombineBFI == SDValue()) 11324 return SDValue(); 11325 11326 // We've found a BFI. 11327 APInt ToMask1, FromMask1; 11328 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 11329 11330 APInt ToMask2, FromMask2; 11331 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 11332 assert(From1 == From2); 11333 (void)From2; 11334 11335 // First, unlink CombineBFI. 11336 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 11337 // Then create a new BFI, combining the two together. 11338 APInt NewFromMask = FromMask1 | FromMask2; 11339 APInt NewToMask = ToMask1 | ToMask2; 11340 11341 EVT VT = N->getValueType(0); 11342 SDLoc dl(N); 11343 11344 if (NewFromMask[0] == 0) 11345 From1 = DCI.DAG.getNode( 11346 ISD::SRL, dl, VT, From1, 11347 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 11348 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 11349 DCI.DAG.getConstant(~NewToMask, dl, VT)); 11350 } 11351 return SDValue(); 11352 } 11353 11354 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 11355 /// ARMISD::VMOVRRD. 
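/// VMOVRRD moves one 64-bit VFP/NEON register into a pair of GPRs; the
/// combines below bypass that transfer when the f64 value was just built from
/// two i32s with VMOVDRR, or can simply be re-loaded as two i32 halves.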
11356 static SDValue PerformVMOVRRDCombine(SDNode *N, 11357 TargetLowering::DAGCombinerInfo &DCI, 11358 const ARMSubtarget *Subtarget) { 11359 // vmovrrd(vmovdrr x, y) -> x,y 11360 SDValue InDouble = N->getOperand(0); 11361 if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) 11362 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 11363 11364 // vmovrrd(load f64) -> (load i32), (load i32) 11365 SDNode *InNode = InDouble.getNode(); 11366 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 11367 InNode->getValueType(0) == MVT::f64 && 11368 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 11369 !cast<LoadSDNode>(InNode)->isVolatile()) { 11370 // TODO: Should this be done for non-FrameIndex operands? 11371 LoadSDNode *LD = cast<LoadSDNode>(InNode); 11372 11373 SelectionDAG &DAG = DCI.DAG; 11374 SDLoc DL(LD); 11375 SDValue BasePtr = LD->getBasePtr(); 11376 SDValue NewLD1 = 11377 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 11378 LD->getAlignment(), LD->getMemOperand()->getFlags()); 11379 11380 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 11381 DAG.getConstant(4, DL, MVT::i32)); 11382 SDValue NewLD2 = DAG.getLoad( 11383 MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), 11384 std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); 11385 11386 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 11387 if (DCI.DAG.getDataLayout().isBigEndian()) 11388 std::swap (NewLD1, NewLD2); 11389 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 11390 return Result; 11391 } 11392 11393 return SDValue(); 11394 } 11395 11396 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 11397 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 11398 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 11399 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 11400 SDValue Op0 = N->getOperand(0); 11401 SDValue Op1 = N->getOperand(1); 11402 if (Op0.getOpcode() == ISD::BITCAST) 11403 Op0 = Op0.getOperand(0); 11404 if (Op1.getOpcode() == ISD::BITCAST) 11405 Op1 = Op1.getOperand(0); 11406 if (Op0.getOpcode() == ARMISD::VMOVRRD && 11407 Op0.getNode() == Op1.getNode() && 11408 Op0.getResNo() == 0 && Op1.getResNo() == 1) 11409 return DAG.getNode(ISD::BITCAST, SDLoc(N), 11410 N->getValueType(0), Op0.getOperand(0)); 11411 return SDValue(); 11412 } 11413 11414 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 11415 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 11416 /// i64 vector to have f64 elements, since the value can then be loaded 11417 /// directly into a VFP register. 11418 static bool hasNormalLoadOperand(SDNode *N) { 11419 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 11420 for (unsigned i = 0; i < NumElts; ++i) { 11421 SDNode *Elt = N->getOperand(i).getNode(); 11422 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 11423 return true; 11424 } 11425 return false; 11426 } 11427 11428 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 11429 /// ISD::BUILD_VECTOR. 11430 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 11431 TargetLowering::DAGCombinerInfo &DCI, 11432 const ARMSubtarget *Subtarget) { 11433 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 11434 // VMOVRRD is introduced when legalizing i64 types. 
It forces the i64 value 11435 // into a pair of GPRs, which is fine when the value is used as a scalar, 11436 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 11437 SelectionDAG &DAG = DCI.DAG; 11438 if (N->getNumOperands() == 2) 11439 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 11440 return RV; 11441 11442 // Load i64 elements as f64 values so that type legalization does not split 11443 // them up into i32 values. 11444 EVT VT = N->getValueType(0); 11445 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 11446 return SDValue(); 11447 SDLoc dl(N); 11448 SmallVector<SDValue, 8> Ops; 11449 unsigned NumElts = VT.getVectorNumElements(); 11450 for (unsigned i = 0; i < NumElts; ++i) { 11451 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 11452 Ops.push_back(V); 11453 // Make the DAGCombiner fold the bitcast. 11454 DCI.AddToWorklist(V.getNode()); 11455 } 11456 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 11457 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 11458 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 11459 } 11460 11461 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 11462 static SDValue 11463 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 11464 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 11465 // At that time, we may have inserted bitcasts from integer to float. 11466 // If these bitcasts have survived DAGCombine, change the lowering of this 11467 // BUILD_VECTOR in something more vector friendly, i.e., that does not 11468 // force to use floating point types. 11469 11470 // Make sure we can change the type of the vector. 11471 // This is possible iff: 11472 // 1. The vector is only used in a bitcast to a integer type. I.e., 11473 // 1.1. Vector is used only once. 11474 // 1.2. Use is a bit convert to an integer type. 11475 // 2. The size of its operands are 32-bits (64-bits are not legal). 11476 EVT VT = N->getValueType(0); 11477 EVT EltVT = VT.getVectorElementType(); 11478 11479 // Check 1.1. and 2. 11480 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 11481 return SDValue(); 11482 11483 // By construction, the input type must be float. 11484 assert(EltVT == MVT::f32 && "Unexpected type!"); 11485 11486 // Check 1.2. 11487 SDNode *Use = *N->use_begin(); 11488 if (Use->getOpcode() != ISD::BITCAST || 11489 Use->getValueType(0).isFloatingPoint()) 11490 return SDValue(); 11491 11492 // Check profitability. 11493 // Model is, if more than half of the relevant operands are bitcast from 11494 // i32, turn the build_vector into a sequence of insert_vector_elt. 11495 // Relevant operands are everything that is not statically 11496 // (i.e., at compile time) bitcasted. 11497 unsigned NumOfBitCastedElts = 0; 11498 unsigned NumElts = VT.getVectorNumElements(); 11499 unsigned NumOfRelevantElts = NumElts; 11500 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 11501 SDValue Elt = N->getOperand(Idx); 11502 if (Elt->getOpcode() == ISD::BITCAST) { 11503 // Assume only bit cast to i32 will go away. 11504 if (Elt->getOperand(0).getValueType() == MVT::i32) 11505 ++NumOfBitCastedElts; 11506 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 11507 // Constants are statically casted, thus do not count them as 11508 // relevant operands. 11509 --NumOfRelevantElts; 11510 } 11511 11512 // Check if more than half of the elements require a non-free bitcast. 
11513 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 11514 return SDValue(); 11515 11516 SelectionDAG &DAG = DCI.DAG; 11517 // Create the new vector type. 11518 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 11519 // Check if the type is legal. 11520 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11521 if (!TLI.isTypeLegal(VecVT)) 11522 return SDValue(); 11523 11524 // Combine: 11525 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 11526 // => BITCAST INSERT_VECTOR_ELT 11527 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 11528 // (BITCAST EN), N. 11529 SDValue Vec = DAG.getUNDEF(VecVT); 11530 SDLoc dl(N); 11531 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 11532 SDValue V = N->getOperand(Idx); 11533 if (V.isUndef()) 11534 continue; 11535 if (V.getOpcode() == ISD::BITCAST && 11536 V->getOperand(0).getValueType() == MVT::i32) 11537 // Fold obvious case. 11538 V = V.getOperand(0); 11539 else { 11540 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 11541 // Make the DAGCombiner fold the bitcasts. 11542 DCI.AddToWorklist(V.getNode()); 11543 } 11544 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 11545 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 11546 } 11547 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 11548 // Make the DAGCombiner fold the bitcasts. 11549 DCI.AddToWorklist(Vec.getNode()); 11550 return Vec; 11551 } 11552 11553 /// PerformInsertEltCombine - Target-specific dag combine xforms for 11554 /// ISD::INSERT_VECTOR_ELT. 11555 static SDValue PerformInsertEltCombine(SDNode *N, 11556 TargetLowering::DAGCombinerInfo &DCI) { 11557 // Bitcast an i64 load inserted into a vector to f64. 11558 // Otherwise, the i64 value will be legalized to a pair of i32 values. 11559 EVT VT = N->getValueType(0); 11560 SDNode *Elt = N->getOperand(1).getNode(); 11561 if (VT.getVectorElementType() != MVT::i64 || 11562 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 11563 return SDValue(); 11564 11565 SelectionDAG &DAG = DCI.DAG; 11566 SDLoc dl(N); 11567 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 11568 VT.getVectorNumElements()); 11569 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 11570 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 11571 // Make the DAGCombiner fold the bitcasts. 11572 DCI.AddToWorklist(Vec.getNode()); 11573 DCI.AddToWorklist(V.getNode()); 11574 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 11575 Vec, V, N->getOperand(2)); 11576 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 11577 } 11578 11579 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 11580 /// ISD::VECTOR_SHUFFLE. 11581 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 11582 // The LLVM shufflevector instruction does not require the shuffle mask 11583 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 11584 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 11585 // operands do not match the mask length, they are extended by concatenating 11586 // them with undef vectors. That is probably the right thing for other 11587 // targets, but for NEON it is better to concatenate two double-register 11588 // size vector operands into a single quad-register size vector. 
Do that 11589 // transformation here: 11590 // shuffle(concat(v1, undef), concat(v2, undef)) -> 11591 // shuffle(concat(v1, v2), undef) 11592 SDValue Op0 = N->getOperand(0); 11593 SDValue Op1 = N->getOperand(1); 11594 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 11595 Op1.getOpcode() != ISD::CONCAT_VECTORS || 11596 Op0.getNumOperands() != 2 || 11597 Op1.getNumOperands() != 2) 11598 return SDValue(); 11599 SDValue Concat0Op1 = Op0.getOperand(1); 11600 SDValue Concat1Op1 = Op1.getOperand(1); 11601 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 11602 return SDValue(); 11603 // Skip the transformation if any of the types are illegal. 11604 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11605 EVT VT = N->getValueType(0); 11606 if (!TLI.isTypeLegal(VT) || 11607 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 11608 !TLI.isTypeLegal(Concat1Op1.getValueType())) 11609 return SDValue(); 11610 11611 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 11612 Op0.getOperand(0), Op1.getOperand(0)); 11613 // Translate the shuffle mask. 11614 SmallVector<int, 16> NewMask; 11615 unsigned NumElts = VT.getVectorNumElements(); 11616 unsigned HalfElts = NumElts/2; 11617 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 11618 for (unsigned n = 0; n < NumElts; ++n) { 11619 int MaskElt = SVN->getMaskElt(n); 11620 int NewElt = -1; 11621 if (MaskElt < (int)HalfElts) 11622 NewElt = MaskElt; 11623 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 11624 NewElt = HalfElts + MaskElt - NumElts; 11625 NewMask.push_back(NewElt); 11626 } 11627 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 11628 DAG.getUNDEF(VT), NewMask); 11629 } 11630 11631 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 11632 /// NEON load/store intrinsics, and generic vector load/stores, to merge 11633 /// base address updates. 11634 /// For generic load/stores, the memory type is assumed to be a vector. 11635 /// The caller is assumed to have checked legality. 11636 static SDValue CombineBaseUpdate(SDNode *N, 11637 TargetLowering::DAGCombinerInfo &DCI) { 11638 SelectionDAG &DAG = DCI.DAG; 11639 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 11640 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 11641 const bool isStore = N->getOpcode() == ISD::STORE; 11642 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 11643 SDValue Addr = N->getOperand(AddrOpIdx); 11644 MemSDNode *MemN = cast<MemSDNode>(N); 11645 SDLoc dl(N); 11646 11647 // Search for a use of the address operand that is an increment. 11648 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 11649 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 11650 SDNode *User = *UI; 11651 if (User->getOpcode() != ISD::ADD || 11652 UI.getUse().getResNo() != Addr.getResNo()) 11653 continue; 11654 11655 // Check that the add is independent of the load/store. Otherwise, folding 11656 // it would create a cycle. We can avoid searching through Addr as it's a 11657 // predecessor to both. 11658 SmallPtrSet<const SDNode *, 32> Visited; 11659 SmallVector<const SDNode *, 16> Worklist; 11660 Visited.insert(Addr.getNode()); 11661 Worklist.push_back(N); 11662 Worklist.push_back(User); 11663 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 11664 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 11665 continue; 11666 11667 // Find the new opcode for the updating load/store. 
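    // (The _UPD opcodes select the post-indexed forms, e.g.
    // "vld1.32 {d0}, [r0]!", which write the incremented address back to the
    // base register.)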
11668 bool isLoadOp = true; 11669 bool isLaneOp = false; 11670 unsigned NewOpc = 0; 11671 unsigned NumVecs = 0; 11672 if (isIntrinsic) { 11673 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 11674 switch (IntNo) { 11675 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 11676 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 11677 NumVecs = 1; break; 11678 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 11679 NumVecs = 2; break; 11680 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 11681 NumVecs = 3; break; 11682 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 11683 NumVecs = 4; break; 11684 case Intrinsic::arm_neon_vld2dup: 11685 case Intrinsic::arm_neon_vld3dup: 11686 case Intrinsic::arm_neon_vld4dup: 11687 // TODO: Support updating VLDxDUP nodes. For now, we just skip 11688 // combining base updates for such intrinsics. 11689 continue; 11690 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 11691 NumVecs = 2; isLaneOp = true; break; 11692 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 11693 NumVecs = 3; isLaneOp = true; break; 11694 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 11695 NumVecs = 4; isLaneOp = true; break; 11696 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 11697 NumVecs = 1; isLoadOp = false; break; 11698 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 11699 NumVecs = 2; isLoadOp = false; break; 11700 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 11701 NumVecs = 3; isLoadOp = false; break; 11702 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 11703 NumVecs = 4; isLoadOp = false; break; 11704 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 11705 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 11706 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 11707 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 11708 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 11709 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 11710 } 11711 } else { 11712 isLaneOp = true; 11713 switch (N->getOpcode()) { 11714 default: llvm_unreachable("unexpected opcode for Neon base update"); 11715 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 11716 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 11717 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 11718 case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 11719 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 11720 NumVecs = 1; isLaneOp = false; break; 11721 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 11722 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 11723 } 11724 } 11725 11726 // Find the size of memory referenced by the load/store. 11727 EVT VecTy; 11728 if (isLoadOp) { 11729 VecTy = N->getValueType(0); 11730 } else if (isIntrinsic) { 11731 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 11732 } else { 11733 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 11734 VecTy = N->getOperand(1).getValueType(); 11735 } 11736 11737 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 11738 if (isLaneOp) 11739 NumBytes /= VecTy.getVectorNumElements(); 11740 11741 // If the increment is a constant, it must match the memory ref size. 11742 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
                                       1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants, and it's
    // easier to introduce bitcasts here than to fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics). We need to change the
    //   memory type to match the explicit alignment. That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature.
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value. Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is a non-standard-aligned LOAD, the first result is the loaded
    // value. Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-lane intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
11898 if (UI.getUse().getResNo() == NumVecs) 11899 continue; 11900 SDNode *User = *UI; 11901 if (User->getOpcode() != ARMISD::VDUPLANE || 11902 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 11903 return false; 11904 } 11905 11906 // Create the vldN-dup node. 11907 EVT Tys[5]; 11908 unsigned n; 11909 for (n = 0; n < NumVecs; ++n) 11910 Tys[n] = VT; 11911 Tys[n] = MVT::Other; 11912 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); 11913 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 11914 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 11915 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, 11916 Ops, VLDMemInt->getMemoryVT(), 11917 VLDMemInt->getMemOperand()); 11918 11919 // Update the uses. 11920 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 11921 UI != UE; ++UI) { 11922 unsigned ResNo = UI.getUse().getResNo(); 11923 // Ignore uses of the chain result. 11924 if (ResNo == NumVecs) 11925 continue; 11926 SDNode *User = *UI; 11927 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 11928 } 11929 11930 // Now the vldN-lane intrinsic is dead except for its chain result. 11931 // Update uses of the chain. 11932 std::vector<SDValue> VLDDupResults; 11933 for (unsigned n = 0; n < NumVecs; ++n) 11934 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 11935 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 11936 DCI.CombineTo(VLD, VLDDupResults); 11937 11938 return true; 11939 } 11940 11941 /// PerformVDUPLANECombine - Target-specific dag combine xforms for 11942 /// ARMISD::VDUPLANE. 11943 static SDValue PerformVDUPLANECombine(SDNode *N, 11944 TargetLowering::DAGCombinerInfo &DCI) { 11945 SDValue Op = N->getOperand(0); 11946 11947 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses 11948 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. 11949 if (CombineVLDDUP(N, DCI)) 11950 return SDValue(N, 0); 11951 11952 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is 11953 // redundant. Ignore bit_converts for now; element sizes are checked below. 11954 while (Op.getOpcode() == ISD::BITCAST) 11955 Op = Op.getOperand(0); 11956 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) 11957 return SDValue(); 11958 11959 // Make sure the VMOV element size is not bigger than the VDUPLANE elements. 11960 unsigned EltSize = Op.getScalarValueSizeInBits(); 11961 // The canonical VMOV for a zero vector uses a 32-bit element size. 11962 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11963 unsigned EltBits; 11964 if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) 11965 EltSize = 8; 11966 EVT VT = N->getValueType(0); 11967 if (EltSize > VT.getScalarSizeInBits()) 11968 return SDValue(); 11969 11970 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 11971 } 11972 11973 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. 11974 static SDValue PerformVDUPCombine(SDNode *N, 11975 TargetLowering::DAGCombinerInfo &DCI) { 11976 SelectionDAG &DAG = DCI.DAG; 11977 SDValue Op = N->getOperand(0); 11978 11979 // Match VDUP(LOAD) -> VLD1DUP. 11980 // We match this pattern here rather than waiting for isel because the 11981 // transform is only legal for unindexed loads. 
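  // For example (illustrative only): a one-use, unindexed f32 load feeding a
  // VDUP that produces v2f32 becomes a single ARMISD::VLD1DUP, which selects
  // to an all-lanes load such as "vld1.32 {d16[]}, [r0]" rather than a VLDR
  // followed by a lane duplication.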
11982 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); 11983 if (LD && Op.hasOneUse() && LD->isUnindexed() && 11984 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { 11985 SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), 11986 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; 11987 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); 11988 SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, 11989 Ops, LD->getMemoryVT(), 11990 LD->getMemOperand()); 11991 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); 11992 return VLDDup; 11993 } 11994 11995 return SDValue(); 11996 } 11997 11998 static SDValue PerformLOADCombine(SDNode *N, 11999 TargetLowering::DAGCombinerInfo &DCI) { 12000 EVT VT = N->getValueType(0); 12001 12002 // If this is a legal vector load, try to combine it into a VLD1_UPD. 12003 if (ISD::isNormalLoad(N) && VT.isVector() && 12004 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 12005 return CombineBaseUpdate(N, DCI); 12006 12007 return SDValue(); 12008 } 12009 12010 /// PerformSTORECombine - Target-specific dag combine xforms for 12011 /// ISD::STORE. 12012 static SDValue PerformSTORECombine(SDNode *N, 12013 TargetLowering::DAGCombinerInfo &DCI) { 12014 StoreSDNode *St = cast<StoreSDNode>(N); 12015 if (St->isVolatile()) 12016 return SDValue(); 12017 12018 // Optimize trunc store (of multiple scalars) to shuffle and store. First, 12019 // pack all of the elements in one place. Next, store to memory in fewer 12020 // chunks. 12021 SDValue StVal = St->getValue(); 12022 EVT VT = StVal.getValueType(); 12023 if (St->isTruncatingStore() && VT.isVector()) { 12024 SelectionDAG &DAG = DCI.DAG; 12025 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12026 EVT StVT = St->getMemoryVT(); 12027 unsigned NumElems = VT.getVectorNumElements(); 12028 assert(StVT != VT && "Cannot truncate to the same type"); 12029 unsigned FromEltSz = VT.getScalarSizeInBits(); 12030 unsigned ToEltSz = StVT.getScalarSizeInBits(); 12031 12032 // From, To sizes and ElemCount must be pow of two 12033 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); 12034 12035 // We are going to use the original vector elt for storing. 12036 // Accumulated smaller vector elements must be a multiple of the store size. 12037 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); 12038 12039 unsigned SizeRatio = FromEltSz / ToEltSz; 12040 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 12041 12042 // Create a type on which we perform the shuffle. 12043 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 12044 NumElems*SizeRatio); 12045 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 12046 12047 SDLoc DL(St); 12048 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 12049 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 12050 for (unsigned i = 0; i < NumElems; ++i) 12051 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() 12052 ? (i + 1) * SizeRatio - 1 12053 : i * SizeRatio; 12054 12055 // Can't shuffle using an illegal type. 12056 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 12057 12058 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, 12059 DAG.getUNDEF(WideVec.getValueType()), 12060 ShuffleVec); 12061 // At this point all of the data is stored at the bottom of the 12062 // register. We now need to save it to mem. 
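    // Illustrative sketch: for a little-endian v4i32-to-v4i8 truncating
    // store, WideVecVT is v16i8 and ShuffleVec is <0, 4, 8, 12, -1, ...>,
    // so the four truncated bytes end up in the low 32 bits of the shuffled
    // value and the loop below can emit a single i32 store instead of four
    // byte stores.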
12063 12064 // Find the largest store unit 12065 MVT StoreType = MVT::i8; 12066 for (MVT Tp : MVT::integer_valuetypes()) { 12067 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 12068 StoreType = Tp; 12069 } 12070 // Didn't find a legal store type. 12071 if (!TLI.isTypeLegal(StoreType)) 12072 return SDValue(); 12073 12074 // Bitcast the original vector into a vector of store-size units 12075 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 12076 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 12077 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 12078 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 12079 SmallVector<SDValue, 8> Chains; 12080 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 12081 TLI.getPointerTy(DAG.getDataLayout())); 12082 SDValue BasePtr = St->getBasePtr(); 12083 12084 // Perform one or more big stores into memory. 12085 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); 12086 for (unsigned I = 0; I < E; I++) { 12087 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 12088 StoreType, ShuffWide, 12089 DAG.getIntPtrConstant(I, DL)); 12090 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, 12091 St->getPointerInfo(), St->getAlignment(), 12092 St->getMemOperand()->getFlags()); 12093 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, 12094 Increment); 12095 Chains.push_back(Ch); 12096 } 12097 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 12098 } 12099 12100 if (!ISD::isNormalStore(St)) 12101 return SDValue(); 12102 12103 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 12104 // ARM stores of arguments in the same cache line. 12105 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 12106 StVal.getNode()->hasOneUse()) { 12107 SelectionDAG &DAG = DCI.DAG; 12108 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 12109 SDLoc DL(St); 12110 SDValue BasePtr = St->getBasePtr(); 12111 SDValue NewST1 = DAG.getStore( 12112 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 12113 BasePtr, St->getPointerInfo(), St->getAlignment(), 12114 St->getMemOperand()->getFlags()); 12115 12116 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 12117 DAG.getConstant(4, DL, MVT::i32)); 12118 return DAG.getStore(NewST1.getValue(0), DL, 12119 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 12120 OffsetPtr, St->getPointerInfo(), 12121 std::min(4U, St->getAlignment() / 2), 12122 St->getMemOperand()->getFlags()); 12123 } 12124 12125 if (StVal.getValueType() == MVT::i64 && 12126 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 12127 12128 // Bitcast an i64 store extracted from a vector to f64. 12129 // Otherwise, the i64 value will be legalized to a pair of i32 values. 12130 SelectionDAG &DAG = DCI.DAG; 12131 SDLoc dl(StVal); 12132 SDValue IntVec = StVal.getOperand(0); 12133 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 12134 IntVec.getValueType().getVectorNumElements()); 12135 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 12136 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 12137 Vec, StVal.getOperand(1)); 12138 dl = SDLoc(N); 12139 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 12140 // Make the DAGCombiner fold the bitcasts. 
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
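/// For example (illustrative), a v8i16 build_vector splat of the constant 3
/// is a legal left-shift immediate here (0 <= 3 < 16), and would also be
/// accepted for a long left shift (0 <= 3 <= 16).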
12296 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 12297 assert(VT.isVector() && "vector shift count is not a vector type"); 12298 int64_t ElementBits = VT.getScalarSizeInBits(); 12299 if (! getVShiftImm(Op, ElementBits, Cnt)) 12300 return false; 12301 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); 12302 } 12303 12304 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 12305 /// operand of a vector shift right operation. For a shift opcode, the value 12306 /// is positive, but for an intrinsic the value count must be negative. The 12307 /// absolute value must be in the range: 12308 /// 1 <= |Value| <= ElementBits for a right shift; or 12309 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 12310 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 12311 int64_t &Cnt) { 12312 assert(VT.isVector() && "vector shift count is not a vector type"); 12313 int64_t ElementBits = VT.getScalarSizeInBits(); 12314 if (! getVShiftImm(Op, ElementBits, Cnt)) 12315 return false; 12316 if (!isIntrinsic) 12317 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); 12318 if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { 12319 Cnt = -Cnt; 12320 return true; 12321 } 12322 return false; 12323 } 12324 12325 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 12326 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 12327 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 12328 switch (IntNo) { 12329 default: 12330 // Don't do anything for most intrinsics. 12331 break; 12332 12333 // Vector shifts: check for immediate versions and lower them. 12334 // Note: This is done during DAG combining instead of DAG legalizing because 12335 // the build_vectors for 64-bit vector element shift counts are generally 12336 // not legal, and it is hard to see their values after they get legalized to 12337 // loads from a constant pool. 12338 case Intrinsic::arm_neon_vshifts: 12339 case Intrinsic::arm_neon_vshiftu: 12340 case Intrinsic::arm_neon_vrshifts: 12341 case Intrinsic::arm_neon_vrshiftu: 12342 case Intrinsic::arm_neon_vrshiftn: 12343 case Intrinsic::arm_neon_vqshifts: 12344 case Intrinsic::arm_neon_vqshiftu: 12345 case Intrinsic::arm_neon_vqshiftsu: 12346 case Intrinsic::arm_neon_vqshiftns: 12347 case Intrinsic::arm_neon_vqshiftnu: 12348 case Intrinsic::arm_neon_vqshiftnsu: 12349 case Intrinsic::arm_neon_vqrshiftns: 12350 case Intrinsic::arm_neon_vqrshiftnu: 12351 case Intrinsic::arm_neon_vqrshiftnsu: { 12352 EVT VT = N->getOperand(1).getValueType(); 12353 int64_t Cnt; 12354 unsigned VShiftOpc = 0; 12355 12356 switch (IntNo) { 12357 case Intrinsic::arm_neon_vshifts: 12358 case Intrinsic::arm_neon_vshiftu: 12359 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { 12360 VShiftOpc = ARMISD::VSHL; 12361 break; 12362 } 12363 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { 12364 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? 
12365 ARMISD::VSHRs : ARMISD::VSHRu); 12366 break; 12367 } 12368 return SDValue(); 12369 12370 case Intrinsic::arm_neon_vrshifts: 12371 case Intrinsic::arm_neon_vrshiftu: 12372 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) 12373 break; 12374 return SDValue(); 12375 12376 case Intrinsic::arm_neon_vqshifts: 12377 case Intrinsic::arm_neon_vqshiftu: 12378 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 12379 break; 12380 return SDValue(); 12381 12382 case Intrinsic::arm_neon_vqshiftsu: 12383 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) 12384 break; 12385 llvm_unreachable("invalid shift count for vqshlu intrinsic"); 12386 12387 case Intrinsic::arm_neon_vrshiftn: 12388 case Intrinsic::arm_neon_vqshiftns: 12389 case Intrinsic::arm_neon_vqshiftnu: 12390 case Intrinsic::arm_neon_vqshiftnsu: 12391 case Intrinsic::arm_neon_vqrshiftns: 12392 case Intrinsic::arm_neon_vqrshiftnu: 12393 case Intrinsic::arm_neon_vqrshiftnsu: 12394 // Narrowing shifts require an immediate right shift. 12395 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) 12396 break; 12397 llvm_unreachable("invalid shift count for narrowing vector shift " 12398 "intrinsic"); 12399 12400 default: 12401 llvm_unreachable("unhandled vector shift"); 12402 } 12403 12404 switch (IntNo) { 12405 case Intrinsic::arm_neon_vshifts: 12406 case Intrinsic::arm_neon_vshiftu: 12407 // Opcode already set above. 12408 break; 12409 case Intrinsic::arm_neon_vrshifts: 12410 VShiftOpc = ARMISD::VRSHRs; break; 12411 case Intrinsic::arm_neon_vrshiftu: 12412 VShiftOpc = ARMISD::VRSHRu; break; 12413 case Intrinsic::arm_neon_vrshiftn: 12414 VShiftOpc = ARMISD::VRSHRN; break; 12415 case Intrinsic::arm_neon_vqshifts: 12416 VShiftOpc = ARMISD::VQSHLs; break; 12417 case Intrinsic::arm_neon_vqshiftu: 12418 VShiftOpc = ARMISD::VQSHLu; break; 12419 case Intrinsic::arm_neon_vqshiftsu: 12420 VShiftOpc = ARMISD::VQSHLsu; break; 12421 case Intrinsic::arm_neon_vqshiftns: 12422 VShiftOpc = ARMISD::VQSHRNs; break; 12423 case Intrinsic::arm_neon_vqshiftnu: 12424 VShiftOpc = ARMISD::VQSHRNu; break; 12425 case Intrinsic::arm_neon_vqshiftnsu: 12426 VShiftOpc = ARMISD::VQSHRNsu; break; 12427 case Intrinsic::arm_neon_vqrshiftns: 12428 VShiftOpc = ARMISD::VQRSHRNs; break; 12429 case Intrinsic::arm_neon_vqrshiftnu: 12430 VShiftOpc = ARMISD::VQRSHRNu; break; 12431 case Intrinsic::arm_neon_vqrshiftnsu: 12432 VShiftOpc = ARMISD::VQRSHRNsu; break; 12433 } 12434 12435 SDLoc dl(N); 12436 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 12437 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); 12438 } 12439 12440 case Intrinsic::arm_neon_vshiftins: { 12441 EVT VT = N->getOperand(1).getValueType(); 12442 int64_t Cnt; 12443 unsigned VShiftOpc = 0; 12444 12445 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) 12446 VShiftOpc = ARMISD::VSLI; 12447 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) 12448 VShiftOpc = ARMISD::VSRI; 12449 else { 12450 llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); 12451 } 12452 12453 SDLoc dl(N); 12454 return DAG.getNode(VShiftOpc, dl, N->getValueType(0), 12455 N->getOperand(1), N->getOperand(2), 12456 DAG.getConstant(Cnt, dl, MVT::i32)); 12457 } 12458 12459 case Intrinsic::arm_neon_vqrshifts: 12460 case Intrinsic::arm_neon_vqrshiftu: 12461 // No immediate versions of these to check for. 12462 break; 12463 } 12464 12465 return SDValue(); 12466 } 12467 12468 /// PerformShiftCombine - Checks for immediate versions of vector shifts and 12469 /// lowers them. 
As with the vector shift intrinsics, this is done during DAG 12470 /// combining instead of DAG legalizing because the build_vectors for 64-bit 12471 /// vector element shift counts are generally not legal, and it is hard to see 12472 /// their values after they get legalized to loads from a constant pool. 12473 static SDValue PerformShiftCombine(SDNode *N, 12474 TargetLowering::DAGCombinerInfo &DCI, 12475 const ARMSubtarget *ST) { 12476 SelectionDAG &DAG = DCI.DAG; 12477 EVT VT = N->getValueType(0); 12478 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 12479 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 12480 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 12481 SDValue N1 = N->getOperand(1); 12482 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 12483 SDValue N0 = N->getOperand(0); 12484 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 12485 DAG.MaskedValueIsZero(N0.getOperand(0), 12486 APInt::getHighBitsSet(32, 16))) 12487 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 12488 } 12489 } 12490 12491 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 12492 N->getOperand(0)->getOpcode() == ISD::AND && 12493 N->getOperand(0)->hasOneUse()) { 12494 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 12495 return SDValue(); 12496 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 12497 // usually show up because instcombine prefers to canonicalize it to 12498 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 12499 // out of GEP lowering in some cases. 12500 SDValue N0 = N->getOperand(0); 12501 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12502 if (!ShiftAmtNode) 12503 return SDValue(); 12504 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 12505 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 12506 if (!AndMaskNode) 12507 return SDValue(); 12508 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 12509 // Don't transform uxtb/uxth. 12510 if (AndMask == 255 || AndMask == 65535) 12511 return SDValue(); 12512 if (isMask_32(AndMask)) { 12513 uint32_t MaskedBits = countLeadingZeros(AndMask); 12514 if (MaskedBits > ShiftAmt) { 12515 SDLoc DL(N); 12516 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 12517 DAG.getConstant(MaskedBits, DL, MVT::i32)); 12518 return DAG.getNode( 12519 ISD::SRL, DL, MVT::i32, SHL, 12520 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 12521 } 12522 } 12523 } 12524 12525 // Nothing to be done for scalar shifts. 12526 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12527 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 12528 return SDValue(); 12529 12530 assert(ST->hasNEON() && "unexpected vector shift"); 12531 int64_t Cnt; 12532 12533 switch (N->getOpcode()) { 12534 default: llvm_unreachable("unexpected shift opcode"); 12535 12536 case ISD::SHL: 12537 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 12538 SDLoc dl(N); 12539 return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), 12540 DAG.getConstant(Cnt, dl, MVT::i32)); 12541 } 12542 break; 12543 12544 case ISD::SRA: 12545 case ISD::SRL: 12546 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 12547 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? 
12548 ARMISD::VSHRs : ARMISD::VSHRu); 12549 SDLoc dl(N); 12550 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 12551 DAG.getConstant(Cnt, dl, MVT::i32)); 12552 } 12553 } 12554 return SDValue(); 12555 } 12556 12557 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 12558 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 12559 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 12560 const ARMSubtarget *ST) { 12561 SDValue N0 = N->getOperand(0); 12562 12563 // Check for sign- and zero-extensions of vector extract operations of 8- 12564 // and 16-bit vector elements. NEON supports these directly. They are 12565 // handled during DAG combining because type legalization will promote them 12566 // to 32-bit types and it is messy to recognize the operations after that. 12567 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 12568 SDValue Vec = N0.getOperand(0); 12569 SDValue Lane = N0.getOperand(1); 12570 EVT VT = N->getValueType(0); 12571 EVT EltVT = N0.getValueType(); 12572 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12573 12574 if (VT == MVT::i32 && 12575 (EltVT == MVT::i8 || EltVT == MVT::i16) && 12576 TLI.isTypeLegal(Vec.getValueType()) && 12577 isa<ConstantSDNode>(Lane)) { 12578 12579 unsigned Opc = 0; 12580 switch (N->getOpcode()) { 12581 default: llvm_unreachable("unexpected opcode"); 12582 case ISD::SIGN_EXTEND: 12583 Opc = ARMISD::VGETLANEs; 12584 break; 12585 case ISD::ZERO_EXTEND: 12586 case ISD::ANY_EXTEND: 12587 Opc = ARMISD::VGETLANEu; 12588 break; 12589 } 12590 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 12591 } 12592 } 12593 12594 return SDValue(); 12595 } 12596 12597 static const APInt *isPowerOf2Constant(SDValue V) { 12598 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 12599 if (!C) 12600 return nullptr; 12601 const APInt *CV = &C->getAPIntValue(); 12602 return CV->isPowerOf2() ? CV : nullptr; 12603 } 12604 12605 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 12606 // If we have a CMOV, OR and AND combination such as: 12607 // if (x & CN) 12608 // y |= CM; 12609 // 12610 // And: 12611 // * CN is a single bit; 12612 // * All bits covered by CM are known zero in y 12613 // 12614 // Then we can convert this into a sequence of BFI instructions. This will 12615 // always be a win if CM is a single bit, will always be no worse than the 12616 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 12617 // three bits (due to the extra IT instruction). 12618 12619 SDValue Op0 = CMOV->getOperand(0); 12620 SDValue Op1 = CMOV->getOperand(1); 12621 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 12622 auto CC = CCNode->getAPIntValue().getLimitedValue(); 12623 SDValue CmpZ = CMOV->getOperand(4); 12624 12625 // The compare must be against zero. 12626 if (!isNullConstant(CmpZ->getOperand(1))) 12627 return SDValue(); 12628 12629 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 12630 SDValue And = CmpZ->getOperand(0); 12631 if (And->getOpcode() != ISD::AND) 12632 return SDValue(); 12633 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 12634 if (!AndC) 12635 return SDValue(); 12636 SDValue X = And->getOperand(0); 12637 12638 if (CC == ARMCC::EQ) { 12639 // We're performing an "equal to zero" compare. Swap the operands so we 12640 // canonicalize on a "not equal to zero" compare. 
12641 std::swap(Op0, Op1); 12642 } else { 12643 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 12644 } 12645 12646 if (Op1->getOpcode() != ISD::OR) 12647 return SDValue(); 12648 12649 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 12650 if (!OrC) 12651 return SDValue(); 12652 SDValue Y = Op1->getOperand(0); 12653 12654 if (Op0 != Y) 12655 return SDValue(); 12656 12657 // Now, is it profitable to continue? 12658 APInt OrCI = OrC->getAPIntValue(); 12659 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 12660 if (OrCI.countPopulation() > Heuristic) 12661 return SDValue(); 12662 12663 // Lastly, can we determine that the bits defined by OrCI 12664 // are zero in Y? 12665 KnownBits Known = DAG.computeKnownBits(Y); 12666 if ((OrCI & Known.Zero) != OrCI) 12667 return SDValue(); 12668 12669 // OK, we can do the combine. 12670 SDValue V = Y; 12671 SDLoc dl(X); 12672 EVT VT = X.getValueType(); 12673 unsigned BitInX = AndC->logBase2(); 12674 12675 if (BitInX != 0) { 12676 // We must shift X first. 12677 X = DAG.getNode(ISD::SRL, dl, VT, X, 12678 DAG.getConstant(BitInX, dl, VT)); 12679 } 12680 12681 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 12682 BitInY < NumActiveBits; ++BitInY) { 12683 if (OrCI[BitInY] == 0) 12684 continue; 12685 APInt Mask(VT.getSizeInBits(), 0); 12686 Mask.setBit(BitInY); 12687 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 12688 // Confusingly, the operand is an *inverted* mask. 12689 DAG.getConstant(~Mask, dl, VT)); 12690 } 12691 12692 return V; 12693 } 12694 12695 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. 12696 SDValue 12697 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { 12698 SDValue Cmp = N->getOperand(4); 12699 if (Cmp.getOpcode() != ARMISD::CMPZ) 12700 // Only looking at NE cases. 12701 return SDValue(); 12702 12703 EVT VT = N->getValueType(0); 12704 SDLoc dl(N); 12705 SDValue LHS = Cmp.getOperand(0); 12706 SDValue RHS = Cmp.getOperand(1); 12707 SDValue Chain = N->getOperand(0); 12708 SDValue BB = N->getOperand(1); 12709 SDValue ARMcc = N->getOperand(2); 12710 ARMCC::CondCodes CC = 12711 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 12712 12713 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) 12714 // -> (brcond Chain BB CC CPSR Cmp) 12715 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && 12716 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && 12717 LHS->getOperand(0)->hasOneUse()) { 12718 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0)); 12719 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1)); 12720 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 12721 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 12722 if ((LHS00C && LHS00C->getZExtValue() == 0) && 12723 (LHS01C && LHS01C->getZExtValue() == 1) && 12724 (LHS1C && LHS1C->getZExtValue() == 1) && 12725 (RHSC && RHSC->getZExtValue() == 0)) { 12726 return DAG.getNode( 12727 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), 12728 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); 12729 } 12730 } 12731 12732 return SDValue(); 12733 } 12734 12735 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. 12736 SDValue 12737 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { 12738 SDValue Cmp = N->getOperand(4); 12739 if (Cmp.getOpcode() != ARMISD::CMPZ) 12740 // Only looking at EQ and NE cases. 
12741 return SDValue(); 12742 12743 EVT VT = N->getValueType(0); 12744 SDLoc dl(N); 12745 SDValue LHS = Cmp.getOperand(0); 12746 SDValue RHS = Cmp.getOperand(1); 12747 SDValue FalseVal = N->getOperand(0); 12748 SDValue TrueVal = N->getOperand(1); 12749 SDValue ARMcc = N->getOperand(2); 12750 ARMCC::CondCodes CC = 12751 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); 12752 12753 // BFI is only available on V6T2+. 12754 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { 12755 SDValue R = PerformCMOVToBFICombine(N, DAG); 12756 if (R) 12757 return R; 12758 } 12759 12760 // Simplify 12761 // mov r1, r0 12762 // cmp r1, x 12763 // mov r0, y 12764 // moveq r0, x 12765 // to 12766 // cmp r0, x 12767 // movne r0, y 12768 // 12769 // mov r1, r0 12770 // cmp r1, x 12771 // mov r0, x 12772 // movne r0, y 12773 // to 12774 // cmp r0, x 12775 // movne r0, y 12776 /// FIXME: Turn this into a target neutral optimization? 12777 SDValue Res; 12778 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { 12779 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, 12780 N->getOperand(3), Cmp); 12781 } else if (CC == ARMCC::EQ && TrueVal == RHS) { 12782 SDValue ARMcc; 12783 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); 12784 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, 12785 N->getOperand(3), NewCmp); 12786 } 12787 12788 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) 12789 // -> (cmov F T CC CPSR Cmp) 12790 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { 12791 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)); 12792 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); 12793 auto *RHSC = dyn_cast<ConstantSDNode>(RHS); 12794 if ((LHS0C && LHS0C->getZExtValue() == 0) && 12795 (LHS1C && LHS1C->getZExtValue() == 1) && 12796 (RHSC && RHSC->getZExtValue() == 0)) { 12797 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, 12798 LHS->getOperand(2), LHS->getOperand(3), 12799 LHS->getOperand(4)); 12800 } 12801 } 12802 12803 if (!VT.isInteger()) 12804 return SDValue(); 12805 12806 // Materialize a boolean comparison for integers so we can avoid branching. 12807 if (isNullConstant(FalseVal)) { 12808 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { 12809 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { 12810 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it 12811 // right 5 bits will make that 32 be 1, otherwise it will be 0. 12812 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 12813 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12814 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), 12815 DAG.getConstant(5, dl, MVT::i32)); 12816 } else { 12817 // CMOV 0, 1, ==, (CMPZ x, y) -> 12818 // (ADDCARRY (SUB x, y), t:0, t:1) 12819 // where t = (SUBCARRY 0, (SUB x, y), 0) 12820 // 12821 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when 12822 // x != y. In other words, a carry C == 1 when x == y, C == 0 12823 // otherwise. 12824 // The final ADDCARRY computes 12825 // x - y + (0 - (x - y)) + C == C 12826 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); 12827 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 12828 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); 12829 // ISD::SUBCARRY returns a borrow but we want the carry here 12830 // actually. 
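        // That is, Carry = 1 - Borrow. For example, when x == y the SUB above
        // is 0, the USUBO of 0 - 0 produces no borrow, so Carry is 1 and the
        // ADDCARRY below evaluates to 1, as required for CMOV 0, 1, ==.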
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below.
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  //   t1 = (USUBO (SUB x, y), 1)
  //   t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  //   Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  //   t1 = (USUBO x, 1)
  //   t2 = (SUBCARRY x, t1:0, t1:1)
  //   Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
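    // For example, if the combined node is known to produce only 0 or 1, the
    // AssertZext below records that fact so later combines can still assume
    // the upper 31 bits are zero.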
12899 if (Known.Zero == 0xfffffffe) 12900 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12901 DAG.getValueType(MVT::i1)); 12902 else if (Known.Zero == 0xffffff00) 12903 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12904 DAG.getValueType(MVT::i8)); 12905 else if (Known.Zero == 0xffff0000) 12906 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, 12907 DAG.getValueType(MVT::i16)); 12908 } 12909 12910 return Res; 12911 } 12912 12913 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 12914 DAGCombinerInfo &DCI) const { 12915 switch (N->getOpcode()) { 12916 default: break; 12917 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 12918 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 12919 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 12920 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 12921 case ISD::SUB: return PerformSUBCombine(N, DCI); 12922 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 12923 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 12924 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 12925 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 12926 case ARMISD::ADDC: 12927 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 12928 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 12929 case ARMISD::BFI: return PerformBFICombine(N, DCI); 12930 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 12931 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 12932 case ISD::STORE: return PerformSTORECombine(N, DCI); 12933 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 12934 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 12935 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 12936 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); 12937 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); 12938 case ISD::FP_TO_SINT: 12939 case ISD::FP_TO_UINT: 12940 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 12941 case ISD::FDIV: 12942 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 12943 case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); 12944 case ISD::SHL: 12945 case ISD::SRA: 12946 case ISD::SRL: 12947 return PerformShiftCombine(N, DCI, Subtarget); 12948 case ISD::SIGN_EXTEND: 12949 case ISD::ZERO_EXTEND: 12950 case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); 12951 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 12952 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 12953 case ISD::LOAD: return PerformLOADCombine(N, DCI); 12954 case ARMISD::VLD1DUP: 12955 case ARMISD::VLD2DUP: 12956 case ARMISD::VLD3DUP: 12957 case ARMISD::VLD4DUP: 12958 return PerformVLDCombine(N, DCI); 12959 case ARMISD::BUILD_VECTOR: 12960 return PerformARMBUILD_VECTORCombine(N, DCI); 12961 case ARMISD::SMULWB: { 12962 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 12963 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 12964 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 12965 return SDValue(); 12966 break; 12967 } 12968 case ARMISD::SMULWT: { 12969 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 12970 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 12971 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 12972 return SDValue(); 12973 break; 12974 } 12975 case ARMISD::SMLALBB: { 12976 unsigned BitWidth = 
N->getValueType(0).getSizeInBits(); 12977 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 12978 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 12979 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 12980 return SDValue(); 12981 break; 12982 } 12983 case ARMISD::SMLALBT: { 12984 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 12985 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 12986 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 12987 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 12988 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 12989 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 12990 return SDValue(); 12991 break; 12992 } 12993 case ARMISD::SMLALTB: { 12994 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 12995 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 12996 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 12997 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 12998 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 12999 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 13000 return SDValue(); 13001 break; 13002 } 13003 case ARMISD::SMLALTT: { 13004 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 13005 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 13006 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 13007 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 13008 return SDValue(); 13009 break; 13010 } 13011 case ISD::INTRINSIC_VOID: 13012 case ISD::INTRINSIC_W_CHAIN: 13013 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 13014 case Intrinsic::arm_neon_vld1: 13015 case Intrinsic::arm_neon_vld1x2: 13016 case Intrinsic::arm_neon_vld1x3: 13017 case Intrinsic::arm_neon_vld1x4: 13018 case Intrinsic::arm_neon_vld2: 13019 case Intrinsic::arm_neon_vld3: 13020 case Intrinsic::arm_neon_vld4: 13021 case Intrinsic::arm_neon_vld2lane: 13022 case Intrinsic::arm_neon_vld3lane: 13023 case Intrinsic::arm_neon_vld4lane: 13024 case Intrinsic::arm_neon_vld2dup: 13025 case Intrinsic::arm_neon_vld3dup: 13026 case Intrinsic::arm_neon_vld4dup: 13027 case Intrinsic::arm_neon_vst1: 13028 case Intrinsic::arm_neon_vst1x2: 13029 case Intrinsic::arm_neon_vst1x3: 13030 case Intrinsic::arm_neon_vst1x4: 13031 case Intrinsic::arm_neon_vst2: 13032 case Intrinsic::arm_neon_vst3: 13033 case Intrinsic::arm_neon_vst4: 13034 case Intrinsic::arm_neon_vst2lane: 13035 case Intrinsic::arm_neon_vst3lane: 13036 case Intrinsic::arm_neon_vst4lane: 13037 return PerformVLDCombine(N, DCI); 13038 default: break; 13039 } 13040 break; 13041 } 13042 return SDValue(); 13043 } 13044 13045 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 13046 EVT VT) const { 13047 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 13048 } 13049 13050 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 13051 unsigned, 13052 unsigned, 13053 bool *Fast) const { 13054 // Depends what it gets converted into if the type is weird. 
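  // For example (a sketch of the intent): an unaligned i32 load is accepted
  // here when the subtarget allows unaligned accesses (SCTLR.A clear), and is
  // reported as fast on v7 cores; otherwise the generic code has to fall back
  // to an aligned access strategy.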
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32: {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
    return false;
  }
  case MVT::f64:
  case MVT::v2f64: {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
    return false;
  }
  }
}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsMemset, bool ZeroMemset,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  const Function &F = MF.getFunction();

  // See if we can use NEON instructions for this...
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
    bool Fast;
    if (Size >= 16 &&
        (memOpAlign(SrcAlign, DstAlign, 16) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
      return MVT::v2f64;
    } else if (Size >= 8 &&
               (memOpAlign(SrcAlign, DstAlign, 8) ||
                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
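    // (e.g. LDRB/LDRH already clear the upper bits of the destination
    // register, so no separate zero-extension instruction is needed)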
13158 return true; 13159 } 13160 13161 return false; 13162 } 13163 13164 bool ARMTargetLowering::isFNegFree(EVT VT) const { 13165 if (!VT.isSimple()) 13166 return false; 13167 13168 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 13169 // negate values directly (fneg is free). So, we don't want to let the DAG 13170 // combiner rewrite fneg into xors and some other instructions. For f16 and 13171 // FullFP16 argument passing, some bitcast nodes may be introduced, 13172 // triggering this DAG combine rewrite, so we are avoiding that with this. 13173 switch (VT.getSimpleVT().SimpleTy) { 13174 default: break; 13175 case MVT::f16: 13176 return Subtarget->hasFullFP16(); 13177 } 13178 13179 return false; 13180 } 13181 13182 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 13183 /// of the vector elements. 13184 static bool areExtractExts(Value *Ext1, Value *Ext2) { 13185 auto areExtDoubled = [](Instruction *Ext) { 13186 return Ext->getType()->getScalarSizeInBits() == 13187 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 13188 }; 13189 13190 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 13191 !match(Ext2, m_ZExtOrSExt(m_Value())) || 13192 !areExtDoubled(cast<Instruction>(Ext1)) || 13193 !areExtDoubled(cast<Instruction>(Ext2))) 13194 return false; 13195 13196 return true; 13197 } 13198 13199 /// Check if sinking \p I's operands to I's basic block is profitable, because 13200 /// the operands can be folded into a target instruction, e.g. 13201 /// sext/zext can be folded into vsubl. 13202 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 13203 SmallVectorImpl<Use *> &Ops) const { 13204 if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) 13205 return false; 13206 13207 switch (I->getOpcode()) { 13208 case Instruction::Sub: 13209 case Instruction::Add: { 13210 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 13211 return false; 13212 Ops.push_back(&I->getOperandUse(0)); 13213 Ops.push_back(&I->getOperandUse(1)); 13214 return true; 13215 } 13216 default: 13217 return false; 13218 } 13219 return false; 13220 } 13221 13222 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { 13223 EVT VT = ExtVal.getValueType(); 13224 13225 if (!isTypeLegal(VT)) 13226 return false; 13227 13228 // Don't create a loadext if we can fold the extension into a wide/long 13229 // instruction. 13230 // If there's more than one user instruction, the loadext is desirable no 13231 // matter what. There can be two uses by the same instruction. 13232 if (ExtVal->use_empty() || 13233 !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) 13234 return true; 13235 13236 SDNode *U = *ExtVal->use_begin(); 13237 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || 13238 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) 13239 return false; 13240 13241 return true; 13242 } 13243 13244 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 13245 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13246 return false; 13247 13248 if (!isTypeLegal(EVT::getEVT(Ty1))) 13249 return false; 13250 13251 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 13252 13253 // Assuming the caller doesn't have a zeroext or signext return parameter, 13254 // truncation all the way down to i1 is valid. 
13255 return true; 13256 } 13257 13258 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, 13259 const AddrMode &AM, Type *Ty, 13260 unsigned AS) const { 13261 if (isLegalAddressingMode(DL, AM, Ty, AS)) { 13262 if (Subtarget->hasFPAO()) 13263 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster 13264 return 0; 13265 } 13266 return -1; 13267 } 13268 13269 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { 13270 if (V < 0) 13271 return false; 13272 13273 unsigned Scale = 1; 13274 switch (VT.getSimpleVT().SimpleTy) { 13275 default: return false; 13276 case MVT::i1: 13277 case MVT::i8: 13278 // Scale == 1; 13279 break; 13280 case MVT::i16: 13281 // Scale == 2; 13282 Scale = 2; 13283 break; 13284 case MVT::i32: 13285 // Scale == 4; 13286 Scale = 4; 13287 break; 13288 } 13289 13290 if ((V & (Scale - 1)) != 0) 13291 return false; 13292 return isUInt<5>(V / Scale); 13293 } 13294 13295 static bool isLegalT2AddressImmediate(int64_t V, EVT VT, 13296 const ARMSubtarget *Subtarget) { 13297 if (!VT.isInteger() && !VT.isFloatingPoint()) 13298 return false; 13299 if (Subtarget->hasNEON() && VT.isVector()) 13300 return false; 13301 13302 bool IsNeg = false; 13303 if (V < 0) { 13304 IsNeg = true; 13305 V = -V; 13306 } 13307 13308 unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U); 13309 13310 // VLDR and LDRD: 4 * imm8 13311 if ((VT.isFloatingPoint() && Subtarget->hasVFP2()) || NumBytes == 8) 13312 return isShiftedUInt<8, 2>(V); 13313 13314 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { 13315 // + imm12 or - imm8 13316 if (IsNeg) 13317 return isUInt<8>(V); 13318 return isUInt<12>(V); 13319 } 13320 13321 return false; 13322 } 13323 13324 /// isLegalAddressImmediate - Return true if the integer value can be used 13325 /// as the offset of the target addressing mode for load / store of the 13326 /// given type. 13327 static bool isLegalAddressImmediate(int64_t V, EVT VT, 13328 const ARMSubtarget *Subtarget) { 13329 if (V == 0) 13330 return true; 13331 13332 if (!VT.isSimple()) 13333 return false; 13334 13335 if (Subtarget->isThumb1Only()) 13336 return isLegalT1AddressImmediate(V, VT); 13337 else if (Subtarget->isThumb2()) 13338 return isLegalT2AddressImmediate(V, VT, Subtarget); 13339 13340 // ARM mode. 13341 if (V < 0) 13342 V = - V; 13343 switch (VT.getSimpleVT().SimpleTy) { 13344 default: return false; 13345 case MVT::i1: 13346 case MVT::i8: 13347 case MVT::i32: 13348 // +- imm12 13349 return isUInt<12>(V); 13350 case MVT::i16: 13351 // +- imm8 13352 return isUInt<8>(V); 13353 case MVT::f32: 13354 case MVT::f64: 13355 if (!Subtarget->hasVFP2()) // FIXME: NEON? 13356 return false; 13357 return isShiftedUInt<8, 2>(V); 13358 } 13359 } 13360 13361 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 13362 EVT VT) const { 13363 int Scale = AM.Scale; 13364 if (Scale < 0) 13365 return false; 13366 13367 switch (VT.getSimpleVT().SimpleTy) { 13368 default: return false; 13369 case MVT::i1: 13370 case MVT::i8: 13371 case MVT::i16: 13372 case MVT::i32: 13373 if (Scale == 1) 13374 return true; 13375 // r + r << imm 13376 Scale = Scale & ~1; 13377 return Scale == 2 || Scale == 4 || Scale == 8; 13378 case MVT::i64: 13379 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 13380 // version in Thumb mode. 13381 // r + r 13382 if (Scale == 1) 13383 return true; 13384 // r * 2 (this can be lowered to r + r). 
13385 if (!AM.HasBaseReg && Scale == 2) 13386 return true; 13387 return false; 13388 case MVT::isVoid: 13389 // Note, we allow "void" uses (basically, uses that aren't loads or 13390 // stores), because arm allows folding a scale into many arithmetic 13391 // operations. This should be made more precise and revisited later. 13392 13393 // Allow r << imm, but the imm has to be a multiple of two. 13394 if (Scale & 1) return false; 13395 return isPowerOf2_32(Scale); 13396 } 13397 } 13398 13399 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 13400 EVT VT) const { 13401 const int Scale = AM.Scale; 13402 13403 // Negative scales are not supported in Thumb1. 13404 if (Scale < 0) 13405 return false; 13406 13407 // Thumb1 addressing modes do not support register scaling excepting the 13408 // following cases: 13409 // 1. Scale == 1 means no scaling. 13410 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 13411 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 13412 } 13413 13414 /// isLegalAddressingMode - Return true if the addressing mode represented 13415 /// by AM is legal for this target, for a load/store of the specified type. 13416 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 13417 const AddrMode &AM, Type *Ty, 13418 unsigned AS, Instruction *I) const { 13419 EVT VT = getValueType(DL, Ty, true); 13420 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 13421 return false; 13422 13423 // Can never fold addr of global into load/store. 13424 if (AM.BaseGV) 13425 return false; 13426 13427 switch (AM.Scale) { 13428 case 0: // no scale reg, must be "r+i" or "r", or "i". 13429 break; 13430 default: 13431 // ARM doesn't support any R+R*scale+imm addr modes. 13432 if (AM.BaseOffs) 13433 return false; 13434 13435 if (!VT.isSimple()) 13436 return false; 13437 13438 if (Subtarget->isThumb1Only()) 13439 return isLegalT1ScaledAddressingMode(AM, VT); 13440 13441 if (Subtarget->isThumb2()) 13442 return isLegalT2ScaledAddressingMode(AM, VT); 13443 13444 int Scale = AM.Scale; 13445 switch (VT.getSimpleVT().SimpleTy) { 13446 default: return false; 13447 case MVT::i1: 13448 case MVT::i8: 13449 case MVT::i32: 13450 if (Scale < 0) Scale = -Scale; 13451 if (Scale == 1) 13452 return true; 13453 // r + r << imm 13454 return isPowerOf2_32(Scale & ~1); 13455 case MVT::i16: 13456 case MVT::i64: 13457 // r +/- r 13458 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 13459 return true; 13460 // r * 2 (this can be lowered to r + r). 13461 if (!AM.HasBaseReg && Scale == 2) 13462 return true; 13463 return false; 13464 13465 case MVT::isVoid: 13466 // Note, we allow "void" uses (basically, uses that aren't loads or 13467 // stores), because arm allows folding a scale into many arithmetic 13468 // operations. This should be made more precise and revisited later. 13469 13470 // Allow r << imm, but the imm has to be a multiple of two. 13471 if (Scale & 1) return false; 13472 return isPowerOf2_32(Scale); 13473 } 13474 } 13475 return true; 13476 } 13477 13478 /// isLegalICmpImmediate - Return true if the specified immediate is legal 13479 /// icmp immediate, that is the target has icmp instructions which can compare 13480 /// a register against the immediate without having to materialize the 13481 /// immediate into a register. 13482 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 13483 // Thumb2 and ARM modes can use cmn for negative immediates. 
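// For example, "icmp eq i32 %x, -42" has no direct "cmp rN, #-42" encoding,
// but it can be selected as "cmn rN, #42", which sets the same flags, so
// -42 is reported as legal here.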
13484 if (!Subtarget->isThumb()) 13485 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 13486 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 13487 if (Subtarget->isThumb2()) 13488 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 13489 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 13490 // Thumb1 doesn't have cmn, and only 8-bit immediates. 13491 return Imm >= 0 && Imm <= 255; 13492 } 13493 13494 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 13495 /// *or sub* immediate, that is the target has add or sub instructions which can 13496 /// add a register with the immediate without having to materialize the 13497 /// immediate into a register. 13498 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 13499 // Same encoding for add/sub, just flip the sign. 13500 int64_t AbsImm = std::abs(Imm); 13501 if (!Subtarget->isThumb()) 13502 return ARM_AM::getSOImmVal(AbsImm) != -1; 13503 if (Subtarget->isThumb2()) 13504 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 13505 // Thumb1 only has 8-bit unsigned immediate. 13506 return AbsImm >= 0 && AbsImm <= 255; 13507 } 13508 13509 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 13510 bool isSEXTLoad, SDValue &Base, 13511 SDValue &Offset, bool &isInc, 13512 SelectionDAG &DAG) { 13513 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 13514 return false; 13515 13516 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 13517 // AddressingMode 3 13518 Base = Ptr->getOperand(0); 13519 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13520 int RHSC = (int)RHS->getZExtValue(); 13521 if (RHSC < 0 && RHSC > -256) { 13522 assert(Ptr->getOpcode() == ISD::ADD); 13523 isInc = false; 13524 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13525 return true; 13526 } 13527 } 13528 isInc = (Ptr->getOpcode() == ISD::ADD); 13529 Offset = Ptr->getOperand(1); 13530 return true; 13531 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 13532 // AddressingMode 2 13533 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13534 int RHSC = (int)RHS->getZExtValue(); 13535 if (RHSC < 0 && RHSC > -0x1000) { 13536 assert(Ptr->getOpcode() == ISD::ADD); 13537 isInc = false; 13538 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13539 Base = Ptr->getOperand(0); 13540 return true; 13541 } 13542 } 13543 13544 if (Ptr->getOpcode() == ISD::ADD) { 13545 isInc = true; 13546 ARM_AM::ShiftOpc ShOpcVal= 13547 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 13548 if (ShOpcVal != ARM_AM::no_shift) { 13549 Base = Ptr->getOperand(1); 13550 Offset = Ptr->getOperand(0); 13551 } else { 13552 Base = Ptr->getOperand(0); 13553 Offset = Ptr->getOperand(1); 13554 } 13555 return true; 13556 } 13557 13558 isInc = (Ptr->getOpcode() == ISD::ADD); 13559 Base = Ptr->getOperand(0); 13560 Offset = Ptr->getOperand(1); 13561 return true; 13562 } 13563 13564 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 13565 return false; 13566 } 13567 13568 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 13569 bool isSEXTLoad, SDValue &Base, 13570 SDValue &Offset, bool &isInc, 13571 SelectionDAG &DAG) { 13572 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 13573 return false; 13574 13575 Base = Ptr->getOperand(0); 13576 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 13577 int RHSC = (int)RHS->getZExtValue(); 13578 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 
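// Illustrative: Thumb2 pre/post-indexed loads and stores encode a negative
// offset as a subtracted 8-bit immediate (e.g. "ldr r0, [r1, #-8]!"),
// which is why only offsets in [-255, -1] are accepted on this path.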
13579 assert(Ptr->getOpcode() == ISD::ADD); 13580 isInc = false; 13581 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13582 return true; 13583 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 13584 isInc = Ptr->getOpcode() == ISD::ADD; 13585 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 13586 return true; 13587 } 13588 } 13589 13590 return false; 13591 } 13592 13593 /// getPreIndexedAddressParts - returns true by value, base pointer and 13594 /// offset pointer and addressing mode by reference if the node's address 13595 /// can be legally represented as pre-indexed load / store address. 13596 bool 13597 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 13598 SDValue &Offset, 13599 ISD::MemIndexedMode &AM, 13600 SelectionDAG &DAG) const { 13601 if (Subtarget->isThumb1Only()) 13602 return false; 13603 13604 EVT VT; 13605 SDValue Ptr; 13606 bool isSEXTLoad = false; 13607 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 13608 Ptr = LD->getBasePtr(); 13609 VT = LD->getMemoryVT(); 13610 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 13611 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 13612 Ptr = ST->getBasePtr(); 13613 VT = ST->getMemoryVT(); 13614 } else 13615 return false; 13616 13617 bool isInc; 13618 bool isLegal = false; 13619 if (Subtarget->isThumb2()) 13620 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13621 Offset, isInc, DAG); 13622 else 13623 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 13624 Offset, isInc, DAG); 13625 if (!isLegal) 13626 return false; 13627 13628 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; 13629 return true; 13630 } 13631 13632 /// getPostIndexedAddressParts - returns true by value, base pointer and 13633 /// offset pointer and addressing mode by reference if this node can be 13634 /// combined with a load / store to form a post-indexed load / store. 13635 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 13636 SDValue &Base, 13637 SDValue &Offset, 13638 ISD::MemIndexedMode &AM, 13639 SelectionDAG &DAG) const { 13640 EVT VT; 13641 SDValue Ptr; 13642 bool isSEXTLoad = false, isNonExt; 13643 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 13644 VT = LD->getMemoryVT(); 13645 Ptr = LD->getBasePtr(); 13646 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 13647 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 13648 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 13649 VT = ST->getMemoryVT(); 13650 Ptr = ST->getBasePtr(); 13651 isNonExt = !ST->isTruncatingStore(); 13652 } else 13653 return false; 13654 13655 if (Subtarget->isThumb1Only()) { 13656 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 13657 // must be non-extending/truncating, i32, with an offset of 4. 
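// For example, "ldr r1, [r0]" followed by "adds r0, r0, #4" can be folded
// into the updating "ldm r0!, {r1}" form mentioned above.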
13658 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 13659 if (Op->getOpcode() != ISD::ADD || !isNonExt) 13660 return false; 13661 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 13662 if (!RHS || RHS->getZExtValue() != 4) 13663 return false; 13664 13665 Offset = Op->getOperand(1); 13666 Base = Op->getOperand(0); 13667 AM = ISD::POST_INC; 13668 return true; 13669 } 13670 13671 bool isInc; 13672 bool isLegal = false; 13673 if (Subtarget->isThumb2()) 13674 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 13675 isInc, DAG); 13676 else 13677 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 13678 isInc, DAG); 13679 if (!isLegal) 13680 return false; 13681 13682 if (Ptr != Base) { 13683 // Swap base ptr and offset to catch more post-index load / store when 13684 // it's legal. In Thumb2 mode, offset must be an immediate. 13685 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 13686 !Subtarget->isThumb2()) 13687 std::swap(Base, Offset); 13688 13689 // Post-indexed load / store update the base pointer. 13690 if (Ptr != Base) 13691 return false; 13692 } 13693 13694 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 13695 return true; 13696 } 13697 13698 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 13699 KnownBits &Known, 13700 const APInt &DemandedElts, 13701 const SelectionDAG &DAG, 13702 unsigned Depth) const { 13703 unsigned BitWidth = Known.getBitWidth(); 13704 Known.resetAll(); 13705 switch (Op.getOpcode()) { 13706 default: break; 13707 case ARMISD::ADDC: 13708 case ARMISD::ADDE: 13709 case ARMISD::SUBC: 13710 case ARMISD::SUBE: 13711 // Special cases when we convert a carry to a boolean. 13712 if (Op.getResNo() == 0) { 13713 SDValue LHS = Op.getOperand(0); 13714 SDValue RHS = Op.getOperand(1); 13715 // (ADDE 0, 0, C) will give us a single bit. 13716 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 13717 isNullConstant(RHS)) { 13718 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 13719 return; 13720 } 13721 } 13722 break; 13723 case ARMISD::CMOV: { 13724 // Bits are known zero/one if known on the LHS and RHS. 13725 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 13726 if (Known.isUnknown()) 13727 return; 13728 13729 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 13730 Known.Zero &= KnownRHS.Zero; 13731 Known.One &= KnownRHS.One; 13732 return; 13733 } 13734 case ISD::INTRINSIC_W_CHAIN: { 13735 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 13736 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 13737 switch (IntID) { 13738 default: return; 13739 case Intrinsic::arm_ldaex: 13740 case Intrinsic::arm_ldrex: { 13741 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 13742 unsigned MemBits = VT.getScalarSizeInBits(); 13743 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 13744 return; 13745 } 13746 } 13747 } 13748 case ARMISD::BFI: { 13749 // Conservatively, we can recurse down the first operand 13750 // and just mask out all affected bits. 13751 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 13752 13753 // The operand to BFI is already a mask suitable for removing the bits it 13754 // sets. 
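// For example (sketch): a BFI that writes bits [15:8] carries the mask
// 0xFFFF00FF, so the ANDs below clear any known bits in the inserted byte
// while preserving what is known about the untouched bits.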
13755 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 13756 const APInt &Mask = CI->getAPIntValue(); 13757 Known.Zero &= Mask; 13758 Known.One &= Mask; 13759 return; 13760 } 13761 case ARMISD::VGETLANEs: 13762 case ARMISD::VGETLANEu: { 13763 const SDValue &SrcSV = Op.getOperand(0); 13764 EVT VecVT = SrcSV.getValueType(); 13765 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 13766 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 13767 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 13768 assert(Pos->getAPIntValue().ult(NumSrcElts) && 13769 "VGETLANE index out of bounds"); 13770 unsigned Idx = Pos->getZExtValue(); 13771 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 13772 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 13773 13774 EVT VT = Op.getValueType(); 13775 const unsigned DstSz = VT.getScalarSizeInBits(); 13776 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 13777 (void)SrcSz; 13778 assert(SrcSz == Known.getBitWidth()); 13779 assert(DstSz > SrcSz); 13780 if (Op.getOpcode() == ARMISD::VGETLANEs) 13781 Known = Known.sext(DstSz); 13782 else { 13783 Known = Known.zext(DstSz, true /* extended bits are known zero */); 13784 } 13785 assert(DstSz == Known.getBitWidth()); 13786 break; 13787 } 13788 } 13789 } 13790 13791 bool 13792 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op, 13793 const APInt &DemandedAPInt, 13794 TargetLoweringOpt &TLO) const { 13795 // Delay optimization, so we don't have to deal with illegal types, or block 13796 // optimizations. 13797 if (!TLO.LegalOps) 13798 return false; 13799 13800 // Only optimize AND for now. 13801 if (Op.getOpcode() != ISD::AND) 13802 return false; 13803 13804 EVT VT = Op.getValueType(); 13805 13806 // Ignore vectors. 13807 if (VT.isVector()) 13808 return false; 13809 13810 assert(VT == MVT::i32 && "Unexpected integer type"); 13811 13812 // Make sure the RHS really is a constant. 13813 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 13814 if (!C) 13815 return false; 13816 13817 unsigned Mask = C->getZExtValue(); 13818 13819 unsigned Demanded = DemandedAPInt.getZExtValue(); 13820 unsigned ShrunkMask = Mask & Demanded; 13821 unsigned ExpandedMask = Mask | ~Demanded; 13822 13823 // If the mask is all zeros, let the target-independent code replace the 13824 // result with zero. 13825 if (ShrunkMask == 0) 13826 return false; 13827 13828 // If the mask is all ones, erase the AND. (Currently, the target-independent 13829 // code won't do this, so we have to do it explicitly to avoid an infinite 13830 // loop in obscure cases.) 13831 if (ExpandedMask == ~0U) 13832 return TLO.CombineTo(Op, Op.getOperand(0)); 13833 13834 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 13835 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 13836 }; 13837 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 13838 if (NewMask == Mask) 13839 return true; 13840 SDLoc DL(Op); 13841 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 13842 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 13843 return TLO.CombineTo(Op, NewOp); 13844 }; 13845 13846 // Prefer uxtb mask. 13847 if (IsLegalMask(0xFF)) 13848 return UseMask(0xFF); 13849 13850 // Prefer uxth mask. 13851 if (IsLegalMask(0xFFFF)) 13852 return UseMask(0xFFFF); 13853 13854 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 
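// For example, if Mask is 0x7F000003 but only bit 1 is demanded, ShrunkMask
// is 2 and an "and" with 2 is emitted instead; Thumb1 can materialize that
// as "movs rT, #2; ands rD, rT".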
13855 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 13856 if (ShrunkMask < 256) 13857 return UseMask(ShrunkMask); 13858 13859 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 13860 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 13861 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 13862 return UseMask(ExpandedMask); 13863 13864 // Potential improvements: 13865 // 13866 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 13867 // We could try to prefer Thumb1 immediates which can be lowered to a 13868 // two-instruction sequence. 13869 // We could try to recognize more legal ARM/Thumb2 immediates here. 13870 13871 return false; 13872 } 13873 13874 13875 //===----------------------------------------------------------------------===// 13876 // ARM Inline Assembly Support 13877 //===----------------------------------------------------------------------===// 13878 13879 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 13880 // Looking for "rev" which is V6+. 13881 if (!Subtarget->hasV6Ops()) 13882 return false; 13883 13884 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 13885 std::string AsmStr = IA->getAsmString(); 13886 SmallVector<StringRef, 4> AsmPieces; 13887 SplitString(AsmStr, AsmPieces, ";\n"); 13888 13889 switch (AsmPieces.size()) { 13890 default: return false; 13891 case 1: 13892 AsmStr = AsmPieces[0]; 13893 AsmPieces.clear(); 13894 SplitString(AsmStr, AsmPieces, " \t,"); 13895 13896 // rev $0, $1 13897 if (AsmPieces.size() == 3 && 13898 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 13899 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 13900 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13901 if (Ty && Ty->getBitWidth() == 32) 13902 return IntrinsicLowering::LowerToByteSwap(CI); 13903 } 13904 break; 13905 } 13906 13907 return false; 13908 } 13909 13910 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 13911 // At this point, we have to lower this constraint to something else, so we 13912 // lower it to an "r" or "w". However, by doing this we will force the result 13913 // to be in register, while the X constraint is much more permissive. 13914 // 13915 // Although we are correct (we are free to emit anything, without 13916 // constraints), we might break use cases that would expect us to be more 13917 // efficient and emit something else. 13918 if (!Subtarget->hasVFP2()) 13919 return "r"; 13920 if (ConstraintVT.isFloatingPoint()) 13921 return "w"; 13922 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 13923 (ConstraintVT.getSizeInBits() == 64 || 13924 ConstraintVT.getSizeInBits() == 128)) 13925 return "w"; 13926 13927 return "r"; 13928 } 13929 13930 /// getConstraintType - Given a constraint letter, return the type of 13931 /// constraint it is for this target. 13932 ARMTargetLowering::ConstraintType 13933 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 13934 if (Constraint.size() == 1) { 13935 switch (Constraint[0]) { 13936 default: break; 13937 case 'l': return C_RegisterClass; 13938 case 'w': return C_RegisterClass; 13939 case 'h': return C_RegisterClass; 13940 case 'x': return C_RegisterClass; 13941 case 't': return C_RegisterClass; 13942 case 'j': return C_Other; // Constant for movw. 13943 // An address with a single base register. Due to the way we 13944 // currently handle addresses it is the same as an 'r' memory constraint. 
13945 case 'Q': return C_Memory; 13946 } 13947 } else if (Constraint.size() == 2) { 13948 switch (Constraint[0]) { 13949 default: break; 13950 // All 'U+' constraints are addresses. 13951 case 'U': return C_Memory; 13952 } 13953 } 13954 return TargetLowering::getConstraintType(Constraint); 13955 } 13956 13957 /// Examine constraint type and operand type and determine a weight value. 13958 /// This object must already have been set up with the operand type 13959 /// and the current alternative constraint selected. 13960 TargetLowering::ConstraintWeight 13961 ARMTargetLowering::getSingleConstraintMatchWeight( 13962 AsmOperandInfo &info, const char *constraint) const { 13963 ConstraintWeight weight = CW_Invalid; 13964 Value *CallOperandVal = info.CallOperandVal; 13965 // If we don't have a value, we can't do a match, 13966 // but allow it at the lowest weight. 13967 if (!CallOperandVal) 13968 return CW_Default; 13969 Type *type = CallOperandVal->getType(); 13970 // Look at the constraint type. 13971 switch (*constraint) { 13972 default: 13973 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 13974 break; 13975 case 'l': 13976 if (type->isIntegerTy()) { 13977 if (Subtarget->isThumb()) 13978 weight = CW_SpecificReg; 13979 else 13980 weight = CW_Register; 13981 } 13982 break; 13983 case 'w': 13984 if (type->isFloatingPointTy()) 13985 weight = CW_Register; 13986 break; 13987 } 13988 return weight; 13989 } 13990 13991 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 13992 13993 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 13994 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 13995 if (Constraint.size() == 1) { 13996 // GCC ARM Constraint Letters 13997 switch (Constraint[0]) { 13998 case 'l': // Low regs or general regs. 13999 if (Subtarget->isThumb()) 14000 return RCPair(0U, &ARM::tGPRRegClass); 14001 return RCPair(0U, &ARM::GPRRegClass); 14002 case 'h': // High regs or no regs. 14003 if (Subtarget->isThumb()) 14004 return RCPair(0U, &ARM::hGPRRegClass); 14005 break; 14006 case 'r': 14007 if (Subtarget->isThumb1Only()) 14008 return RCPair(0U, &ARM::tGPRRegClass); 14009 return RCPair(0U, &ARM::GPRRegClass); 14010 case 'w': 14011 if (VT == MVT::Other) 14012 break; 14013 if (VT == MVT::f32) 14014 return RCPair(0U, &ARM::SPRRegClass); 14015 if (VT.getSizeInBits() == 64) 14016 return RCPair(0U, &ARM::DPRRegClass); 14017 if (VT.getSizeInBits() == 128) 14018 return RCPair(0U, &ARM::QPRRegClass); 14019 break; 14020 case 'x': 14021 if (VT == MVT::Other) 14022 break; 14023 if (VT == MVT::f32) 14024 return RCPair(0U, &ARM::SPR_8RegClass); 14025 if (VT.getSizeInBits() == 64) 14026 return RCPair(0U, &ARM::DPR_8RegClass); 14027 if (VT.getSizeInBits() == 128) 14028 return RCPair(0U, &ARM::QPR_8RegClass); 14029 break; 14030 case 't': 14031 if (VT == MVT::Other) 14032 break; 14033 if (VT == MVT::f32 || VT == MVT::i32) 14034 return RCPair(0U, &ARM::SPRRegClass); 14035 if (VT.getSizeInBits() == 64) 14036 return RCPair(0U, &ARM::DPR_VFP2RegClass); 14037 if (VT.getSizeInBits() == 128) 14038 return RCPair(0U, &ARM::QPR_VFP2RegClass); 14039 break; 14040 } 14041 } 14042 if (StringRef("{cc}").equals_lower(Constraint)) 14043 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 14044 14045 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 14046 } 14047 14048 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 14049 /// vector. If it is invalid, don't add anything to Ops. 
14050 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 14051 std::string &Constraint, 14052 std::vector<SDValue>&Ops, 14053 SelectionDAG &DAG) const { 14054 SDValue Result; 14055 14056 // Currently only support length 1 constraints. 14057 if (Constraint.length() != 1) return; 14058 14059 char ConstraintLetter = Constraint[0]; 14060 switch (ConstraintLetter) { 14061 default: break; 14062 case 'j': 14063 case 'I': case 'J': case 'K': case 'L': 14064 case 'M': case 'N': case 'O': 14065 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 14066 if (!C) 14067 return; 14068 14069 int64_t CVal64 = C->getSExtValue(); 14070 int CVal = (int) CVal64; 14071 // None of these constraints allow values larger than 32 bits. Check 14072 // that the value fits in an int. 14073 if (CVal != CVal64) 14074 return; 14075 14076 switch (ConstraintLetter) { 14077 case 'j': 14078 // Constant suitable for movw, must be between 0 and 14079 // 65535. 14080 if (Subtarget->hasV6T2Ops()) 14081 if (CVal >= 0 && CVal <= 65535) 14082 break; 14083 return; 14084 case 'I': 14085 if (Subtarget->isThumb1Only()) { 14086 // This must be a constant between 0 and 255, for ADD 14087 // immediates. 14088 if (CVal >= 0 && CVal <= 255) 14089 break; 14090 } else if (Subtarget->isThumb2()) { 14091 // A constant that can be used as an immediate value in a 14092 // data-processing instruction. 14093 if (ARM_AM::getT2SOImmVal(CVal) != -1) 14094 break; 14095 } else { 14096 // A constant that can be used as an immediate value in a 14097 // data-processing instruction. 14098 if (ARM_AM::getSOImmVal(CVal) != -1) 14099 break; 14100 } 14101 return; 14102 14103 case 'J': 14104 if (Subtarget->isThumb1Only()) { 14105 // This must be a constant between -255 and -1, for negated ADD 14106 // immediates. This can be used in GCC with an "n" modifier that 14107 // prints the negated value, for use with SUB instructions. It is 14108 // not useful otherwise but is implemented for compatibility. 14109 if (CVal >= -255 && CVal <= -1) 14110 break; 14111 } else { 14112 // This must be a constant between -4095 and 4095. It is not clear 14113 // what this constraint is intended for. Implemented for 14114 // compatibility with GCC. 14115 if (CVal >= -4095 && CVal <= 4095) 14116 break; 14117 } 14118 return; 14119 14120 case 'K': 14121 if (Subtarget->isThumb1Only()) { 14122 // A 32-bit value where only one byte has a nonzero value. Exclude 14123 // zero to match GCC. This constraint is used by GCC internally for 14124 // constants that can be loaded with a move/shift combination. 14125 // It is not useful otherwise but is implemented for compatibility. 14126 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 14127 break; 14128 } else if (Subtarget->isThumb2()) { 14129 // A constant whose bitwise inverse can be used as an immediate 14130 // value in a data-processing instruction. This can be used in GCC 14131 // with a "B" modifier that prints the inverted value, for use with 14132 // BIC and MVN instructions. It is not useful otherwise but is 14133 // implemented for compatibility. 14134 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 14135 break; 14136 } else { 14137 // A constant whose bitwise inverse can be used as an immediate 14138 // value in a data-processing instruction. This can be used in GCC 14139 // with a "B" modifier that prints the inverted value, for use with 14140 // BIC and MVN instructions. It is not useful otherwise but is 14141 // implemented for compatibility. 
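// For example, CVal == 0xFFFFFF00 is accepted here because
// ~CVal == 0xFF is a valid ARM modified immediate, so the inline asm can
// use it as "bic rD, rN, #255" via the "B" modifier.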
14142 if (ARM_AM::getSOImmVal(~CVal) != -1) 14143 break; 14144 } 14145 return; 14146 14147 case 'L': 14148 if (Subtarget->isThumb1Only()) { 14149 // This must be a constant between -7 and 7, 14150 // for 3-operand ADD/SUB immediate instructions. 14151 if (CVal >= -7 && CVal < 7) 14152 break; 14153 } else if (Subtarget->isThumb2()) { 14154 // A constant whose negation can be used as an immediate value in a 14155 // data-processing instruction. This can be used in GCC with an "n" 14156 // modifier that prints the negated value, for use with SUB 14157 // instructions. It is not useful otherwise but is implemented for 14158 // compatibility. 14159 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 14160 break; 14161 } else { 14162 // A constant whose negation can be used as an immediate value in a 14163 // data-processing instruction. This can be used in GCC with an "n" 14164 // modifier that prints the negated value, for use with SUB 14165 // instructions. It is not useful otherwise but is implemented for 14166 // compatibility. 14167 if (ARM_AM::getSOImmVal(-CVal) != -1) 14168 break; 14169 } 14170 return; 14171 14172 case 'M': 14173 if (Subtarget->isThumb1Only()) { 14174 // This must be a multiple of 4 between 0 and 1020, for 14175 // ADD sp + immediate. 14176 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 14177 break; 14178 } else { 14179 // A power of two or a constant between 0 and 32. This is used in 14180 // GCC for the shift amount on shifted register operands, but it is 14181 // useful in general for any shift amounts. 14182 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 14183 break; 14184 } 14185 return; 14186 14187 case 'N': 14188 if (Subtarget->isThumb()) { // FIXME thumb2 14189 // This must be a constant between 0 and 31, for shift amounts. 14190 if (CVal >= 0 && CVal <= 31) 14191 break; 14192 } 14193 return; 14194 14195 case 'O': 14196 if (Subtarget->isThumb()) { // FIXME thumb2 14197 // This must be a multiple of 4 between -508 and 508, for 14198 // ADD/SUB sp = sp + immediate. 14199 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 14200 break; 14201 } 14202 return; 14203 } 14204 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 14205 break; 14206 } 14207 14208 if (Result.getNode()) { 14209 Ops.push_back(Result); 14210 return; 14211 } 14212 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 14213 } 14214 14215 static RTLIB::Libcall getDivRemLibcall( 14216 const SDNode *N, MVT::SimpleValueType SVT) { 14217 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 14218 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 14219 "Unhandled Opcode in getDivRemLibcall"); 14220 bool isSigned = N->getOpcode() == ISD::SDIVREM || 14221 N->getOpcode() == ISD::SREM; 14222 RTLIB::Libcall LC; 14223 switch (SVT) { 14224 default: llvm_unreachable("Unexpected request for libcall!"); 14225 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 14226 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 14227 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 14228 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 14229 } 14230 return LC; 14231 } 14232 14233 static TargetLowering::ArgListTy getDivRemArgList( 14234 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 14235 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 14236 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 14237 "Unhandled Opcode in getDivRemArgList"); 14238 bool isSigned = N->getOpcode() == ISD::SDIVREM || 14239 N->getOpcode() == ISD::SREM; 14240 TargetLowering::ArgListTy Args; 14241 TargetLowering::ArgListEntry Entry; 14242 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 14243 EVT ArgVT = N->getOperand(i).getValueType(); 14244 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 14245 Entry.Node = N->getOperand(i); 14246 Entry.Ty = ArgTy; 14247 Entry.IsSExt = isSigned; 14248 Entry.IsZExt = !isSigned; 14249 Args.push_back(Entry); 14250 } 14251 if (Subtarget->isTargetWindows() && Args.size() >= 2) 14252 std::swap(Args[0], Args[1]); 14253 return Args; 14254 } 14255 14256 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 14257 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 14258 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 14259 Subtarget->isTargetWindows()) && 14260 "Register-based DivRem lowering only"); 14261 unsigned Opcode = Op->getOpcode(); 14262 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 14263 "Invalid opcode for Div/Rem lowering"); 14264 bool isSigned = (Opcode == ISD::SDIVREM); 14265 EVT VT = Op->getValueType(0); 14266 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 14267 SDLoc dl(Op); 14268 14269 // If the target has hardware divide, use divide + multiply + subtract: 14270 // div = a / b 14271 // rem = a - b * div 14272 // return {div, rem} 14273 // This should be lowered into UDIV/SDIV + MLS later on. 14274 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 14275 : Subtarget->hasDivideInARMMode(); 14276 if (hasDivide && Op->getValueType(0).isSimple() && 14277 Op->getSimpleValueType(0) == MVT::i32) { 14278 unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; 14279 const SDValue Dividend = Op->getOperand(0); 14280 const SDValue Divisor = Op->getOperand(1); 14281 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 14282 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 14283 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 14284 14285 SDValue Values[2] = {Div, Rem}; 14286 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 14287 } 14288 14289 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 14290 VT.getSimpleVT().SimpleTy); 14291 SDValue InChain = DAG.getEntryNode(); 14292 14293 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 14294 DAG.getContext(), 14295 Subtarget); 14296 14297 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 14298 getPointerTy(DAG.getDataLayout())); 14299 14300 Type *RetTy = StructType::get(Ty, Ty); 14301 14302 if (Subtarget->isTargetWindows()) 14303 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 14304 14305 TargetLowering::CallLoweringInfo CLI(DAG); 14306 CLI.setDebugLoc(dl).setChain(InChain) 14307 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 14308 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 14309 14310 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 14311 return CallInfo.first; 14312 } 14313 14314 // Lowers REM using divmod helpers 14315 // see RTABI section 4.2/4.3 14316 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 14317 // Build return types (div and rem) 14318 std::vector<Type*> RetTyParams; 14319 Type *RetTyElement; 14320 14321 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 14322 default: llvm_unreachable("Unexpected request for libcall!"); 14323 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; 14324 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 14325 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 14326 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 14327 } 14328 14329 RetTyParams.push_back(RetTyElement); 14330 RetTyParams.push_back(RetTyElement); 14331 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 14332 Type *RetTy = StructType::get(*DAG.getContext(), ret); 14333 14334 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
14335 SimpleTy); 14336 SDValue InChain = DAG.getEntryNode(); 14337 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 14338 Subtarget); 14339 bool isSigned = N->getOpcode() == ISD::SREM; 14340 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 14341 getPointerTy(DAG.getDataLayout())); 14342 14343 if (Subtarget->isTargetWindows()) 14344 InChain = WinDBZCheckDenominator(DAG, N, InChain); 14345 14346 // Lower call 14347 CallLoweringInfo CLI(DAG); 14348 CLI.setChain(InChain) 14349 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 14350 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 14351 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 14352 14353 // Return second (rem) result operand (first contains div) 14354 SDNode *ResNode = CallResult.first.getNode(); 14355 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 14356 return ResNode->getOperand(1); 14357 } 14358 14359 SDValue 14360 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 14361 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 14362 SDLoc DL(Op); 14363 14364 // Get the inputs. 14365 SDValue Chain = Op.getOperand(0); 14366 SDValue Size = Op.getOperand(1); 14367 14368 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 14369 "no-stack-arg-probe")) { 14370 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 14371 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 14372 Chain = SP.getValue(1); 14373 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 14374 if (Align) 14375 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 14376 DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); 14377 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 14378 SDValue Ops[2] = { SP, Chain }; 14379 return DAG.getMergeValues(Ops, DL); 14380 } 14381 14382 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 14383 DAG.getConstant(2, DL, MVT::i32)); 14384 14385 SDValue Flag; 14386 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 14387 Flag = Chain.getValue(1); 14388 14389 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 14390 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 14391 14392 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 14393 Chain = NewSP.getValue(1); 14394 14395 SDValue Ops[2] = { NewSP, Chain }; 14396 return DAG.getMergeValues(Ops, DL); 14397 } 14398 14399 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 14400 assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && 14401 "Unexpected type for custom-lowering FP_EXTEND"); 14402 14403 RTLIB::Libcall LC; 14404 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 14405 14406 SDValue SrcVal = Op.getOperand(0); 14407 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 14408 SDLoc(Op)).first; 14409 } 14410 14411 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 14412 assert(Op.getOperand(0).getValueType() == MVT::f64 && 14413 Subtarget->isFPOnlySP() && 14414 "Unexpected type for custom-lowering FP_ROUND"); 14415 14416 RTLIB::Libcall LC; 14417 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 14418 14419 SDValue SrcVal = Op.getOperand(0); 14420 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 14421 SDLoc(Op)).first; 14422 } 14423 14424 void ARMTargetLowering::lowerABS(SDNode *N, 
SmallVectorImpl<SDValue> &Results, 14425 SelectionDAG &DAG) const { 14426 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 14427 MVT HalfT = MVT::i32; 14428 SDLoc dl(N); 14429 SDValue Hi, Lo, Tmp; 14430 14431 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 14432 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 14433 return ; 14434 14435 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 14436 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 14437 14438 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 14439 DAG.getConstant(0, dl, HalfT)); 14440 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 14441 DAG.getConstant(1, dl, HalfT)); 14442 14443 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 14444 DAG.getConstant(OpTypeBits - 1, dl, 14445 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 14446 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 14447 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 14448 SDValue(Lo.getNode(), 1)); 14449 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 14450 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 14451 14452 Results.push_back(Lo); 14453 Results.push_back(Hi); 14454 } 14455 14456 bool 14457 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 14458 // The ARM target isn't yet aware of offsets. 14459 return false; 14460 } 14461 14462 bool ARM::isBitFieldInvertedMask(unsigned v) { 14463 if (v == 0xffffffff) 14464 return false; 14465 14466 // there can be 1's on either or both "outsides", all the "inside" 14467 // bits must be 0's 14468 return isShiftedMask_32(~v); 14469 } 14470 14471 /// isFPImmLegal - Returns true if the target can instruction select the 14472 /// specified FP immediate natively. If false, the legalizer will 14473 /// materialize the FP immediate as a load from a constant pool. 14474 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 14475 bool ForCodeSize) const { 14476 if (!Subtarget->hasVFP3()) 14477 return false; 14478 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 14479 return ARM_AM::getFP16Imm(Imm) != -1; 14480 if (VT == MVT::f32) 14481 return ARM_AM::getFP32Imm(Imm) != -1; 14482 if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) 14483 return ARM_AM::getFP64Imm(Imm) != -1; 14484 return false; 14485 } 14486 14487 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 14488 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 14489 /// specified in the intrinsic calls. 14490 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 14491 const CallInst &I, 14492 MachineFunction &MF, 14493 unsigned Intrinsic) const { 14494 switch (Intrinsic) { 14495 case Intrinsic::arm_neon_vld1: 14496 case Intrinsic::arm_neon_vld2: 14497 case Intrinsic::arm_neon_vld3: 14498 case Intrinsic::arm_neon_vld4: 14499 case Intrinsic::arm_neon_vld2lane: 14500 case Intrinsic::arm_neon_vld3lane: 14501 case Intrinsic::arm_neon_vld4lane: 14502 case Intrinsic::arm_neon_vld2dup: 14503 case Intrinsic::arm_neon_vld3dup: 14504 case Intrinsic::arm_neon_vld4dup: { 14505 Info.opc = ISD::INTRINSIC_W_CHAIN; 14506 // Conservatively set memVT to the entire set of vectors loaded. 
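// For example, an arm.neon.vld2 returning { <4 x i32>, <4 x i32> } loads
// 256 bits in total, so memVT is recorded as v4i64 rather than as a single
// 128-bit vector type.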
14507 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14508 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 14509 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 14510 Info.ptrVal = I.getArgOperand(0); 14511 Info.offset = 0; 14512 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 14513 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 14514 // volatile loads with NEON intrinsics not supported 14515 Info.flags = MachineMemOperand::MOLoad; 14516 return true; 14517 } 14518 case Intrinsic::arm_neon_vld1x2: 14519 case Intrinsic::arm_neon_vld1x3: 14520 case Intrinsic::arm_neon_vld1x4: { 14521 Info.opc = ISD::INTRINSIC_W_CHAIN; 14522 // Conservatively set memVT to the entire set of vectors loaded. 14523 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14524 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 14525 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 14526 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 14527 Info.offset = 0; 14528 Info.align = 0; 14529 // volatile loads with NEON intrinsics not supported 14530 Info.flags = MachineMemOperand::MOLoad; 14531 return true; 14532 } 14533 case Intrinsic::arm_neon_vst1: 14534 case Intrinsic::arm_neon_vst2: 14535 case Intrinsic::arm_neon_vst3: 14536 case Intrinsic::arm_neon_vst4: 14537 case Intrinsic::arm_neon_vst2lane: 14538 case Intrinsic::arm_neon_vst3lane: 14539 case Intrinsic::arm_neon_vst4lane: { 14540 Info.opc = ISD::INTRINSIC_VOID; 14541 // Conservatively set memVT to the entire set of vectors stored. 14542 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14543 unsigned NumElts = 0; 14544 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 14545 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 14546 if (!ArgTy->isVectorTy()) 14547 break; 14548 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 14549 } 14550 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 14551 Info.ptrVal = I.getArgOperand(0); 14552 Info.offset = 0; 14553 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 14554 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 14555 // volatile stores with NEON intrinsics not supported 14556 Info.flags = MachineMemOperand::MOStore; 14557 return true; 14558 } 14559 case Intrinsic::arm_neon_vst1x2: 14560 case Intrinsic::arm_neon_vst1x3: 14561 case Intrinsic::arm_neon_vst1x4: { 14562 Info.opc = ISD::INTRINSIC_VOID; 14563 // Conservatively set memVT to the entire set of vectors stored. 
14564 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14565 unsigned NumElts = 0; 14566 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 14567 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 14568 if (!ArgTy->isVectorTy()) 14569 break; 14570 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 14571 } 14572 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 14573 Info.ptrVal = I.getArgOperand(0); 14574 Info.offset = 0; 14575 Info.align = 0; 14576 // volatile stores with NEON intrinsics not supported 14577 Info.flags = MachineMemOperand::MOStore; 14578 return true; 14579 } 14580 case Intrinsic::arm_ldaex: 14581 case Intrinsic::arm_ldrex: { 14582 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14583 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 14584 Info.opc = ISD::INTRINSIC_W_CHAIN; 14585 Info.memVT = MVT::getVT(PtrTy->getElementType()); 14586 Info.ptrVal = I.getArgOperand(0); 14587 Info.offset = 0; 14588 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 14589 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 14590 return true; 14591 } 14592 case Intrinsic::arm_stlex: 14593 case Intrinsic::arm_strex: { 14594 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 14595 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 14596 Info.opc = ISD::INTRINSIC_W_CHAIN; 14597 Info.memVT = MVT::getVT(PtrTy->getElementType()); 14598 Info.ptrVal = I.getArgOperand(1); 14599 Info.offset = 0; 14600 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 14601 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 14602 return true; 14603 } 14604 case Intrinsic::arm_stlexd: 14605 case Intrinsic::arm_strexd: 14606 Info.opc = ISD::INTRINSIC_W_CHAIN; 14607 Info.memVT = MVT::i64; 14608 Info.ptrVal = I.getArgOperand(2); 14609 Info.offset = 0; 14610 Info.align = 8; 14611 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 14612 return true; 14613 14614 case Intrinsic::arm_ldaexd: 14615 case Intrinsic::arm_ldrexd: 14616 Info.opc = ISD::INTRINSIC_W_CHAIN; 14617 Info.memVT = MVT::i64; 14618 Info.ptrVal = I.getArgOperand(0); 14619 Info.offset = 0; 14620 Info.align = 8; 14621 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 14622 return true; 14623 14624 default: 14625 break; 14626 } 14627 14628 return false; 14629 } 14630 14631 /// Returns true if it is beneficial to convert a load of a constant 14632 /// to just the constant itself. 14633 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 14634 Type *Ty) const { 14635 assert(Ty->isIntegerTy()); 14636 14637 unsigned Bits = Ty->getPrimitiveSizeInBits(); 14638 if (Bits == 0 || Bits > 32) 14639 return false; 14640 return true; 14641 } 14642 14643 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 14644 unsigned Index) const { 14645 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 14646 return false; 14647 14648 return (Index == 0 || Index == ResVT.getVectorNumElements()); 14649 } 14650 14651 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 14652 ARM_MB::MemBOpt Domain) const { 14653 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 14654 14655 // First, if the target has no DMB, see what fallback we can use. 14656 if (!Subtarget->hasDataBarrier()) { 14657 // Some ARMv6 cpus can support data barriers with an mcr instruction. 
14658 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 14659 // here. 14660 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 14661 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 14662 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 14663 Builder.getInt32(0), Builder.getInt32(7), 14664 Builder.getInt32(10), Builder.getInt32(5)}; 14665 return Builder.CreateCall(MCR, args); 14666 } else { 14667 // Instead of using barriers, atomic accesses on these subtargets use 14668 // libcalls. 14669 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 14670 } 14671 } else { 14672 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 14673 // Only a full system barrier exists in the M-class architectures. 14674 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; 14675 Constant *CDomain = Builder.getInt32(Domain); 14676 return Builder.CreateCall(DMB, CDomain); 14677 } 14678 } 14679 14680 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 14681 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 14682 Instruction *Inst, 14683 AtomicOrdering Ord) const { 14684 switch (Ord) { 14685 case AtomicOrdering::NotAtomic: 14686 case AtomicOrdering::Unordered: 14687 llvm_unreachable("Invalid fence: unordered/non-atomic"); 14688 case AtomicOrdering::Monotonic: 14689 case AtomicOrdering::Acquire: 14690 return nullptr; // Nothing to do 14691 case AtomicOrdering::SequentiallyConsistent: 14692 if (!Inst->hasAtomicStore()) 14693 return nullptr; // Nothing to do 14694 LLVM_FALLTHROUGH; 14695 case AtomicOrdering::Release: 14696 case AtomicOrdering::AcquireRelease: 14697 if (Subtarget->preferISHSTBarriers()) 14698 return makeDMB(Builder, ARM_MB::ISHST); 14699 // FIXME: add a comment with a link to documentation justifying this. 14700 else 14701 return makeDMB(Builder, ARM_MB::ISH); 14702 } 14703 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 14704 } 14705 14706 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 14707 Instruction *Inst, 14708 AtomicOrdering Ord) const { 14709 switch (Ord) { 14710 case AtomicOrdering::NotAtomic: 14711 case AtomicOrdering::Unordered: 14712 llvm_unreachable("Invalid fence: unordered/not-atomic"); 14713 case AtomicOrdering::Monotonic: 14714 case AtomicOrdering::Release: 14715 return nullptr; // Nothing to do 14716 case AtomicOrdering::Acquire: 14717 case AtomicOrdering::AcquireRelease: 14718 case AtomicOrdering::SequentiallyConsistent: 14719 return makeDMB(Builder, ARM_MB::ISH); 14720 } 14721 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 14722 } 14723 14724 // Loads and stores less than 64-bits are already atomic; ones above that 14725 // are doomed anyway, so defer to the default libcall and blame the OS when 14726 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 14727 // anything for those. 14728 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 14729 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 14730 return (Size == 64) && !Subtarget->isMClass(); 14731 } 14732 14733 // Loads and stores less than 64-bits are already atomic; ones above that 14734 // are doomed anyway, so defer to the default libcall and blame the OS when 14735 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 14736 // anything for those. 14737 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that 14738 // guarantee, see DDI0406C ARM architecture reference manual, 14739 // sections A8.8.72-74 LDRD) 14740 TargetLowering::AtomicExpansionKind 14741 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 14742 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 14743 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly 14744 : AtomicExpansionKind::None; 14745 } 14746 14747 // For the real atomic operations, we have ldrex/strex up to 32 bits, 14748 // and up to 64 bits on the non-M profiles 14749 TargetLowering::AtomicExpansionKind 14750 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 14751 if (AI->isFloatingPointOperation()) 14752 return AtomicExpansionKind::CmpXChg; 14753 14754 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 14755 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 14756 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) 14757 ? AtomicExpansionKind::LLSC 14758 : AtomicExpansionKind::None; 14759 } 14760 14761 TargetLowering::AtomicExpansionKind 14762 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { 14763 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 14764 // implement cmpxchg without spilling. If the address being exchanged is also 14765 // on the stack and close enough to the spill slot, this can lead to a 14766 // situation where the monitor always gets cleared and the atomic operation 14767 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 14768 bool HasAtomicCmpXchg = 14769 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 14770 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg) 14771 return AtomicExpansionKind::LLSC; 14772 return AtomicExpansionKind::None; 14773 } 14774 14775 bool ARMTargetLowering::shouldInsertFencesForAtomic( 14776 const Instruction *I) const { 14777 return InsertFencesForAtomic; 14778 } 14779 14780 // This has so far only been implemented for MachO. 14781 bool ARMTargetLowering::useLoadStackGuardNode() const { 14782 return Subtarget->isTargetMachO(); 14783 } 14784 14785 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 14786 unsigned &Cost) const { 14787 // If we do not have NEON, vector types are not natively supported. 14788 if (!Subtarget->hasNEON()) 14789 return false; 14790 14791 // Floating point values and vector values map to the same register file. 14792 // Therefore, although we could do a store extract of a vector type, this is 14793 // better to leave at float as we have more freedom in the addressing mode for 14794 // those. 14795 if (VectorTy->isFPOrFPVectorTy()) 14796 return false; 14797 14798 // If the index is unknown at compile time, this is very expensive to lower 14799 // and it is not possible to combine the store with the extract. 14800 if (!isa<ConstantInt>(Idx)) 14801 return false; 14802 14803 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 14804 unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); 14805 // We can do a store + vector extract on any vector that fits perfectly in a D 14806 // or Q register. 
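// For example, storing lane 1 extracted from a <4 x i32> value can be
// selected as a single lane store (VST1LN), so the combine is reported as
// free (Cost == 0) for 64- and 128-bit vectors below.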

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}
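
// Illustrative sketch (not part of the original source) of the IR the two
// helpers above produce for a 64-bit exclusive access on a little-endian
// A-class core:
//   %lohi = call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
//   %lo = extractvalue { i32, i32 } %lohi, 0
//   %hi = extractvalue { i32, i32 } %lohi, 1
//   ... zext/shl/or the two halves into an i64 ...
//   %failed = call i32 @llvm.arm.strexd(i32 %lo, i32 %hi, i8* %addr)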

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

bool ARMTargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (VecTy->getElementType()->isHalfTy())
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;

  // Ensure the total vector size is 64 bits or a multiple of 128 bits. Types
  // larger than 128 bits will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON, and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector size is divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector cannot be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
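  // (Illustrative note, not part of the original comment: on a 32-bit target,
  // a <4 x i8*> element type becomes <4 x i32> here and is converted back with
  // inttoptr once the vldN results have been extracted below.)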
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Type *Tys[] = {VecTy, Int8Ptr};
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr =
          Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
                                     VecTy->getVectorNumElements() * Factor);

    SmallVector<Value *, 2> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));

    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
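
// Illustrative note (not part of the original source): for a Factor == 2 load
// of <16 x i32>, each shufflevector result is <8 x i32> (256 bits), so
// NumLoads is 2 and the code above emits two vld2 calls on <4 x i32> halves,
// the second one at an offset of 4 * 2 = 8 elements from the base address.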

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON, and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector size is divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Type *Tys[] = {Int8Ptr, SubVecTy};
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 6> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

    Function *VstNFunc =
        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative; this is checked in
        // isReInterleaveMask.
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    Ops.push_back(Builder.getInt32(SI->getAlignment()));
    Builder.CreateCall(VstNFunc, Ops);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
unsigned
ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                 DataLayout DL) const {
  if (!ArgTy->isVectorTy())
    return DL.getABITypeAlignment(ArgTy);

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
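///
/// Illustrative examples (not part of the original comment):
///   struct S { float x, y, z; };  // HA of three floats; under AAPCS-VFP it
///                                 // may be passed in s0-s2 when it is the
///                                 // first VFP argument.
///   [4 x i32]                     // integer array that must occupy
///                                 // consecutive core registers (e.g. r0-r3).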
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}

unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create a copy from the CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; that is
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}