1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that ARM uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARMBaseInstrInfo.h"
16 #include "ARMBaseRegisterInfo.h"
17 #include "ARMCallingConv.h"
18 #include "ARMConstantPoolValue.h"
19 #include "ARMISelLowering.h"
20 #include "ARMMachineFunctionInfo.h"
21 #include "ARMPerfectShuffle.h"
22 #include "ARMRegisterInfo.h"
23 #include "ARMSelectionDAGInfo.h"
24 #include "ARMSubtarget.h"
25 #include "MCTargetDesc/ARMAddressingModes.h"
26 #include "MCTargetDesc/ARMBaseInfo.h"
27 #include "llvm/ADT/APFloat.h"
28 #include "llvm/ADT/APInt.h"
29 #include "llvm/ADT/ArrayRef.h"
30 #include "llvm/ADT/BitVector.h"
31 #include "llvm/ADT/DenseMap.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/STLExtras.h"
36 #include "llvm/ADT/StringExtras.h"
37 #include "llvm/ADT/StringSwitch.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/ADT/Triple.h"
40 #include "llvm/ADT/Twine.h"
41 #include "llvm/Analysis/VectorUtils.h"
42 #include "llvm/CodeGen/CallingConvLower.h"
43 #include "llvm/CodeGen/ISDOpcodes.h"
44 #include "llvm/CodeGen/IntrinsicLowering.h"
45 #include "llvm/CodeGen/MachineBasicBlock.h"
46 #include "llvm/CodeGen/MachineConstantPool.h"
47 #include "llvm/CodeGen/MachineFrameInfo.h"
48 #include "llvm/CodeGen/MachineFunction.h"
49 #include "llvm/CodeGen/MachineInstr.h"
50 #include "llvm/CodeGen/MachineInstrBuilder.h"
51 #include "llvm/CodeGen/MachineJumpTableInfo.h"
52 #include "llvm/CodeGen/MachineMemOperand.h"
53 #include "llvm/CodeGen/MachineOperand.h"
54 #include "llvm/CodeGen/MachineRegisterInfo.h"
55 #include "llvm/CodeGen/MachineValueType.h"
56 #include "llvm/CodeGen/RuntimeLibcalls.h"
57 #include "llvm/CodeGen/SelectionDAG.h"
58 #include "llvm/CodeGen/SelectionDAGNodes.h"
59 #include "llvm/CodeGen/ValueTypes.h"
60 #include "llvm/IR/Attributes.h"
61 #include "llvm/IR/CallingConv.h"
62 #include "llvm/IR/Constant.h"
63 #include "llvm/IR/Constants.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/DataLayout.h"
66 #include "llvm/IR/DebugLoc.h"
67 #include "llvm/IR/DerivedTypes.h"
68 #include "llvm/IR/Function.h"
69 #include "llvm/IR/GlobalAlias.h"
70 #include "llvm/IR/GlobalValue.h"
71 #include "llvm/IR/GlobalVariable.h"
72 #include "llvm/IR/IRBuilder.h"
73 #include "llvm/IR/InlineAsm.h"
74 #include "llvm/IR/Instruction.h"
75 #include "llvm/IR/Instructions.h"
76 #include "llvm/IR/IntrinsicInst.h"
77 #include "llvm/IR/Intrinsics.h"
78 #include "llvm/IR/Module.h"
79 #include "llvm/IR/Type.h"
80 #include "llvm/IR/User.h"
81 #include "llvm/IR/Value.h"
82 #include "llvm/MC/MCInstrDesc.h"
83 #include "llvm/MC/MCInstrItineraries.h"
84 #include "llvm/MC/MCRegisterInfo.h"
85 #include "llvm/MC/MCSchedule.h"
86 #include "llvm/Support/AtomicOrdering.h"
87 #include "llvm/Support/BranchProbability.h"
88 #include "llvm/Support/Casting.h"
89 #include "llvm/Support/CodeGen.h"
90 #include "llvm/Support/CommandLine.h"
91 #include "llvm/Support/Compiler.h"
92 #include "llvm/Support/Debug.h"
93 #include "llvm/Support/ErrorHandling.h"
94 #include "llvm/Support/MathExtras.h"
95 #include "llvm/Support/raw_ostream.h"
96 #include "llvm/Target/TargetInstrInfo.h"
97 #include "llvm/Target/TargetMachine.h"
98 #include "llvm/Target/TargetOptions.h"
99 #include <algorithm>
100 #include <cassert>
101 #include <cstdint>
102 #include <cstdlib>
103 #include <iterator>
104 #include <limits>
105 #include <tuple>
106 #include <string>
107 #include <utility>
108 #include <vector>
109 
110 using namespace llvm;
111 
112 #define DEBUG_TYPE "arm-isel"
113 
114 STATISTIC(NumTailCalls, "Number of tail calls");
115 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
116 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
117 STATISTIC(NumConstpoolPromoted,
118   "Number of constants with their storage promoted into constant pools");
119 
120 static cl::opt<bool>
121 ARMInterworking("arm-interworking", cl::Hidden,
122   cl::desc("Enable / disable ARM interworking (for debugging only)"),
123   cl::init(true));
124 
125 static cl::opt<bool> EnableConstpoolPromotion(
126     "arm-promote-constant", cl::Hidden,
127     cl::desc("Enable / disable promotion of unnamed_addr constants into "
128              "constant pools"),
129     cl::init(true));
130 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
131     "arm-promote-constant-max-size", cl::Hidden,
132     cl::desc("Maximum size of constant to promote into a constant pool"),
133     cl::init(64));
134 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
135     "arm-promote-constant-max-total", cl::Hidden,
136     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
137     cl::init(128));
138 
139 namespace {
140 
141   class ARMCCState : public CCState {
142   public:
143     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
144                SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
145                ParmContext PC)
146         : CCState(CC, isVarArg, MF, locs, C) {
147       assert(((PC == Call) || (PC == Prologue)) &&
148              "ARMCCState users must specify whether their context is call "
149              "or prologue generation.");
150       CallOrPrologue = PC;
151     }
152   };
153 
154 } // end anonymous namespace
155 
156 // The APCS parameter registers.
157 static const MCPhysReg GPRArgRegs[] = {
158   ARM::R0, ARM::R1, ARM::R2, ARM::R3
159 };
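// For illustration (a sketch of the APCS/AAPCS rules, not taken from this
// file): the first four word-sized arguments travel in r0-r3 and the rest
// spill to the stack, so a call such as
//   %r = call i32 @f(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e)
// passes %a..%d in r0-r3 and %e at the bottom of the outgoing stack area.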
160 
161 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
162                                        MVT PromotedBitwiseVT) {
163   if (VT != PromotedLdStVT) {
164     setOperationAction(ISD::LOAD, VT, Promote);
165     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
166 
167     setOperationAction(ISD::STORE, VT, Promote);
168     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
169   }
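  // A sketch of what the Promote entries above mean in practice (assumed
  // legalizer behaviour, for illustration only): with PromotedLdStVT == f64,
  // a load such as
  //   %v = load <8 x i8>, <8 x i8>* %p
  // is performed as an f64 load of the same 64 bits and bitcast back to
  // v8i8, so all D-register sized vectors share one load/store pattern.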
170 
171   MVT ElemTy = VT.getVectorElementType();
172   if (ElemTy != MVT::f64)
173     setOperationAction(ISD::SETCC, VT, Custom);
174   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
175   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
176   if (ElemTy == MVT::i32) {
177     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
178     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
179     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
180     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
181   } else {
182     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
183     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
184     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
185     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
186   }
187   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
188   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
189   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
190   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
191   setOperationAction(ISD::SELECT,            VT, Expand);
192   setOperationAction(ISD::SELECT_CC,         VT, Expand);
193   setOperationAction(ISD::VSELECT,           VT, Expand);
194   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
195   if (VT.isInteger()) {
196     setOperationAction(ISD::SHL, VT, Custom);
197     setOperationAction(ISD::SRA, VT, Custom);
198     setOperationAction(ISD::SRL, VT, Custom);
199   }
200 
201   // Promote all bit-wise operations.
202   if (VT.isInteger() && VT != PromotedBitwiseVT) {
203     setOperationAction(ISD::AND, VT, Promote);
204     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
205     setOperationAction(ISD::OR,  VT, Promote);
206     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
207     setOperationAction(ISD::XOR, VT, Promote);
208     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
209   }
210 
211   // Neon does not support vector divide/remainder operations.
212   setOperationAction(ISD::SDIV, VT, Expand);
213   setOperationAction(ISD::UDIV, VT, Expand);
214   setOperationAction(ISD::FDIV, VT, Expand);
215   setOperationAction(ISD::SREM, VT, Expand);
216   setOperationAction(ISD::UREM, VT, Expand);
217   setOperationAction(ISD::FREM, VT, Expand);
218 
219   if (!VT.isFloatingPoint() &&
220       VT != MVT::v2i64 && VT != MVT::v1i64)
221     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
222       setOperationAction(Opcode, VT, Legal);
223 }
224 
225 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
226   addRegisterClass(VT, &ARM::DPRRegClass);
227   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
228 }
229 
230 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
231   addRegisterClass(VT, &ARM::DPairRegClass);
232   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
233 }
234 
235 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
236                                      const ARMSubtarget &STI)
237     : TargetLowering(TM), Subtarget(&STI) {
238   RegInfo = Subtarget->getRegisterInfo();
239   Itins = Subtarget->getInstrItineraryData();
240 
241   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
242 
243   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
244       !Subtarget->isTargetWatchOS()) {
245     const auto &E = Subtarget->getTargetTriple().getEnvironment();
246 
247     bool IsHFTarget = E == Triple::EABIHF || E == Triple::GNUEABIHF ||
248                       E == Triple::MuslEABIHF;
249     // Windows is a special case.  Technically, we will replace all of the "GNU"
250     // calls with calls to MSVCRT if appropriate and adjust the calling
251     // convention then.
252     IsHFTarget = IsHFTarget || Subtarget->isTargetWindows();
253 
254     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
255       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
256                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
257                                        : CallingConv::ARM_AAPCS);
258   }
259 
260   if (Subtarget->isTargetMachO()) {
261     // Uses VFP for Thumb libfuncs if available.
262     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
263         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
264       static const struct {
265         const RTLIB::Libcall Op;
266         const char * const Name;
267         const ISD::CondCode Cond;
268       } LibraryCalls[] = {
269         // Single-precision floating-point arithmetic.
270         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
271         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
272         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
273         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
274 
275         // Double-precision floating-point arithmetic.
276         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
277         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
278         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
279         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
280 
281         // Single-precision comparisons.
282         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
283         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
284         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
285         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
286         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
287         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
288         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
289         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
290 
291         // Double-precision comparisons.
292         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
293         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
294         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
295         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
296         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
297         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
298         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
299         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
300 
301         // Floating-point to integer conversions.
302         // i64 conversions are done via library routines even when generating VFP
303         // instructions, so use the same ones.
304         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
305         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
306         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
307         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
308 
309         // Conversions between floating types.
310         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
311         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
312 
313         // Integer to floating-point conversions.
314         // i64 conversions are done via library routines even when generating VFP
315         // instructions, so use the same ones.
316         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
317         // e.g., __floatunsidf vs. __floatunssidfvfp.
318         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
319         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
320         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
321         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
322       };
323 
324       for (const auto &LC : LibraryCalls) {
325         setLibcallName(LC.Op, LC.Name);
326         if (LC.Cond != ISD::SETCC_INVALID)
327           setCmpLibcallCC(LC.Op, LC.Cond);
328       }
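      // How the Cond column above is used (a sketch of the usual soft-float
      // compare lowering, not specific to this file): the helper returns an
      // i32, and the final i1 comes from comparing that value against zero
      // with the listed condition, e.g. for an ordered-equal f32 compare
      //   r0 = __eqsf2vfp(a, b);  result = (r0 != 0);
      // which is why RTLIB::OEQ_F32 is paired with ISD::SETNE.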
329     }
330 
331     // Set the correct calling convention for ARMv7k WatchOS. It's just
332     // AAPCS_VFP for functions as simple as libcalls.
333     if (Subtarget->isTargetWatchABI()) {
334       for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
335         setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
336     }
337   }
338 
339   // These libcalls are not available in 32-bit.
340   setLibcallName(RTLIB::SHL_I128, nullptr);
341   setLibcallName(RTLIB::SRL_I128, nullptr);
342   setLibcallName(RTLIB::SRA_I128, nullptr);
343 
344   // RTLIB
345   if (Subtarget->isAAPCS_ABI() &&
346       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
347        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
348     static const struct {
349       const RTLIB::Libcall Op;
350       const char * const Name;
351       const CallingConv::ID CC;
352       const ISD::CondCode Cond;
353     } LibraryCalls[] = {
354       // Double-precision floating-point arithmetic helper functions
355       // RTABI chapter 4.1.2, Table 2
356       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
357       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
358       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
359       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
360 
361       // Double-precision floating-point comparison helper functions
362       // RTABI chapter 4.1.2, Table 3
363       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
364       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
365       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
366       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
367       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
368       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
369       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
370       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
371 
372       // Single-precision floating-point arithmetic helper functions
373       // RTABI chapter 4.1.2, Table 4
374       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
375       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
376       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
377       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
378 
379       // Single-precision floating-point comparison helper functions
380       // RTABI chapter 4.1.2, Table 5
381       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
382       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
383       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
384       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
385       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
386       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
387       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
388       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
389 
390       // Floating-point to integer conversions.
391       // RTABI chapter 4.1.2, Table 6
392       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
393       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
394       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
395       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
396       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
397       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
398       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
399       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
400 
401       // Conversions between floating types.
402       // RTABI chapter 4.1.2, Table 7
403       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
404       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
405       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
406 
407       // Integer to floating-point conversions.
408       // RTABI chapter 4.1.2, Table 8
409       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
410       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
411       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
412       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
413       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
414       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
415       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
416       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
417 
418       // Long long helper functions
419       // RTABI chapter 4.2, Table 9
420       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
421       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
422       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
423       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
424 
425       // Integer division functions
426       // RTABI chapter 4.3.1
427       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
428       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
429       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
430       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
431       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
432       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
433       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
434       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
435     };
436 
437     for (const auto &LC : LibraryCalls) {
438       setLibcallName(LC.Op, LC.Name);
439       setLibcallCallingConv(LC.Op, LC.CC);
440       if (LC.Cond != ISD::SETCC_INVALID)
441         setCmpLibcallCC(LC.Op, LC.Cond);
442     }
443 
444     // EABI dependent RTLIB
445     if (TM.Options.EABIVersion == EABI::EABI4 ||
446         TM.Options.EABIVersion == EABI::EABI5) {
447       static const struct {
448         const RTLIB::Libcall Op;
449         const char *const Name;
450         const CallingConv::ID CC;
451         const ISD::CondCode Cond;
452       } MemOpsLibraryCalls[] = {
453         // Memory operations
454         // RTABI chapter 4.3.4
455         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
456         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
457         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
458       };
459 
460       for (const auto &LC : MemOpsLibraryCalls) {
461         setLibcallName(LC.Op, LC.Name);
462         setLibcallCallingConv(LC.Op, LC.CC);
463         if (LC.Cond != ISD::SETCC_INVALID)
464           setCmpLibcallCC(LC.Op, LC.Cond);
465       }
466     }
467   }
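  // For example (a hand-written sketch of the soft-float AEABI convention,
  // not compiler output): with the table above in effect,
  //   %s = fadd double %a, %b
  // becomes a call to __aeabi_dadd with %a in r0:r1 and %b in r2:r3, and the
  // result comes back in r0:r1.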
468 
469   if (Subtarget->isTargetWindows()) {
470     static const struct {
471       const RTLIB::Libcall Op;
472       const char * const Name;
473       const CallingConv::ID CC;
474     } LibraryCalls[] = {
475       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
476       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
477       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
478       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
479       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
480       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
481       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
482       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
483     };
484 
485     for (const auto &LC : LibraryCalls) {
486       setLibcallName(LC.Op, LC.Name);
487       setLibcallCallingConv(LC.Op, LC.CC);
488     }
489   }
490 
491   // Use divmod compiler-rt calls for iOS 5.0 and later.
492   if (Subtarget->isTargetWatchOS() ||
493       (Subtarget->isTargetIOS() &&
494        !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
495     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
496     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
497   }
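  // A sketch of the assumed compiler-rt interface (check compiler-rt before
  // relying on the exact signature):
  //   si_int __divmodsi4(si_int a, si_int b, si_int *rem);
  // returns the quotient and stores the remainder through 'rem', so one call
  // can serve both an sdiv and an srem of the same operands.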
498 
499   // The half <-> float conversion functions are always soft-float on
500   // non-watchOS platforms, but are needed for some targets which use a
501   // hard-float calling convention by default.
502   if (!Subtarget->isTargetWatchABI()) {
503     if (Subtarget->isAAPCS_ABI()) {
504       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
505       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
506       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
507     } else {
508       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
509       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
510       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
511     }
512   }
513 
514   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
515   // a __gnu_ prefix (which is the default).
516   if (Subtarget->isTargetAEABI()) {
517     static const struct {
518       const RTLIB::Libcall Op;
519       const char * const Name;
520       const CallingConv::ID CC;
521     } LibraryCalls[] = {
522       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
523       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
524       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
525     };
526 
527     for (const auto &LC : LibraryCalls) {
528       setLibcallName(LC.Op, LC.Name);
529       setLibcallCallingConv(LC.Op, LC.CC);
530     }
531   }
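  // For illustration (assumed lowering, matching the __aeabi_f2h/__aeabi_h2f
  // names above): a truncation such as
  //   %h = fptrunc float %f to half
  // becomes a call returning the raw 16 half bits in the low half of r0, and
  // the corresponding extension goes through __aeabi_h2f.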
532 
533   if (Subtarget->isThumb1Only())
534     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
535   else
536     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
537 
538   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
539       !Subtarget->isThumb1Only()) {
540     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
541     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
542   }
543 
544   for (MVT VT : MVT::vector_valuetypes()) {
545     for (MVT InnerVT : MVT::vector_valuetypes()) {
546       setTruncStoreAction(VT, InnerVT, Expand);
547       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
548       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
549       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
550     }
551 
552     setOperationAction(ISD::MULHS, VT, Expand);
553     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
554     setOperationAction(ISD::MULHU, VT, Expand);
555     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
556 
557     setOperationAction(ISD::BSWAP, VT, Expand);
558   }
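  // In effect the loop above says that, by default, a truncating vector store
  // (e.g. storing a <4 x i32> value as <4 x i16>) or an extending vector load
  // is not directly selectable; legalization must split or scalarize such
  // accesses unless a later, more specific entry (such as the NEON extload
  // table below) re-legalizes them.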
559 
560   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
561   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
562 
563   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
564   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
565 
566   if (Subtarget->hasNEON()) {
567     addDRTypeForNEON(MVT::v2f32);
568     addDRTypeForNEON(MVT::v8i8);
569     addDRTypeForNEON(MVT::v4i16);
570     addDRTypeForNEON(MVT::v2i32);
571     addDRTypeForNEON(MVT::v1i64);
572 
573     addQRTypeForNEON(MVT::v4f32);
574     addQRTypeForNEON(MVT::v2f64);
575     addQRTypeForNEON(MVT::v16i8);
576     addQRTypeForNEON(MVT::v8i16);
577     addQRTypeForNEON(MVT::v4i32);
578     addQRTypeForNEON(MVT::v2i64);
579 
580     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
581     // neither Neon nor VFP support any arithmetic operations on it.
582     // Most of the same expansions below also apply to v4f32, but note that
583     // vadd, vsub and vmul are natively supported for v4f32.
584     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
585     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
586     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
587     // FIXME: Code duplication: FDIV and FREM are expanded always, see
588     // ARMTargetLowering::addTypeForNEON method for details.
589     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
590     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
591     // FIXME: Create unittest.
592     // In other words, find a case where "copysign" appears in the DAG with
593     // vector operands.
594     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
595     // FIXME: Code duplication: SETCC has custom operation action, see
596     // ARMTargetLowering::addTypeForNEON method for details.
597     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
598     // FIXME: Create unittest for FNEG and for FABS.
599     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
600     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
601     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
602     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
603     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
604     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
605     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
606     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
607     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
608     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
609     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
610     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
611     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
612     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
613     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
614     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
615     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
616     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
617     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
618 
619     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
620     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
621     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
622     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
623     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
624     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
625     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
626     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
627     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
628     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
629     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
630     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
631     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
632     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
633     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
634 
635     // Mark v2f32 intrinsics.
636     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
637     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
638     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
639     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
640     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
641     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
642     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
643     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
644     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
645     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
646     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
647     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
648     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
649     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
650     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
651 
652     // Neon does not support some operations on v1i64 and v2i64 types.
653     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
654     // Custom handling for some quad-vector types to detect VMULL.
655     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
656     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
657     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
658     // Custom handling for some vector types to avoid expensive expansions
659     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
660     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
661     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
662     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
663     // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
664     // a destination type that is wider than the source, nor does it have
665     // a FP_TO_[SU]INT instruction with a destination narrower than the
666     // source.
667     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
668     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
669     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
670     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
671 
672     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
673     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
674 
675     // NEON does not have single instruction CTPOP for vectors with element
676     // types wider than 8-bits.  However, custom lowering can leverage the
677     // v8i8/v16i8 vcnt instruction.
678     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
679     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
680     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
681     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
682     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
683     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
684 
685     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
686     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
687 
688     // NEON does not have single instruction CTTZ for vectors.
689     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
690     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
691     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
692     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
693 
694     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
695     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
696     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
697     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
698 
699     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
700     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
701     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
702     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
703 
704     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
705     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
706     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
707     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
708 
709     // NEON only has FMA instructions as of VFP4.
710     if (!Subtarget->hasVFP4()) {
711       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
712       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
713     }
714 
715     setTargetDAGCombine(ISD::INTRINSIC_VOID);
716     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
717     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
718     setTargetDAGCombine(ISD::SHL);
719     setTargetDAGCombine(ISD::SRL);
720     setTargetDAGCombine(ISD::SRA);
721     setTargetDAGCombine(ISD::SIGN_EXTEND);
722     setTargetDAGCombine(ISD::ZERO_EXTEND);
723     setTargetDAGCombine(ISD::ANY_EXTEND);
724     setTargetDAGCombine(ISD::BUILD_VECTOR);
725     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
726     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
727     setTargetDAGCombine(ISD::STORE);
728     setTargetDAGCombine(ISD::FP_TO_SINT);
729     setTargetDAGCombine(ISD::FP_TO_UINT);
730     setTargetDAGCombine(ISD::FDIV);
731     setTargetDAGCombine(ISD::LOAD);
732 
733     // It is legal to extload from v4i8 to v4i16 or v4i32.
734     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
735                    MVT::v2i32}) {
736       for (MVT VT : MVT::integer_vector_valuetypes()) {
737         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
738         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
739         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
740       }
741     }
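    // A sketch of why these are legal (assumed NEON selection, for
    // illustration): a zero-extending load such as
    //   %w = load <4 x i8>, <4 x i8>* %p
    //   %x = zext <4 x i8> %w to <4 x i32>
    // can be matched to a 32-bit vld1 followed by vmovl.u8/vmovl.u16
    // lengthening moves, so no per-element expansion is needed.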
742   }
743 
744   // ARM and Thumb2 support UMLAL/SMLAL.
745   if (!Subtarget->isThumb1Only())
746     setTargetDAGCombine(ISD::ADDC);
747 
748   if (Subtarget->isFPOnlySP()) {
749     // When targeting a floating-point unit with only single-precision
750     // operations, f64 is legal for the few double-precision instructions
751     // that are present. However, no double-precision operations other than
752     // moves, loads and stores are provided by the hardware.
753     setOperationAction(ISD::FADD,       MVT::f64, Expand);
754     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
755     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
756     setOperationAction(ISD::FMA,        MVT::f64, Expand);
757     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
758     setOperationAction(ISD::FREM,       MVT::f64, Expand);
759     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
760     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
761     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
762     setOperationAction(ISD::FABS,       MVT::f64, Expand);
763     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
764     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
765     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
766     setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
767     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
768     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
769     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
770     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
771     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
772     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
773     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
774     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
775     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
776     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
777     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
778     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
779     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
780     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
781     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
782     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
783     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
784     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
785     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
786   }
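  // For example (a sketch under the single-precision-only assumption above):
  //   %d = fadd double %x, %y
  // cannot use the FPU and is expanded to a libcall (__aeabi_dadd or __adddf3
  // depending on the ABI), while plain f64 loads, stores and register moves
  // stay legal.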
787 
788   computeRegisterProperties(Subtarget->getRegisterInfo());
789 
790   // ARM does not have floating-point extending loads.
791   for (MVT VT : MVT::fp_valuetypes()) {
792     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
793     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
794   }
795 
796   // ... or truncating stores
797   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
798   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
799   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
800 
801   // ARM does not have i1 sign extending load.
802   for (MVT VT : MVT::integer_valuetypes())
803     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
804 
805   // ARM supports all 4 flavors of integer indexed load / store.
806   if (!Subtarget->isThumb1Only()) {
807     for (unsigned im = (unsigned)ISD::PRE_INC;
808          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
809       setIndexedLoadAction(im,  MVT::i1,  Legal);
810       setIndexedLoadAction(im,  MVT::i8,  Legal);
811       setIndexedLoadAction(im,  MVT::i16, Legal);
812       setIndexedLoadAction(im,  MVT::i32, Legal);
813       setIndexedStoreAction(im, MVT::i1,  Legal);
814       setIndexedStoreAction(im, MVT::i8,  Legal);
815       setIndexedStoreAction(im, MVT::i16, Legal);
816       setIndexedStoreAction(im, MVT::i32, Legal);
817     }
818   } else {
819     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
820     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
821     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
822   }
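  // For illustration (typical selection, not verified output): an indexed
  // i32 load folds the address update into the memory access, e.g.
  //   ldr r0, [r1, #4]!   @ pre-indexed: load from r1+4, write back r1
  //   ldr r0, [r1], #4    @ post-indexed: load from r1, then r1 += 4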
823 
824   setOperationAction(ISD::SADDO, MVT::i32, Custom);
825   setOperationAction(ISD::UADDO, MVT::i32, Custom);
826   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
827   setOperationAction(ISD::USUBO, MVT::i32, Custom);
828 
829   // i64 operation support.
830   setOperationAction(ISD::MUL,     MVT::i64, Expand);
831   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
832   if (Subtarget->isThumb1Only()) {
833     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
834     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
835   }
836   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
837       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
838     setOperationAction(ISD::MULHS, MVT::i32, Expand);
839 
840   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
841   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
842   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
843   setOperationAction(ISD::SRL,       MVT::i64, Custom);
844   setOperationAction(ISD::SRA,       MVT::i64, Custom);
845 
846   if (!Subtarget->isThumb1Only()) {
847     // FIXME: We should do this for Thumb1 as well.
848     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
849     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
850     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
851     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
852   }
853 
854   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
855     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
856 
857   // ARM does not have ROTL.
858   setOperationAction(ISD::ROTL, MVT::i32, Expand);
859   for (MVT VT : MVT::vector_valuetypes()) {
860     setOperationAction(ISD::ROTL, VT, Expand);
861     setOperationAction(ISD::ROTR, VT, Expand);
862   }
863   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
864   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
865   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
866     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
867 
868   // @llvm.readcyclecounter requires the Performance Monitors extension.
869   // Default to the 0 expansion on unsupported platforms.
870   // FIXME: Technically there are older ARM CPUs that have
871   // implementation-specific ways of obtaining this information.
872   if (Subtarget->hasPerfMon())
873     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
874 
875   // Only ARMv6 has BSWAP.
876   if (!Subtarget->hasV6Ops())
877     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
878 
879   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
880                                         : Subtarget->hasDivideInARMMode();
881   if (!hasDivide) {
882     // These are expanded into libcalls if the CPU has no hardware divider.
883     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
884     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
885   }
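  // In the libcall case (a sketch of the AEABI helper, assuming the names
  // configured elsewhere in this file):
  //   %q = sdiv i32 %a, %b
  // ends up as a call to __aeabi_idiv with %a in r0 and %b in r1, and the
  // quotient is returned in r0.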
886 
887   if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
888     setOperationAction(ISD::SDIV, MVT::i32, Custom);
889     setOperationAction(ISD::UDIV, MVT::i32, Custom);
890 
891     setOperationAction(ISD::SDIV, MVT::i64, Custom);
892     setOperationAction(ISD::UDIV, MVT::i64, Custom);
893   }
894 
895   setOperationAction(ISD::SREM,  MVT::i32, Expand);
896   setOperationAction(ISD::UREM,  MVT::i32, Expand);
897 
898   // Register based DivRem for AEABI (RTABI 4.2)
899   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
900       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
901       Subtarget->isTargetWindows()) {
902     setOperationAction(ISD::SREM, MVT::i64, Custom);
903     setOperationAction(ISD::UREM, MVT::i64, Custom);
904     HasStandaloneRem = false;
905 
906     if (Subtarget->isTargetWindows()) {
907       const struct {
908         const RTLIB::Libcall Op;
909         const char * const Name;
910         const CallingConv::ID CC;
911       } LibraryCalls[] = {
912         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
913         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
914         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
915         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
916 
917         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
918         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
919         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
920         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
921       };
922 
923       for (const auto &LC : LibraryCalls) {
924         setLibcallName(LC.Op, LC.Name);
925         setLibcallCallingConv(LC.Op, LC.CC);
926       }
927     } else {
928       const struct {
929         const RTLIB::Libcall Op;
930         const char * const Name;
931         const CallingConv::ID CC;
932       } LibraryCalls[] = {
933         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
934         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
935         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
936         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
937 
938         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
939         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
940         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
941         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
942       };
943 
944       for (const auto &LC : LibraryCalls) {
945         setLibcallName(LC.Op, LC.Name);
946         setLibcallCallingConv(LC.Op, LC.CC);
947       }
948     }
949 
950     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
951     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
952     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
953     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
954   } else {
955     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
956     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
957   }
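  // A sketch of what the custom SDIVREM/UDIVREM lowering relies on (per the
  // AEABI run-time ABI): __aeabi_idivmod returns the quotient in r0 and the
  // remainder in r1, so
  //   %q = sdiv i32 %a, %b
  //   %r = srem i32 %a, %b
  // can be covered by a single helper call instead of two.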
958 
959   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
960     for (auto &VT : {MVT::f32, MVT::f64})
961       setOperationAction(ISD::FPOWI, VT, Custom);
962 
963   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
964   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
965   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
966   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
967 
968   setOperationAction(ISD::TRAP, MVT::Other, Legal);
969 
970   // Use the default implementation.
971   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
972   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
973   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
974   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
975   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
976   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
977 
978   if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
979     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
980   else
981     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
982 
983   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
984   // the default expansion.
985   InsertFencesForAtomic = false;
986   if (Subtarget->hasAnyDataBarrier() &&
987       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
988     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
989     // to ldrex/strex loops already.
990     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
991     if (!Subtarget->isThumb() || !Subtarget->isMClass())
992       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
993 
994     // On v8, we have particularly efficient implementations of atomic fences
995     // if they can be combined with nearby atomic loads and stores.
996     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
997       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
998       InsertFencesForAtomic = true;
999     }
1000   } else {
1001     // If there's anything we can use as a barrier, go through custom lowering
1002     // for ATOMIC_FENCE.
1003     // If the target has DMB in Thumb mode, fences can be inserted.
1004     if (Subtarget->hasDataBarrier())
1005       InsertFencesForAtomic = true;
1006 
1007     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1008                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1009 
1010     // Set them all for expansion, which will force libcalls.
1011     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1012     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1013     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1014     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1015     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1016     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1017     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1018     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1019     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1020     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1021     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1022     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1023     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1024     // Unordered/Monotonic case.
1025     if (!InsertFencesForAtomic) {
1026       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1027       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1028     }
1029   }
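  // For illustration (assumed AtomicExpand behaviour when
  // InsertFencesForAtomic is set): an IR-level
  //   %v = load atomic i32, i32* %p acquire, align 4
  // is emitted as a plain ldr followed by a "dmb ish" barrier rather than
  // relying on an acquire-flavoured load instruction.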
1030 
1031   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1032 
1033   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1034   if (!Subtarget->hasV6Ops()) {
1035     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1036     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1037   }
1038   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1039 
1040   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1041       !Subtarget->isThumb1Only()) {
1042     // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
1043     // iff the target supports VFP2.
1044     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1045     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1046   }
1047 
1048   // We want to custom lower some of our intrinsics.
1049   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1050   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1051   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1052   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1053   if (Subtarget->useSjLjEH())
1054     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1055 
1056   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1057   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1058   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1059   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1060   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1061   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1062   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1063   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1064   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1065 
1066   // Thumb-1 cannot currently select ARMISD::SUBE.
1067   if (!Subtarget->isThumb1Only())
1068     setOperationAction(ISD::SETCCE, MVT::i32, Custom);
1069 
1070   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
1071   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1072   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1073   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1074   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1075 
1076   // We don't support sin/cos/fmod/copysign/pow
1077   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1078   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1079   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1080   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1081   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1082   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1083   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1084   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1085   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1086       !Subtarget->isThumb1Only()) {
1087     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1088     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1089   }
1090   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1091   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1092 
1093   if (!Subtarget->hasVFP4()) {
1094     setOperationAction(ISD::FMA, MVT::f64, Expand);
1095     setOperationAction(ISD::FMA, MVT::f32, Expand);
1096   }
1097 
1098   // Various VFP goodness
1099   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1100     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1101     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
1102       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1103       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1104     }
1105 
1106     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1107     if (!Subtarget->hasFP16()) {
1108       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1109       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1110     }
1111   }
1112 
1113   // Combine sin / cos into one node or libcall if possible.
1114   if (Subtarget->hasSinCos()) {
1115     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1116     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1117     if (Subtarget->isTargetWatchABI()) {
1118       setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
1119       setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
1120     }
1121     if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
1122       // For iOS, we don't want the normal expansion of a libcall to
1123       // sincos. We want to issue a libcall to __sincos_stret.
1124       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1125       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1126     }
1127   }
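  // A sketch of the combined form (the exact __sincos_stret return convention
  // is platform specific and assumed here): instead of two calls for
  //   float s = sinf(x), c = cosf(x);
  // both results come back from a single __sincos_stret call, which the
  // FSINCOS custom lowering then unpacks.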
1128 
1129   // FP-ARMv8 implements a lot of rounding-like FP operations.
1130   if (Subtarget->hasFPARMv8()) {
1131     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1132     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1133     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1134     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1135     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1136     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1137     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1138     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1139     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1140     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1141     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1142     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1143 
1144     if (!Subtarget->isFPOnlySP()) {
1145       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1146       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1147       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1148       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1149       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1150       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1151       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1152       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1153     }
1154   }
1155 
1156   if (Subtarget->hasNEON()) {
1157     // vmin and vmax aren't available in a scalar form, so we use
1158     // a NEON instruction with an undef lane instead.
1159     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
1160     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
1161     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
1162     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
1163     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
1164     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
1165   }
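  // For illustration (assumed selection): a scalar FMINNAN on f32 is done by
  // placing the operands in lane 0 of two D registers (the remaining lanes
  // are undef), issuing a vector vmin.f32, and reading lane 0 back out.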
1166 
1167   // We have target-specific dag combine patterns for the following nodes:
1168   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1169   setTargetDAGCombine(ISD::ADD);
1170   setTargetDAGCombine(ISD::SUB);
1171   setTargetDAGCombine(ISD::MUL);
1172   setTargetDAGCombine(ISD::AND);
1173   setTargetDAGCombine(ISD::OR);
1174   setTargetDAGCombine(ISD::XOR);
1175 
1176   if (Subtarget->hasV6Ops())
1177     setTargetDAGCombine(ISD::SRL);
1178 
1179   setStackPointerRegisterToSaveRestore(ARM::SP);
1180 
1181   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1182       !Subtarget->hasVFP2())
1183     setSchedulingPreference(Sched::RegPressure);
1184   else
1185     setSchedulingPreference(Sched::Hybrid);
1186 
1187   //// temporary - rewrite interface to use type
1188   MaxStoresPerMemset = 8;
1189   MaxStoresPerMemsetOptSize = 4;
1190   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1191   MaxStoresPerMemcpyOptSize = 2;
1192   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1193   MaxStoresPerMemmoveOptSize = 2;
1194 
1195   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1196   // are at least 4 bytes aligned.
1197   setMinStackArgumentAlignment(4);
1198 
1199   // Prefer likely predicted branches to selects on out-of-order cores.
1200   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1201 
1202   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1203 }
1204 
1205 bool ARMTargetLowering::useSoftFloat() const {
1206   return Subtarget->useSoftFloat();
1207 }
1208 
1209 // FIXME: It might make sense to define the representative register class as the
1210 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1211 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1212 // SPR's representative would be DPR_VFP2. This should work well if register
1213 // pressure tracking were modified such that a register use would increment the
1214 // pressure of the register class's representative and all of its super
1215 // classes' representatives transitively. We have not implemented this because
1216 // of the difficulty prior to coalescing of modeling operand register classes
1217 // due to the common occurrence of cross class copies and subregister insertions
1218 // and extractions.
1219 std::pair<const TargetRegisterClass *, uint8_t>
1220 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1221                                            MVT VT) const {
1222   const TargetRegisterClass *RRC = nullptr;
1223   uint8_t Cost = 1;
1224   switch (VT.SimpleTy) {
1225   default:
1226     return TargetLowering::findRepresentativeClass(TRI, VT);
1227   // Use DPR as the representative register class for all floating point
1228   // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1229   // the cost is 1 for both f32 and f64.
1230   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1231   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1232     RRC = &ARM::DPRRegClass;
1233     // When NEON is used for SP, only half of the register file is available
1234     // because operations that define both SP and DP results will be constrained
1235     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1236     // coalescing by double-counting the SP regs. See the FIXME above.
1237     if (Subtarget->useNEONForSinglePrecisionFP())
1238       Cost = 2;
1239     break;
1240   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1241   case MVT::v4f32: case MVT::v2f64:
1242     RRC = &ARM::DPRRegClass;
1243     Cost = 2;
1244     break;
1245   case MVT::v4i64:
1246     RRC = &ARM::DPRRegClass;
1247     Cost = 4;
1248     break;
1249   case MVT::v8i64:
1250     RRC = &ARM::DPRRegClass;
1251     Cost = 8;
1252     break;
1253   }
1254   return std::make_pair(RRC, Cost);
1255 }
1256 
1257 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1258   switch ((ARMISD::NodeType)Opcode) {
1259   case ARMISD::FIRST_NUMBER:  break;
1260   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1261   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1262   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1263   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1264   case ARMISD::CALL:          return "ARMISD::CALL";
1265   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1266   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1267   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1268   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1269   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1270   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1271   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1272   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1273   case ARMISD::CMP:           return "ARMISD::CMP";
1274   case ARMISD::CMN:           return "ARMISD::CMN";
1275   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1276   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1277   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1278   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1279   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1280 
1281   case ARMISD::CMOV:          return "ARMISD::CMOV";
1282 
1283   case ARMISD::SSAT:          return "ARMISD::SSAT";
1284 
1285   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1286   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1287   case ARMISD::RRX:           return "ARMISD::RRX";
1288 
1289   case ARMISD::ADDC:          return "ARMISD::ADDC";
1290   case ARMISD::ADDE:          return "ARMISD::ADDE";
1291   case ARMISD::SUBC:          return "ARMISD::SUBC";
1292   case ARMISD::SUBE:          return "ARMISD::SUBE";
1293 
1294   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1295   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1296 
1297   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1298   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1299   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1300 
1301   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1302 
1303   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1304 
1305   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1306 
1307   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1308 
1309   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1310 
1311   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1312   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1313 
1314   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1315   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1316   case ARMISD::VCGE:          return "ARMISD::VCGE";
1317   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1318   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1319   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1320   case ARMISD::VCGT:          return "ARMISD::VCGT";
1321   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1322   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1323   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1324   case ARMISD::VTST:          return "ARMISD::VTST";
1325 
1326   case ARMISD::VSHL:          return "ARMISD::VSHL";
1327   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1328   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1329   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1330   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1331   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1332   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1333   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1334   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1335   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1336   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1337   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1338   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1339   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1340   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1341   case ARMISD::VSLI:          return "ARMISD::VSLI";
1342   case ARMISD::VSRI:          return "ARMISD::VSRI";
1343   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1344   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1345   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1346   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1347   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1348   case ARMISD::VDUP:          return "ARMISD::VDUP";
1349   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1350   case ARMISD::VEXT:          return "ARMISD::VEXT";
1351   case ARMISD::VREV64:        return "ARMISD::VREV64";
1352   case ARMISD::VREV32:        return "ARMISD::VREV32";
1353   case ARMISD::VREV16:        return "ARMISD::VREV16";
1354   case ARMISD::VZIP:          return "ARMISD::VZIP";
1355   case ARMISD::VUZP:          return "ARMISD::VUZP";
1356   case ARMISD::VTRN:          return "ARMISD::VTRN";
1357   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1358   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1359   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1360   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1361   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1362   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1363   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1364   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1365   case ARMISD::BFI:           return "ARMISD::BFI";
1366   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1367   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1368   case ARMISD::VBSL:          return "ARMISD::VBSL";
1369   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1370   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
1371   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1372   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1373   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1374   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1375   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1376   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1377   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1378   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1379   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1380   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1381   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
1382   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1383   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1384   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1385   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1386   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1387   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1388   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1389   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1390   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1391   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1392   }
1393   return nullptr;
1394 }
1395 
1396 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1397                                           EVT VT) const {
1398   if (!VT.isVector())
1399     return getPointerTy(DL);
1400   return VT.changeVectorElementTypeToInteger();
1401 }
1402 
1403 /// getRegClassFor - Return the register class that should be used for the
1404 /// specified value type.
1405 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1406   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1407   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1408   // load / store 4 to 8 consecutive D registers.
1409   if (Subtarget->hasNEON()) {
1410     if (VT == MVT::v4i64)
1411       return &ARM::QQPRRegClass;
1412     if (VT == MVT::v8i64)
1413       return &ARM::QQQQPRRegClass;
1414   }
1415   return TargetLowering::getRegClassFor(VT);
1416 }
1417 
1418 // memcpy and other memory intrinsics typically try to use LDM/STM if the
1419 // source/dest is aligned and the copy size is large enough. We therefore want
1420 // to align such objects passed to memory intrinsics.
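     // For example, a 64-byte @llvm.memcpy whose source and destination end up
     // 8-byte aligned can then be lowered with LDM/STM (or LDRD/STRD) sequences
     // instead of narrower loads and stores.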
1421 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1422                                                unsigned &PrefAlign) const {
1423   if (!isa<MemIntrinsic>(CI))
1424     return false;
1425   MinSize = 8;
1426   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1427   // cycle faster than 4-byte aligned LDM.
1428   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1429   return true;
1430 }
1431 
1432 // Create a fast isel object.
1433 FastISel *
1434 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1435                                   const TargetLibraryInfo *libInfo) const {
1436   return ARM::createFastISel(funcInfo, libInfo);
1437 }
1438 
1439 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1440   unsigned NumVals = N->getNumValues();
1441   if (!NumVals)
1442     return Sched::RegPressure;
1443 
1444   for (unsigned i = 0; i != NumVals; ++i) {
1445     EVT VT = N->getValueType(i);
1446     if (VT == MVT::Glue || VT == MVT::Other)
1447       continue;
1448     if (VT.isFloatingPoint() || VT.isVector())
1449       return Sched::ILP;
1450   }
1451 
1452   if (!N->isMachineOpcode())
1453     return Sched::RegPressure;
1454 
1455   // Loads are scheduled for latency even if the instruction itinerary
1456   // is not available.
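       // In practice: machine nodes with at least one def whose first-def
       // operand latency in the itinerary exceeds 2 cycles are scheduled for
       // ILP; everything else falls back to register-pressure scheduling.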
1457   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1458   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1459 
1460   if (MCID.getNumDefs() == 0)
1461     return Sched::RegPressure;
1462   if (!Itins->isEmpty() &&
1463       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1464     return Sched::ILP;
1465 
1466   return Sched::RegPressure;
1467 }
1468 
1469 //===----------------------------------------------------------------------===//
1470 // Lowering Code
1471 //===----------------------------------------------------------------------===//
1472 
1473 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1474 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1475   switch (CC) {
1476   default: llvm_unreachable("Unknown condition code!");
1477   case ISD::SETNE:  return ARMCC::NE;
1478   case ISD::SETEQ:  return ARMCC::EQ;
1479   case ISD::SETGT:  return ARMCC::GT;
1480   case ISD::SETGE:  return ARMCC::GE;
1481   case ISD::SETLT:  return ARMCC::LT;
1482   case ISD::SETLE:  return ARMCC::LE;
1483   case ISD::SETUGT: return ARMCC::HI;
1484   case ISD::SETUGE: return ARMCC::HS;
1485   case ISD::SETULT: return ARMCC::LO;
1486   case ISD::SETULE: return ARMCC::LS;
1487   }
1488 }
1489 
1490 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
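     /// Some conditions (e.g. SETONE or SETUEQ) have no single ARM equivalent,
     /// so a second condition is returned in CondCode2 (ARMCC::AL means "unused")
     /// and callers are expected to test both; SETONE, for instance, becomes
     /// MI or GT.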
1491 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1492                         ARMCC::CondCodes &CondCode2) {
1493   CondCode2 = ARMCC::AL;
1494   switch (CC) {
1495   default: llvm_unreachable("Unknown FP condition!");
1496   case ISD::SETEQ:
1497   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1498   case ISD::SETGT:
1499   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1500   case ISD::SETGE:
1501   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1502   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1503   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1504   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1505   case ISD::SETO:   CondCode = ARMCC::VC; break;
1506   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1507   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1508   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1509   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1510   case ISD::SETLT:
1511   case ISD::SETULT: CondCode = ARMCC::LT; break;
1512   case ISD::SETLE:
1513   case ISD::SETULE: CondCode = ARMCC::LE; break;
1514   case ISD::SETNE:
1515   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1516   }
1517 }
1518 
1519 //===----------------------------------------------------------------------===//
1520 //                      Calling Convention Implementation
1521 //===----------------------------------------------------------------------===//
1522 
1523 #include "ARMGenCallingConv.inc"
1524 
1525 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1526 /// account presence of floating point hardware and calling convention
1527 /// limitations, such as support for variadic functions.
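     /// For example, on a hard-float AAPCS target CallingConv::C normally becomes
     /// ARM_AAPCS_VFP, but a variadic call falls back to plain ARM_AAPCS because
     /// the VFP variant cannot be used for varargs.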
1528 CallingConv::ID
1529 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1530                                            bool isVarArg) const {
1531   switch (CC) {
1532   default:
1533     llvm_unreachable("Unsupported calling convention");
1534   case CallingConv::ARM_AAPCS:
1535   case CallingConv::ARM_APCS:
1536   case CallingConv::GHC:
1537     return CC;
1538   case CallingConv::PreserveMost:
1539     return CallingConv::PreserveMost;
1540   case CallingConv::ARM_AAPCS_VFP:
1541   case CallingConv::Swift:
1542     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1543   case CallingConv::C:
1544     if (!Subtarget->isAAPCS_ABI())
1545       return CallingConv::ARM_APCS;
1546     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1547              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1548              !isVarArg)
1549       return CallingConv::ARM_AAPCS_VFP;
1550     else
1551       return CallingConv::ARM_AAPCS;
1552   case CallingConv::Fast:
1553   case CallingConv::CXX_FAST_TLS:
1554     if (!Subtarget->isAAPCS_ABI()) {
1555       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1556         return CallingConv::Fast;
1557       return CallingConv::ARM_APCS;
1558     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1559       return CallingConv::ARM_AAPCS_VFP;
1560     else
1561       return CallingConv::ARM_AAPCS;
1562   }
1563 }
1564 
1565 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1566                                                  bool isVarArg) const {
1567   return CCAssignFnForNode(CC, false, isVarArg);
1568 }
1569 
1570 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1571                                                    bool isVarArg) const {
1572   return CCAssignFnForNode(CC, true, isVarArg);
1573 }
1574 
1575 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1576 /// CallingConvention.
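     /// (The CC_* / RetCC_* assignment functions used here are generated from
     /// ARMCallingConv.td via the ARMGenCallingConv.inc include above.)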
1577 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1578                                                  bool Return,
1579                                                  bool isVarArg) const {
1580   switch (getEffectiveCallingConv(CC, isVarArg)) {
1581   default:
1582     llvm_unreachable("Unsupported calling convention");
1583   case CallingConv::ARM_APCS:
1584     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1585   case CallingConv::ARM_AAPCS:
1586     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1587   case CallingConv::ARM_AAPCS_VFP:
1588     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1589   case CallingConv::Fast:
1590     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1591   case CallingConv::GHC:
1592     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1593   case CallingConv::PreserveMost:
1594     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1595   }
1596 }
1597 
1598 /// LowerCallResult - Lower the result values of a call into the
1599 /// appropriate copies out of appropriate physical registers.
1600 SDValue ARMTargetLowering::LowerCallResult(
1601     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
1602     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1603     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1604     SDValue ThisVal) const {
1605 
1606   // Assign locations to each value returned by this call.
1607   SmallVector<CCValAssign, 16> RVLocs;
1608   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1609                     *DAG.getContext(), Call);
1610   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1611 
1612   // Copy all of the result registers out of their specified physreg.
1613   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1614     CCValAssign VA = RVLocs[i];
1615 
1616     // Pass the 'this' value directly from the argument to the return value,
1617     // to avoid register unit interference.
1618     if (i == 0 && isThisReturn) {
1619       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1620              "unexpected return calling convention register assignment");
1621       InVals.push_back(ThisVal);
1622       continue;
1623     }
1624 
1625     SDValue Val;
1626     if (VA.needsCustom()) {
1627       // Handle f64 or half of a v2f64.
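           // An f64 comes back as two i32 halves in consecutive return locations;
           // they are reassembled below with ARMISD::VMOVDRR (vmov dN, rLo, rHi),
           // swapping the halves first on big-endian targets.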
1628       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1629                                       InFlag);
1630       Chain = Lo.getValue(1);
1631       InFlag = Lo.getValue(2);
1632       VA = RVLocs[++i]; // skip ahead to next loc
1633       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1634                                       InFlag);
1635       Chain = Hi.getValue(1);
1636       InFlag = Hi.getValue(2);
1637       if (!Subtarget->isLittle())
1638         std::swap (Lo, Hi);
1639       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1640 
1641       if (VA.getLocVT() == MVT::v2f64) {
1642         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1643         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1644                           DAG.getConstant(0, dl, MVT::i32));
1645 
1646         VA = RVLocs[++i]; // skip ahead to next loc
1647         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1648         Chain = Lo.getValue(1);
1649         InFlag = Lo.getValue(2);
1650         VA = RVLocs[++i]; // skip ahead to next loc
1651         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1652         Chain = Hi.getValue(1);
1653         InFlag = Hi.getValue(2);
1654         if (!Subtarget->isLittle())
1655           std::swap (Lo, Hi);
1656         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1657         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1658                           DAG.getConstant(1, dl, MVT::i32));
1659       }
1660     } else {
1661       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1662                                InFlag);
1663       Chain = Val.getValue(1);
1664       InFlag = Val.getValue(2);
1665     }
1666 
1667     switch (VA.getLocInfo()) {
1668     default: llvm_unreachable("Unknown loc info!");
1669     case CCValAssign::Full: break;
1670     case CCValAssign::BCvt:
1671       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1672       break;
1673     }
1674 
1675     InVals.push_back(Val);
1676   }
1677 
1678   return Chain;
1679 }
1680 
1681 /// LowerMemOpCallTo - Store the argument to the stack.
1682 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1683                                             SDValue Arg, const SDLoc &dl,
1684                                             SelectionDAG &DAG,
1685                                             const CCValAssign &VA,
1686                                             ISD::ArgFlagsTy Flags) const {
1687   unsigned LocMemOffset = VA.getLocMemOffset();
1688   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1689   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1690                        StackPtr, PtrOff);
1691   return DAG.getStore(
1692       Chain, dl, Arg, PtrOff,
1693       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1694 }
1695 
1696 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1697                                          SDValue Chain, SDValue &Arg,
1698                                          RegsToPassVector &RegsToPass,
1699                                          CCValAssign &VA, CCValAssign &NextVA,
1700                                          SDValue &StackPtr,
1701                                          SmallVectorImpl<SDValue> &MemOpChains,
1702                                          ISD::ArgFlagsTy Flags) const {
1703 
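       // Split the f64 into two i32 halves with ARMISD::VMOVRRD
       // (vmov rLo, rHi, dN). The first half always goes in VA's register; the
       // second goes in NextVA's register if one was assigned, otherwise it is
       // stored to the stack via LowerMemOpCallTo.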
1704   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1705                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1706   unsigned id = Subtarget->isLittle() ? 0 : 1;
1707   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1708 
1709   if (NextVA.isRegLoc())
1710     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1711   else {
1712     assert(NextVA.isMemLoc());
1713     if (!StackPtr.getNode())
1714       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1715                                     getPointerTy(DAG.getDataLayout()));
1716 
1717     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1718                                            dl, DAG, NextVA,
1719                                            Flags));
1720   }
1721 }
1722 
1723 /// LowerCall - Lower a call into a callseq_start <-
1724 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
1725 /// nodes.
1726 SDValue
1727 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1728                              SmallVectorImpl<SDValue> &InVals) const {
1729   SelectionDAG &DAG                     = CLI.DAG;
1730   SDLoc &dl                             = CLI.DL;
1731   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1732   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1733   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1734   SDValue Chain                         = CLI.Chain;
1735   SDValue Callee                        = CLI.Callee;
1736   bool &isTailCall                      = CLI.IsTailCall;
1737   CallingConv::ID CallConv              = CLI.CallConv;
1738   bool doesNotRet                       = CLI.DoesNotReturn;
1739   bool isVarArg                         = CLI.IsVarArg;
1740 
1741   MachineFunction &MF = DAG.getMachineFunction();
1742   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1743   bool isThisReturn   = false;
1744   bool isSibCall      = false;
1745   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
1746 
1747   // Disable tail calls if they're not supported.
1748   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
1749     isTailCall = false;
1750 
1751   if (isTailCall) {
1752     // Check if it's really possible to do a tail call.
1753     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1754                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1755                                                    Outs, OutVals, Ins, DAG);
1756     if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
1757       report_fatal_error("failed to perform tail call elimination on a call "
1758                          "site marked musttail");
1759     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1760     // detected sibcalls.
1761     if (isTailCall) {
1762       ++NumTailCalls;
1763       isSibCall = true;
1764     }
1765   }
1766 
1767   // Analyze operands of the call, assigning locations to each operand.
1768   SmallVector<CCValAssign, 16> ArgLocs;
1769   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1770                     *DAG.getContext(), Call);
1771   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
1772 
1773   // Get a count of how many bytes are to be pushed on the stack.
1774   unsigned NumBytes = CCInfo.getNextStackOffset();
1775 
1776   // For tail calls, memory operands are available in our caller's stack.
1777   if (isSibCall)
1778     NumBytes = 0;
1779 
1780   // Adjust the stack pointer for the new arguments...
1781   // These operations are automatically eliminated by the prolog/epilog pass
1782   if (!isSibCall)
1783     Chain = DAG.getCALLSEQ_START(Chain,
1784                                  DAG.getIntPtrConstant(NumBytes, dl, true), dl);
1785 
1786   SDValue StackPtr =
1787       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
1788 
1789   RegsToPassVector RegsToPass;
1790   SmallVector<SDValue, 8> MemOpChains;
1791 
1792   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1793   // of tail call optimization, arguments are handled later.
1794   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1795        i != e;
1796        ++i, ++realArgIdx) {
1797     CCValAssign &VA = ArgLocs[i];
1798     SDValue Arg = OutVals[realArgIdx];
1799     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1800     bool isByVal = Flags.isByVal();
1801 
1802     // Promote the value if needed.
1803     switch (VA.getLocInfo()) {
1804     default: llvm_unreachable("Unknown loc info!");
1805     case CCValAssign::Full: break;
1806     case CCValAssign::SExt:
1807       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1808       break;
1809     case CCValAssign::ZExt:
1810       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1811       break;
1812     case CCValAssign::AExt:
1813       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1814       break;
1815     case CCValAssign::BCvt:
1816       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1817       break;
1818     }
1819 
1820     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1821     if (VA.needsCustom()) {
1822       if (VA.getLocVT() == MVT::v2f64) {
1823         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1824                                   DAG.getConstant(0, dl, MVT::i32));
1825         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1826                                   DAG.getConstant(1, dl, MVT::i32));
1827 
1828         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1829                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1830 
1831         VA = ArgLocs[++i]; // skip ahead to next loc
1832         if (VA.isRegLoc()) {
1833           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1834                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1835         } else {
1836           assert(VA.isMemLoc());
1837 
1838           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1839                                                  dl, DAG, VA, Flags));
1840         }
1841       } else {
1842         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1843                          StackPtr, MemOpChains, Flags);
1844       }
1845     } else if (VA.isRegLoc()) {
1846       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1847         assert(VA.getLocVT() == MVT::i32 &&
1848                "unexpected calling convention register assignment");
1849         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1850                "unexpected use of 'returned'");
1851         isThisReturn = true;
1852       }
1853       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1854     } else if (isByVal) {
1855       assert(VA.isMemLoc());
1856       unsigned offset = 0;
1857 
1858       // True if this byval aggregate will be split between registers
1859       // and memory.
1860       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1861       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
1862 
1863       if (CurByValIdx < ByValArgsCount) {
1864 
1865         unsigned RegBegin, RegEnd;
1866         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1867 
1868         EVT PtrVT =
1869             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1870         unsigned int i, j;
1871         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1872           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
1873           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1874           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1875                                      MachinePointerInfo(),
1876                                      DAG.InferPtrAlignment(AddArg));
1877           MemOpChains.push_back(Load.getValue(1));
1878           RegsToPass.push_back(std::make_pair(j, Load));
1879         }
1880 
1881         // If the parameter size exceeds the register area, the "offset" value
1882         // helps us calculate the stack slot for the remaining part properly.
1883         offset = RegEnd - RegBegin;
1884 
1885         CCInfo.nextInRegsParam();
1886       }
1887 
1888       if (Flags.getByValSize() > 4*offset) {
1889         auto PtrVT = getPointerTy(DAG.getDataLayout());
1890         unsigned LocMemOffset = VA.getLocMemOffset();
1891         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1892         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
1893         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
1894         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
1895         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
1896                                            MVT::i32);
1897         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
1898                                             MVT::i32);
1899 
1900         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1901         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1902         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1903                                           Ops));
1904       }
1905     } else if (!isSibCall) {
1906       assert(VA.isMemLoc());
1907 
1908       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1909                                              dl, DAG, VA, Flags));
1910     }
1911   }
1912 
1913   if (!MemOpChains.empty())
1914     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
1915 
1916   // Build a sequence of copy-to-reg nodes chained together with token chain
1917   // and flag operands which copy the outgoing args into the appropriate regs.
1918   SDValue InFlag;
1919   // Tail call byval lowering might overwrite argument registers, so in the
1920   // case of tail call optimization the copies to registers are lowered later.
1921   if (!isTailCall)
1922     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1923       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1924                                RegsToPass[i].second, InFlag);
1925       InFlag = Chain.getValue(1);
1926     }
1927 
1928   // For tail calls lower the arguments to the 'real' stack slot.
1929   if (isTailCall) {
1930     // Force all the incoming stack arguments to be loaded from the stack
1931     // before any new outgoing arguments are stored to the stack, because the
1932     // outgoing stack slots may alias the incoming argument stack slots, and
1933     // the alias isn't otherwise explicit. This is slightly more conservative
1934     // than necessary, because it means that each store effectively depends
1935     // on every argument instead of just those arguments it would clobber.
1936 
1937     // Do not flag preceding copytoreg stuff together with the following stuff.
1938     InFlag = SDValue();
1939     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1940       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1941                                RegsToPass[i].second, InFlag);
1942       InFlag = Chain.getValue(1);
1943     }
1944     InFlag = SDValue();
1945   }
1946 
1947   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1948   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1949   // node so that legalize doesn't hack it.
1950   bool isDirect = false;
1951 
1952   const TargetMachine &TM = getTargetMachine();
1953   const Module *Mod = MF.getFunction()->getParent();
1954   const GlobalValue *GV = nullptr;
1955   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1956     GV = G->getGlobal();
1957   bool isStub =
1958       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
1959 
1960   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
1961   bool isLocalARMFunc = false;
1962   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1963   auto PtrVt = getPointerTy(DAG.getDataLayout());
1964 
1965   if (Subtarget->genLongCalls()) {
1966     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
1967            "long-calls codegen is not position independent!");
1968     // Handle a global address or an external symbol. If it's not one of
1969     // those, the target's already in a register, so we don't need to do
1970     // anything extra.
1971     if (isa<GlobalAddressSDNode>(Callee)) {
1972       // Create a constant pool entry for the callee address
1973       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1974       ARMConstantPoolValue *CPV =
1975         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
1976 
1977       // Get the address of the callee into a register
1978       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
1979       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1980       Callee = DAG.getLoad(
1981           PtrVt, dl, DAG.getEntryNode(), CPAddr,
1982           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
1983     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
1984       const char *Sym = S->getSymbol();
1985 
1986       // Create a constant pool entry for the callee address
1987       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1988       ARMConstantPoolValue *CPV =
1989         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1990                                       ARMPCLabelIndex, 0);
1991       // Get the address of the callee into a register
1992       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
1993       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1994       Callee = DAG.getLoad(
1995           PtrVt, dl, DAG.getEntryNode(), CPAddr,
1996           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
1997     }
1998   } else if (isa<GlobalAddressSDNode>(Callee)) {
1999     // If we're optimizing for minimum size and the function is called three or
2000     // more times in this block, we can improve codesize by calling indirectly
2001     // as BLXr has a 16-bit encoding.
2002     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2003     auto *BB = CLI.CS->getParent();
2004     bool PreferIndirect =
2005         Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
2006         count_if(GV->users(), [&BB](const User *U) {
2007           return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
2008         }) > 2;
2009 
2010     if (!PreferIndirect) {
2011       isDirect = true;
2012       bool isDef = GV->isStrongDefinitionForLinker();
2013 
2014       // ARM call to a local ARM function is predicable.
2015       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2016       // tBX takes a register source operand.
2017       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2018         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2019         Callee = DAG.getNode(
2020             ARMISD::WrapperPIC, dl, PtrVt,
2021             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2022         Callee = DAG.getLoad(
2023             PtrVt, dl, DAG.getEntryNode(), Callee,
2024             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2025             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2026                                      MachineMemOperand::MOInvariant);
2027       } else if (Subtarget->isTargetCOFF()) {
2028         assert(Subtarget->isTargetWindows() &&
2029                "Windows is the only supported COFF target");
2030         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2031                                    ? ARMII::MO_DLLIMPORT
2032                                    : ARMII::MO_NO_FLAG;
2033         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
2034                                             TargetFlags);
2035         if (GV->hasDLLImportStorageClass())
2036           Callee =
2037               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2038                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2039                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2040       } else {
2041         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2042       }
2043     }
2044   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2045     isDirect = true;
2046     // tBX takes a register source operand.
2047     const char *Sym = S->getSymbol();
2048     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2049       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2050       ARMConstantPoolValue *CPV =
2051         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2052                                       ARMPCLabelIndex, 4);
2053       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2054       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2055       Callee = DAG.getLoad(
2056           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2057           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2058       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2059       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2060     } else {
2061       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2062     }
2063   }
2064 
2065   // FIXME: handle tail calls differently.
2066   unsigned CallOpc;
2067   if (Subtarget->isThumb()) {
2068     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2069       CallOpc = ARMISD::CALL_NOLINK;
2070     else
2071       CallOpc = ARMISD::CALL;
2072   } else {
2073     if (!isDirect && !Subtarget->hasV5TOps())
2074       CallOpc = ARMISD::CALL_NOLINK;
2075     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2076              // Emit regular call when code size is the priority
2077              !MF.getFunction()->optForMinSize())
2078       // "mov lr, pc; b _foo" to avoid confusing the return stack
           // predictor (RSP).
2079       CallOpc = ARMISD::CALL_NOLINK;
2080     else
2081       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2082   }
2083 
2084   std::vector<SDValue> Ops;
2085   Ops.push_back(Chain);
2086   Ops.push_back(Callee);
2087 
2088   // Add argument registers to the end of the list so that they are known live
2089   // into the call.
2090   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2091     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2092                                   RegsToPass[i].second.getValueType()));
2093 
2094   // Add a register mask operand representing the call-preserved registers.
2095   if (!isTailCall) {
2096     const uint32_t *Mask;
2097     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2098     if (isThisReturn) {
2099       // For 'this' returns, use the R0-preserving mask if applicable
2100       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2101       if (!Mask) {
2102         // Set isThisReturn to false if the calling convention is not one that
2103         // allows 'returned' to be modeled in this way, so LowerCallResult does
2104         // not try to pass 'this' straight through
2105         isThisReturn = false;
2106         Mask = ARI->getCallPreservedMask(MF, CallConv);
2107       }
2108     } else
2109       Mask = ARI->getCallPreservedMask(MF, CallConv);
2110 
2111     assert(Mask && "Missing call preserved mask for calling convention");
2112     Ops.push_back(DAG.getRegisterMask(Mask));
2113   }
2114 
2115   if (InFlag.getNode())
2116     Ops.push_back(InFlag);
2117 
2118   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2119   if (isTailCall) {
2120     MF.getFrameInfo().setHasTailCall();
2121     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2122   }
2123 
2124   // Returns a chain and a flag for retval copy to use.
2125   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2126   InFlag = Chain.getValue(1);
2127 
2128   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2129                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2130   if (!Ins.empty())
2131     InFlag = Chain.getValue(1);
2132 
2133   // Handle result values, copying them out of physregs into vregs that we
2134   // return.
2135   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2136                          InVals, isThisReturn,
2137                          isThisReturn ? OutVals[0] : SDValue());
2138 }
2139 
2140 /// HandleByVal - Every parameter *after* a byval parameter is passed
2141 /// on the stack.  Remember the next parameter register to allocate,
2142 /// and then confiscate the rest of the parameter registers to ensure
2143 /// this.
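     /// For example (assuming nothing has yet been passed on the stack), a
     /// 12-byte byval with 4-byte alignment whose first free register is r2 gets
     /// r2-r3 for its first 8 bytes, and Size is reduced to the 4 bytes left for
     /// the stack.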
2144 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2145                                     unsigned Align) const {
2146   assert((State->getCallOrPrologue() == Prologue ||
2147           State->getCallOrPrologue() == Call) &&
2148          "unhandled ParmContext");
2149 
2150   // Byval slots, like all stack slots, are always at least 4-byte aligned.
2151   Align = std::max(Align, 4U);
2152 
2153   unsigned Reg = State->AllocateReg(GPRArgRegs);
2154   if (!Reg)
2155     return;
2156 
2157   unsigned AlignInRegs = Align / 4;
2158   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2159   for (unsigned i = 0; i < Waste; ++i)
2160     Reg = State->AllocateReg(GPRArgRegs);
2161 
2162   if (!Reg)
2163     return;
2164 
2165   unsigned Excess = 4 * (ARM::R4 - Reg);
2166 
2167   // Special case when NSAA != SP and the parameter size is greater than the
2168   // size of all remaining GPR regs. In that case we can't split the parameter;
2169   // it must go on the stack. We also must set the NCRN to R4, wasting all
2170   // remaining registers.
2171   const unsigned NSAAOffset = State->getNextStackOffset();
2172   if (NSAAOffset != 0 && Size > Excess) {
2173     while (State->AllocateReg(GPRArgRegs))
2174       ;
2175     return;
2176   }
2177 
2178   // The first register for the byval parameter is the first register that
2179   // wasn't allocated before this method call, i.e. "Reg". If the parameter
2180   // is small enough to fit in the range [Reg, r4), then the end (one past
2181   // the last) register is Reg + param-size-in-regs; otherwise the parameter
2182   // is split between registers and the stack, and the end register is r4 in
2183   // that case.
2184   unsigned ByValRegBegin = Reg;
2185   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2186   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2187   // Note: the first register was already allocated above; allocate the
2188   // remaining registers we need.
2189   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2190     State->AllocateReg(GPRArgRegs);
2191   // A byval parameter that is split between registers and memory needs its
2192   // size truncated here.
2193   // In the case where the entire structure fits in registers, we set the
2194   // size in memory to zero.
2195   Size = std::max<int>(Size - Excess, 0);
2196 }
2197 
2198 /// MatchingStackOffset - Return true if the given stack call argument is
2199 /// already available in the same position (relatively) of the caller's
2200 /// incoming argument stack.
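     /// In practice this means the outgoing value must be a load (or a copy of a
     /// load) from a fixed-object stack slot with the same offset and size as the
     /// outgoing argument slot; byval arguments never match.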
2201 static
2202 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2203                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2204                          const TargetInstrInfo *TII) {
2205   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2206   int FI = std::numeric_limits<int>::max();
2207   if (Arg.getOpcode() == ISD::CopyFromReg) {
2208     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2209     if (!TargetRegisterInfo::isVirtualRegister(VR))
2210       return false;
2211     MachineInstr *Def = MRI->getVRegDef(VR);
2212     if (!Def)
2213       return false;
2214     if (!Flags.isByVal()) {
2215       if (!TII->isLoadFromStackSlot(*Def, FI))
2216         return false;
2217     } else {
2218       return false;
2219     }
2220   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2221     if (Flags.isByVal())
2222       // ByVal argument is passed in as a pointer but it's now being
2223       // dereferenced. e.g.
2224       // define @foo(%struct.X* %A) {
2225       //   tail call @bar(%struct.X* byval %A)
2226       // }
2227       return false;
2228     SDValue Ptr = Ld->getBasePtr();
2229     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2230     if (!FINode)
2231       return false;
2232     FI = FINode->getIndex();
2233   } else
2234     return false;
2235 
2236   assert(FI != std::numeric_limits<int>::max());
2237   if (!MFI.isFixedObjectIndex(FI))
2238     return false;
2239   return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2240 }
2241 
2242 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2243 /// for tail call optimization. Targets which want to do tail call
2244 /// optimization should implement this function.
2245 bool
2246 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2247                                                      CallingConv::ID CalleeCC,
2248                                                      bool isVarArg,
2249                                                      bool isCalleeStructRet,
2250                                                      bool isCallerStructRet,
2251                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2252                                     const SmallVectorImpl<SDValue> &OutVals,
2253                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2254                                                      SelectionDAG& DAG) const {
2255   MachineFunction &MF = DAG.getMachineFunction();
2256   const Function *CallerF = MF.getFunction();
2257   CallingConv::ID CallerCC = CallerF->getCallingConv();
2258 
2259   assert(Subtarget->supportsTailCall());
2260 
2261   // Look for obvious safe cases to perform tail call optimization that do not
2262   // require ABI changes. This is what gcc calls sibcall.
2263 
2264   // Exception-handling functions need a special set of instructions to indicate
2265   // a return to the hardware. Tail-calling another function would probably
2266   // break this.
2267   if (CallerF->hasFnAttribute("interrupt"))
2268     return false;
2269 
2270   // Also avoid sibcall optimization if either caller or callee uses struct
2271   // return semantics.
2272   if (isCalleeStructRet || isCallerStructRet)
2273     return false;
2274 
2275   // Externally-defined functions with weak linkage should not be
2276   // tail-called on ARM when the OS does not support dynamic
2277   // pre-emption of symbols, as the AAELF spec requires normal calls
2278   // to undefined weak functions to be replaced with a NOP or jump to the
2279   // next instruction. The behaviour of branch instructions in this
2280   // situation (as used for tail calls) is implementation-defined, so we
2281   // cannot rely on the linker replacing the tail call with a return.
2282   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2283     const GlobalValue *GV = G->getGlobal();
2284     const Triple &TT = getTargetMachine().getTargetTriple();
2285     if (GV->hasExternalWeakLinkage() &&
2286         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2287       return false;
2288   }
2289 
2290   // Check that the call results are passed in the same way.
2291   LLVMContext &C = *DAG.getContext();
2292   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2293                                   CCAssignFnForReturn(CalleeCC, isVarArg),
2294                                   CCAssignFnForReturn(CallerCC, isVarArg)))
2295     return false;
2296   // The callee has to preserve all registers the caller needs to preserve.
2297   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2298   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2299   if (CalleeCC != CallerCC) {
2300     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2301     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2302       return false;
2303   }
2304 
2305   // If the caller's vararg or byval argument has been split between registers
2306   // and the stack, do not perform a tail call, since part of the argument is
2307   // in the caller's local frame.
2308   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2309   if (AFI_Caller->getArgRegsSaveSize())
2310     return false;
2311 
2312   // If the callee takes no arguments then go on to check the results of the
2313   // call.
2314   if (!Outs.empty()) {
2315     // Check if stack adjustment is needed. For now, do not do this if any
2316     // argument is passed on the stack.
2317     SmallVector<CCValAssign, 16> ArgLocs;
2318     ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
2319     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2320     if (CCInfo.getNextStackOffset()) {
2321       // Check if the arguments are already laid out in the right way as
2322       // the caller's fixed stack objects.
2323       MachineFrameInfo &MFI = MF.getFrameInfo();
2324       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2325       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2326       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2327            i != e;
2328            ++i, ++realArgIdx) {
2329         CCValAssign &VA = ArgLocs[i];
2330         EVT RegVT = VA.getLocVT();
2331         SDValue Arg = OutVals[realArgIdx];
2332         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2333         if (VA.getLocInfo() == CCValAssign::Indirect)
2334           return false;
2335         if (VA.needsCustom()) {
2336           // f64 and vector types are split into multiple registers or
2337           // register/stack-slot combinations.  The types will not match
2338           // the registers; give up on memory f64 refs until we figure
2339           // out what to do about this.
2340           if (!VA.isRegLoc())
2341             return false;
2342           if (!ArgLocs[++i].isRegLoc())
2343             return false;
2344           if (RegVT == MVT::v2f64) {
2345             if (!ArgLocs[++i].isRegLoc())
2346               return false;
2347             if (!ArgLocs[++i].isRegLoc())
2348               return false;
2349           }
2350         } else if (!VA.isRegLoc()) {
2351           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2352                                    MFI, MRI, TII))
2353             return false;
2354         }
2355       }
2356     }
2357 
2358     const MachineRegisterInfo &MRI = MF.getRegInfo();
2359     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2360       return false;
2361   }
2362 
2363   return true;
2364 }
2365 
2366 bool
2367 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2368                                   MachineFunction &MF, bool isVarArg,
2369                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2370                                   LLVMContext &Context) const {
2371   SmallVector<CCValAssign, 16> RVLocs;
2372   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2373   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2374 }
2375 
2376 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2377                                     const SDLoc &DL, SelectionDAG &DAG) {
2378   const MachineFunction &MF = DAG.getMachineFunction();
2379   const Function *F = MF.getFunction();
2380 
2381   StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
2382 
2383   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2384   // version of the "preferred return address". These offsets affect the return
2385   // instruction if this is a return from PL1 without hypervisor extensions.
2386   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2387   //    SWI:     0      "subs pc, lr, #0"
2388   //    ABORT:   +4     "subs pc, lr, #4"
2389   //    UNDEF:   +4/+2  "subs pc, lr, #0"
  // UNDEF varies depending on whether the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
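  //
  // As an illustration (the exact code depends on the subtarget), an IRQ
  // handler written with something like
  //   __attribute__((interrupt("IRQ"))) void handler(void) { ... }
  // is expected to return with "subs pc, lr, #4", restoring the pre-exception
  // PC and CPSR in a single instruction, where an ordinary function would
  // simply return with "bx lr".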
2392 
2393   int64_t LROffset;
2394   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2395       IntKind == "ABORT")
2396     LROffset = 4;
2397   else if (IntKind == "SWI" || IntKind == "UNDEF")
2398     LROffset = 0;
2399   else
2400     report_fatal_error("Unsupported interrupt attribute. If present, value "
2401                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2402 
2403   RetOps.insert(RetOps.begin() + 1,
2404                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2405 
2406   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2407 }
2408 
2409 SDValue
2410 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2411                                bool isVarArg,
2412                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2413                                const SmallVectorImpl<SDValue> &OutVals,
2414                                const SDLoc &dl, SelectionDAG &DAG) const {
2415 
2416   // CCValAssign - represent the assignment of the return value to a location.
2417   SmallVector<CCValAssign, 16> RVLocs;
2418 
2419   // CCState - Info about the registers and stack slots.
2420   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2421                     *DAG.getContext(), Call);
2422 
2423   // Analyze outgoing return values.
2424   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2425 
2426   SDValue Flag;
2427   SmallVector<SDValue, 4> RetOps;
2428   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2429   bool isLittleEndian = Subtarget->isLittle();
2430 
2431   MachineFunction &MF = DAG.getMachineFunction();
2432   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2433   AFI->setReturnRegsCount(RVLocs.size());
2434 
2435   // Copy the result values into the output registers.
2436   for (unsigned i = 0, realRVLocIdx = 0;
2437        i != RVLocs.size();
2438        ++i, ++realRVLocIdx) {
2439     CCValAssign &VA = RVLocs[i];
2440     assert(VA.isRegLoc() && "Can only return in registers!");
2441 
2442     SDValue Arg = OutVals[realRVLocIdx];
2443 
2444     switch (VA.getLocInfo()) {
2445     default: llvm_unreachable("Unknown loc info!");
2446     case CCValAssign::Full: break;
2447     case CCValAssign::BCvt:
2448       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2449       break;
2450     }
2451 
2452     if (VA.needsCustom()) {
2453       if (VA.getLocVT() == MVT::v2f64) {
2454         // Extract the first half and return it in two registers.
2455         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2456                                    DAG.getConstant(0, dl, MVT::i32));
2457         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2458                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2459 
2460         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2461                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
2462                                  Flag);
2463         Flag = Chain.getValue(1);
2464         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2465         VA = RVLocs[++i]; // skip ahead to next loc
2466         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2467                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
2468                                  Flag);
2469         Flag = Chain.getValue(1);
2470         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2471         VA = RVLocs[++i]; // skip ahead to next loc
2472 
2473         // Extract the 2nd half and fall through to handle it as an f64 value.
2474         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2475                           DAG.getConstant(1, dl, MVT::i32));
2476       }
2477       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2478       // available.
2479       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2480                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
2481       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2482                                fmrrd.getValue(isLittleEndian ? 0 : 1),
2483                                Flag);
2484       Flag = Chain.getValue(1);
2485       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2486       VA = RVLocs[++i]; // skip ahead to next loc
2487       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2488                                fmrrd.getValue(isLittleEndian ? 1 : 0),
2489                                Flag);
2490     } else
2491       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2492 
    // Glue all emitted copies together so they stay adjacent to one another
    // and to the return node, rather than being scheduled apart.
2495     Flag = Chain.getValue(1);
2496     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2497   }
2498   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2499   const MCPhysReg *I =
2500       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2501   if (I) {
2502     for (; *I; ++I) {
2503       if (ARM::GPRRegClass.contains(*I))
2504         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2505       else if (ARM::DPRRegClass.contains(*I))
2506         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
2507       else
2508         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2509     }
2510   }
2511 
2512   // Update chain and glue.
2513   RetOps[0] = Chain;
2514   if (Flag.getNode())
2515     RetOps.push_back(Flag);
2516 
2517   // CPUs which aren't M-class use a special sequence to return from
2518   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
2519   // though we use "subs pc, lr, #N").
2520   //
2521   // M-class CPUs actually use a normal return sequence with a special
2522   // (hardware-provided) value in LR, so the normal code path works.
2523   if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
2524       !Subtarget->isMClass()) {
2525     if (Subtarget->isThumb1Only())
2526       report_fatal_error("interrupt attribute is not supported in Thumb1");
2527     return LowerInterruptReturn(RetOps, dl, DAG);
2528   }
2529 
2530   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
2531 }
2532 
2533 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2534   if (N->getNumValues() != 1)
2535     return false;
2536   if (!N->hasNUsesOfValue(1, 0))
2537     return false;
2538 
2539   SDValue TCChain = Chain;
2540   SDNode *Copy = *N->use_begin();
2541   if (Copy->getOpcode() == ISD::CopyToReg) {
2542     // If the copy has a glue operand, we conservatively assume it isn't safe to
2543     // perform a tail call.
2544     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2545       return false;
2546     TCChain = Copy->getOperand(0);
2547   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2548     SDNode *VMov = Copy;
2549     // f64 returned in a pair of GPRs.
2550     SmallPtrSet<SDNode*, 2> Copies;
2551     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2552          UI != UE; ++UI) {
2553       if (UI->getOpcode() != ISD::CopyToReg)
2554         return false;
2555       Copies.insert(*UI);
2556     }
2557     if (Copies.size() > 2)
2558       return false;
2559 
2560     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2561          UI != UE; ++UI) {
2562       SDValue UseChain = UI->getOperand(0);
2563       if (Copies.count(UseChain.getNode()))
2564         // Second CopyToReg
2565         Copy = *UI;
2566       else {
2567         // We are at the top of this chain.
2568         // If the copy has a glue operand, we conservatively assume it
2569         // isn't safe to perform a tail call.
2570         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2571           return false;
2572         // First CopyToReg
2573         TCChain = UseChain;
2574       }
2575     }
2576   } else if (Copy->getOpcode() == ISD::BITCAST) {
2577     // f32 returned in a single GPR.
2578     if (!Copy->hasOneUse())
2579       return false;
2580     Copy = *Copy->use_begin();
2581     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2582       return false;
2583     // If the copy has a glue operand, we conservatively assume it isn't safe to
2584     // perform a tail call.
2585     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2586       return false;
2587     TCChain = Copy->getOperand(0);
2588   } else {
2589     return false;
2590   }
2591 
2592   bool HasRet = false;
2593   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2594        UI != UE; ++UI) {
2595     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2596         UI->getOpcode() != ARMISD::INTRET_FLAG)
2597       return false;
2598     HasRet = true;
2599   }
2600 
2601   if (!HasRet)
2602     return false;
2603 
2604   Chain = TCChain;
2605   return true;
2606 }
2607 
2608 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2609   if (!Subtarget->supportsTailCall())
2610     return false;
2611 
2612   auto Attr =
2613       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2614   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2615     return false;
2616 
2617   return true;
2618 }
2619 
// Writing a 64-bit value, so we need to split it into two 32-bit values first
// and pass the low and high parts through.
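//
// As a sketch of the input shape (the register-name metadata below is purely
// illustrative):
//   call void @llvm.write_register.i64(metadata !"reg", i64 %val)
// arrives here as an ISD::WRITE_REGISTER node with an i64 value operand and is
// rebuilt as the same node taking the low and high i32 halves separately.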
2622 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2623   SDLoc DL(Op);
2624   SDValue WriteValue = Op->getOperand(2);
2625 
  // This function is only supposed to be called for an i64-typed argument.
2627   assert(WriteValue.getValueType() == MVT::i64
2628           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2629 
2630   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2631                            DAG.getConstant(0, DL, MVT::i32));
2632   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2633                            DAG.getConstant(1, DL, MVT::i32));
2634   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2635   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2636 }
2637 
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
2644 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2645   EVT PtrVT = Op.getValueType();
2646   // FIXME there is no actual debug info here
2647   SDLoc dl(Op);
2648   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2649   SDValue Res;
2650   if (CP->isMachineConstantPoolEntry())
2651     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2652                                     CP->getAlignment());
2653   else
2654     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2655                                     CP->getAlignment());
2656   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2657 }
2658 
2659 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2660   return MachineJumpTableInfo::EK_Inline;
2661 }
2662 
2663 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2664                                              SelectionDAG &DAG) const {
2665   MachineFunction &MF = DAG.getMachineFunction();
2666   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2667   unsigned ARMPCLabelIndex = 0;
2668   SDLoc DL(Op);
2669   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2670   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2671   SDValue CPAddr;
2672   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
2673   if (!IsPositionIndependent) {
2674     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2675   } else {
2676     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2677     ARMPCLabelIndex = AFI->createPICLabelUId();
2678     ARMConstantPoolValue *CPV =
2679       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2680                                       ARMCP::CPBlockAddress, PCAdj);
2681     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2682   }
2683   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2684   SDValue Result = DAG.getLoad(
2685       PtrVT, DL, DAG.getEntryNode(), CPAddr,
2686       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2687   if (!IsPositionIndependent)
2688     return Result;
2689   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2690   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2691 }
2692 
/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
///
2697 /// Darwin only has one TLS scheme which must be capable of dealing with the
2698 /// fully general situation, in the worst case. This means:
2699 ///     + "extern __thread" declaration.
2700 ///     + Defined in a possibly unknown dynamic library.
2701 ///
2702 /// The general system is that each __thread variable has a [3 x i32] descriptor
2703 /// which contains information used by the runtime to calculate the address. The
2704 /// only part of this the compiler needs to know about is the first word, which
2705 /// contains a function pointer that must be called with the address of the
2706 /// entire descriptor in "r0".
2707 ///
2708 /// Since this descriptor may be in a different unit, in general access must
2709 /// proceed along the usual ARM rules. A common sequence to produce is:
2710 ///
2711 ///     movw rT1, :lower16:_var$non_lazy_ptr
2712 ///     movt rT1, :upper16:_var$non_lazy_ptr
2713 ///     ldr r0, [rT1]
2714 ///     ldr rT2, [r0]
2715 ///     blx rT2
2716 ///     [...address now in r0...]
2717 SDValue
2718 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
2719                                                SelectionDAG &DAG) const {
2720   assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
2721   SDLoc DL(Op);
2722 
  // The first step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
2725   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
2726 
2727   // The first entry in the descriptor is a function pointer that we must call
2728   // to obtain the address of the variable.
2729   SDValue Chain = DAG.getEntryNode();
2730   SDValue FuncTLVGet = DAG.getLoad(
2731       MVT::i32, DL, Chain, DescAddr,
2732       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2733       /* Alignment = */ 4,
2734       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
2735           MachineMemOperand::MOInvariant);
2736   Chain = FuncTLVGet.getValue(1);
2737 
2738   MachineFunction &F = DAG.getMachineFunction();
2739   MachineFrameInfo &MFI = F.getFrameInfo();
2740   MFI.setAdjustsStack(true);
2741 
2742   // TLS calls preserve all registers except those that absolutely must be
2743   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
2744   // silly).
2745   auto TRI =
2746       getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
2747   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
2748   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
2749 
  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and the call
  // returns the address of the variable in this thread.
2753   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
2754   Chain =
2755       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
2756                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
2757                   DAG.getRegisterMask(Mask), Chain.getValue(1));
2758   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
2759 }
2760 
2761 SDValue
2762 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
2763                                                 SelectionDAG &DAG) const {
2764   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
2765 
2766   SDValue Chain = DAG.getEntryNode();
2767   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2768   SDLoc DL(Op);
2769 
2770   // Load the current TEB (thread environment block)
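  // The intrinsic operands below (p15, opc1 = 0, CRn = c13, CRm = c0,
  // opc2 = 2) select the CP15 software thread ID register (conventionally
  // TPIDRURW), which Windows uses to hold a pointer to the TEB; the resulting
  // instruction looks roughly like "mrc p15, #0, rN, c13, c0, #2".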
2771   SDValue Ops[] = {Chain,
2772                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
2773                    DAG.getConstant(15, DL, MVT::i32),
2774                    DAG.getConstant(0, DL, MVT::i32),
2775                    DAG.getConstant(13, DL, MVT::i32),
2776                    DAG.getConstant(0, DL, MVT::i32),
2777                    DAG.getConstant(2, DL, MVT::i32)};
2778   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
2779                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
2780 
2781   SDValue TEB = CurrentTEB.getValue(0);
2782   Chain = CurrentTEB.getValue(1);
2783 
2784   // Load the ThreadLocalStoragePointer from the TEB
2785   // A pointer to the TLS array is located at offset 0x2c from the TEB.
2786   SDValue TLSArray =
2787       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
2788   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
2789 
  // The pointer to this thread's TLS data area is found at the TLS index,
  // scaled by 4 (the size of a pointer), into the TLS array.
2792 
2793   // Load the TLS index from the C runtime
2794   SDValue TLSIndex =
2795       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
2796   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
2797   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
2798 
2799   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
2800                               DAG.getConstant(2, DL, MVT::i32));
2801   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
2802                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
2803                             MachinePointerInfo());
2804 
2805   // Get the offset of the start of the .tls section (section base)
2806   const auto *GA = cast<GlobalAddressSDNode>(Op);
2807   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
2808   SDValue Offset = DAG.getLoad(
2809       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
2810                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
2811       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2812 
2813   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
2814 }
2815 
2816 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
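//
// Roughly speaking (modulo the exact relocation spelling and scheduling), the
// emitted sequence for a variable "x" looks like:
//   ldr r0, .LCPIn          @ pool entry holds x(TLSGD) minus a pc-label offset
//   add r0, pc, r0          @ ARMISD::PIC_ADD
//   bl  __tls_get_addr      @ returns the address of x in r0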
2817 SDValue
2818 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2819                                                  SelectionDAG &DAG) const {
2820   SDLoc dl(GA);
2821   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2822   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2823   MachineFunction &MF = DAG.getMachineFunction();
2824   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2825   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2826   ARMConstantPoolValue *CPV =
2827     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2828                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2829   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2830   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2831   Argument = DAG.getLoad(
2832       PtrVT, dl, DAG.getEntryNode(), Argument,
2833       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2834   SDValue Chain = Argument.getValue(1);
2835 
2836   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2837   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2838 
2839   // call __tls_get_addr.
2840   ArgListTy Args;
2841   ArgListEntry Entry;
2842   Entry.Node = Argument;
2843   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2844   Args.push_back(Entry);
2845 
2846   // FIXME: is there useful debug info available here?
2847   TargetLowering::CallLoweringInfo CLI(DAG);
2848   CLI.setDebugLoc(dl).setChain(Chain)
2849     .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
2850                DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
2851 
2852   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2853   return CallResult.first;
2854 }
2855 
2856 // Lower ISD::GlobalTLSAddress using the "initial exec" or
2857 // "local exec" model.
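//
// Both models compute the address as thread-pointer + offset. In the initial
// exec model the offset is loaded indirectly through a GOT entry (the
// GOTTPOFF-style constant pool value below), costing one extra load; in the
// local exec model the offset (TPOFF) is a link-time constant loaded directly
// from the constant pool. This is a sketch of the intent rather than of the
// exact relocation names.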
2858 SDValue
2859 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2860                                         SelectionDAG &DAG,
2861                                         TLSModel::Model model) const {
2862   const GlobalValue *GV = GA->getGlobal();
2863   SDLoc dl(GA);
2864   SDValue Offset;
2865   SDValue Chain = DAG.getEntryNode();
2866   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2867   // Get the Thread Pointer
2868   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2869 
2870   if (model == TLSModel::InitialExec) {
2871     MachineFunction &MF = DAG.getMachineFunction();
2872     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2873     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2874     // Initial exec model.
2875     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2876     ARMConstantPoolValue *CPV =
2877       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2878                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2879                                       true);
2880     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2881     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2882     Offset = DAG.getLoad(
2883         PtrVT, dl, Chain, Offset,
2884         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2885     Chain = Offset.getValue(1);
2886 
2887     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2888     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2889 
2890     Offset = DAG.getLoad(
2891         PtrVT, dl, Chain, Offset,
2892         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2893   } else {
2894     // local exec model
2895     assert(model == TLSModel::LocalExec);
2896     ARMConstantPoolValue *CPV =
2897       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2898     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2899     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2900     Offset = DAG.getLoad(
2901         PtrVT, dl, Chain, Offset,
2902         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2903   }
2904 
2905   // The address of the thread local variable is the add of the thread
2906   // pointer with the offset of the variable.
2907   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2908 }
2909 
2910 SDValue
2911 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2912   if (Subtarget->isTargetDarwin())
2913     return LowerGlobalTLSAddressDarwin(Op, DAG);
2914 
2915   if (Subtarget->isTargetWindows())
2916     return LowerGlobalTLSAddressWindows(Op, DAG);
2917 
2918   // TODO: implement the "local dynamic" model
2919   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
2920   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2921   if (DAG.getTarget().Options.EmulatedTLS)
2922     return LowerToTLSEmulatedModel(GA, DAG);
2923 
2924   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2925 
2926   switch (model) {
2927     case TLSModel::GeneralDynamic:
2928     case TLSModel::LocalDynamic:
2929       return LowerToTLSGeneralDynamicModel(GA, DAG);
2930     case TLSModel::InitialExec:
2931     case TLSModel::LocalExec:
2932       return LowerToTLSExecModels(GA, DAG, model);
2933   }
2934   llvm_unreachable("bogus TLS model");
2935 }
2936 
2937 /// Return true if all users of V are within function F, looking through
2938 /// ConstantExprs.
2939 static bool allUsersAreInFunction(const Value *V, const Function *F) {
2940   SmallVector<const User*,4> Worklist;
2941   for (auto *U : V->users())
2942     Worklist.push_back(U);
2943   while (!Worklist.empty()) {
2944     auto *U = Worklist.pop_back_val();
2945     if (isa<ConstantExpr>(U)) {
2946       for (auto *UU : U->users())
2947         Worklist.push_back(UU);
2948       continue;
2949     }
2950 
2951     auto *I = dyn_cast<Instruction>(U);
2952     if (!I || I->getParent()->getParent() != F)
2953       return false;
2954   }
2955   return true;
2956 }
2957 
/// Return true if all users of V are within some (any) function, looking
/// through ConstantExprs. In other words, return false if V has any users
/// that are not instructions (e.g. global constant initializers).
2960 static bool allUsersAreInFunctions(const Value *V) {
2961   SmallVector<const User*,4> Worklist;
2962   for (auto *U : V->users())
2963     Worklist.push_back(U);
2964   while (!Worklist.empty()) {
2965     auto *U = Worklist.pop_back_val();
2966     if (isa<ConstantExpr>(U)) {
2967       for (auto *UU : U->users())
2968         Worklist.push_back(UU);
2969       continue;
2970     }
2971 
2972     if (!isa<Instruction>(U))
2973       return false;
2974   }
2975   return true;
2976 }
2977 
2978 // Return true if T is an integer, float or an array/vector of either.
2979 static bool isSimpleType(Type *T) {
2980   if (T->isIntegerTy() || T->isFloatingPointTy())
2981     return true;
2982   Type *SubT = nullptr;
2983   if (T->isArrayTy())
2984     SubT = T->getArrayElementType();
2985   else if (T->isVectorTy())
2986     SubT = T->getVectorElementType();
2987   else
2988     return false;
2989   return SubT->isIntegerTy() || SubT->isFloatingPointTy();
2990 }
2991 
2992 static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
2993                                      EVT PtrVT, const SDLoc &dl) {
2994   // If we're creating a pool entry for a constant global with unnamed address,
2995   // and the global is small enough, we can emit it inline into the constant pool
2996   // to save ourselves an indirection.
2997   //
2998   // This is a win if the constant is only used in one function (so it doesn't
2999   // need to be duplicated) or duplicating the constant wouldn't increase code
3000   // size (implying the constant is no larger than 4 bytes).
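  //
  // For illustration (not a strict guarantee): a small internal constant
  // string used by a single function can be placed directly in that function's
  // constant island, so the code does one pc-relative load of the data rather
  // than first loading the string's address and then the bytes behind it.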
3001   const Function *F = DAG.getMachineFunction().getFunction();
3002 
  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled;
  // otherwise we could decide to inline here (and thus never emit the GV) while
  // fast-isel generated code still requires the GV.
3009   if (!EnableConstpoolPromotion ||
3010       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3011       return SDValue();
3012 
3013   auto *GVar = dyn_cast<GlobalVariable>(GV);
3014   if (!GVar || !GVar->hasInitializer() ||
3015       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3016       !GVar->hasLocalLinkage())
3017     return SDValue();
3018 
  // Ensure that we don't try to inline any type that contains pointers. If
  // we inline a value that contains relocations, we move the relocations from
  // .data to .text, which is not ideal.
3022   auto *Init = GVar->getInitializer();
3023   if (!isSimpleType(Init->getType()))
3024     return SDValue();
3025 
  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type that requires alignment greater than 4 bytes. We can also only
  // promote constants that are a multiple of 4 bytes in size, or that can be
  // padded to a multiple of 4. Currently we only try to pad constants that
  // are strings, for simplicity.
3032   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3033   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3034   unsigned Align = GVar->getAlignment();
3035   unsigned RequiredPadding = 4 - (Size % 4);
3036   bool PaddingPossible =
3037     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3038   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
3039     return SDValue();
3040 
3041   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3042   MachineFunction &MF = DAG.getMachineFunction();
3043   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3044 
3045   // We can't bloat the constant pool too much, else the ConstantIslands pass
3046   // may fail to converge. If we haven't promoted this global yet (it may have
3047   // multiple uses), and promoting it would increase the constant pool size (Sz
3048   // > 4), ensure we have space to do so up to MaxTotal.
3049   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3050     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3051         ConstpoolPromotionMaxTotal)
3052       return SDValue();
3053 
  // This is only valid if all users are in a single function, OR it has users
  // in multiple functions but is no larger than a pointer. We also check
  // whether GVar has constant (non-ConstantExpr) users; if so, it essentially
  // has its address taken.
3058   if (!allUsersAreInFunction(GVar, F) &&
3059       !(Size <= 4 && allUsersAreInFunctions(GVar)))
3060     return SDValue();
3061 
3062   // We're going to inline this global. Pad it out if needed.
3063   if (RequiredPadding != 4) {
3064     StringRef S = CDAInit->getAsString();
3065 
3066     SmallVector<uint8_t,16> V(S.size());
3067     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3068     while (RequiredPadding--)
3069       V.push_back(0);
3070     Init = ConstantDataArray::get(*DAG.getContext(), V);
3071   }
3072 
3073   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3074   SDValue CPAddr =
3075     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
3076   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3077     AFI->markGlobalAsPromotedToConstantPool(GVar);
3078     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3079                                       PaddedSize - 4);
3080   }
3081   ++NumConstpoolPromoted;
3082   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3083 }
3084 
3085 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3086                                                  SelectionDAG &DAG) const {
3087   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3088   SDLoc dl(Op);
3089   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3090   const TargetMachine &TM = getTargetMachine();
3091   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3092     GV = GA->getBaseObject();
3093   bool IsRO =
3094       (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
3095       isa<Function>(GV);
3096 
  // Only try promoteToConstantPool when we are not generating an execute-only
  // (XO) text section.
3098   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3099     if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
3100       return V;
3101 
3102   if (isPositionIndependent()) {
3103     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3104 
3105     MachineFunction &MF = DAG.getMachineFunction();
3106     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3107     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3108     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3109     SDLoc dl(Op);
3110     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3111     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
3112         GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
3113         UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
3114         /*AddCurrentAddress=*/UseGOT_PREL);
3115     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3116     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3117     SDValue Result = DAG.getLoad(
3118         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3119         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3120     SDValue Chain = Result.getValue(1);
3121     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3122     Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3123     if (UseGOT_PREL)
3124       Result =
3125           DAG.getLoad(PtrVT, dl, Chain, Result,
3126                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3127     return Result;
3128   } else if (Subtarget->isROPI() && IsRO) {
3129     // PC-relative.
3130     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3131     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3132     return Result;
3133   } else if (Subtarget->isRWPI() && !IsRO) {
3134     // SB-relative.
3135     ARMConstantPoolValue *CPV =
3136       ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3137     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3138     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3139     SDValue G = DAG.getLoad(
3140         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3141         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3142     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3143     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
3144     return Result;
3145   }
3146 
  // If the subtarget supports movw/movt, we can materialize the address
  // directly via a movw/movt pair. This is always cheaper than loading it from
  // the constant pool.
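  //
  // Illustrative output (using the standard :lower16:/:upper16: operators):
  //   movw r0, :lower16:some_global
  //   movt r0, :upper16:some_global
  // versus a pc-relative load from a nearby constant pool entry otherwise.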
3149   if (Subtarget->useMovt(DAG.getMachineFunction())) {
3150     ++NumMovwMovt;
3151     // FIXME: Once remat is capable of dealing with instructions with register
3152     // operands, expand this into two nodes.
3153     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3154                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3155   } else {
3156     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
3157     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3158     return DAG.getLoad(
3159         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3160         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3161   }
3162 }
3163 
3164 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3165                                                     SelectionDAG &DAG) const {
3166   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3167          "ROPI/RWPI not currently supported for Darwin");
3168   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3169   SDLoc dl(Op);
3170   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3171 
3172   if (Subtarget->useMovt(DAG.getMachineFunction()))
3173     ++NumMovwMovt;
3174 
3175   // FIXME: Once remat is capable of dealing with instructions with register
3176   // operands, expand this into multiple nodes
3177   unsigned Wrapper =
3178       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3179 
3180   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3181   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3182 
3183   if (Subtarget->isGVIndirectSymbol(GV))
3184     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3185                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3186   return Result;
3187 }
3188 
3189 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3190                                                      SelectionDAG &DAG) const {
3191   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3192   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
3193          "Windows on ARM expects to use movw/movt");
3194   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3195          "ROPI/RWPI not currently supported for Windows");
3196 
3197   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3198   const ARMII::TOF TargetFlags =
3199     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
3200   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3201   SDValue Result;
3202   SDLoc DL(Op);
3203 
3204   ++NumMovwMovt;
3205 
3206   // FIXME: Once remat is capable of dealing with instructions with register
3207   // operands, expand this into two nodes.
3208   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3209                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
3210                                                   TargetFlags));
3211   if (GV->hasDLLImportStorageClass())
3212     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3213                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3214   return Result;
3215 }
3216 
3217 SDValue
3218 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3219   SDLoc dl(Op);
3220   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3221   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3222                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3223                      Op.getOperand(1), Val);
3224 }
3225 
3226 SDValue
3227 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3228   SDLoc dl(Op);
3229   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3230                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3231 }
3232 
3233 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3234                                                       SelectionDAG &DAG) const {
3235   SDLoc dl(Op);
3236   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3237                      Op.getOperand(0));
3238 }
3239 
3240 SDValue
3241 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3242                                           const ARMSubtarget *Subtarget) const {
3243   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3244   SDLoc dl(Op);
3245   switch (IntNo) {
3246   default: return SDValue();    // Don't custom lower most intrinsics.
3247   case Intrinsic::thread_pointer: {
3248     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3249     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3250   }
3251   case Intrinsic::eh_sjlj_lsda: {
3252     MachineFunction &MF = DAG.getMachineFunction();
3253     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3254     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3255     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3256     SDValue CPAddr;
3257     bool IsPositionIndependent = isPositionIndependent();
3258     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3259     ARMConstantPoolValue *CPV =
3260       ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
3261                                       ARMCP::CPLSDA, PCAdj);
3262     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3263     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3264     SDValue Result = DAG.getLoad(
3265         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3266         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3267 
3268     if (IsPositionIndependent) {
3269       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3270       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3271     }
3272     return Result;
3273   }
3274   case Intrinsic::arm_neon_vmulls:
3275   case Intrinsic::arm_neon_vmullu: {
3276     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3277       ? ARMISD::VMULLs : ARMISD::VMULLu;
3278     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3279                        Op.getOperand(1), Op.getOperand(2));
3280   }
3281   case Intrinsic::arm_neon_vminnm:
3282   case Intrinsic::arm_neon_vmaxnm: {
3283     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3284       ? ISD::FMINNUM : ISD::FMAXNUM;
3285     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3286                        Op.getOperand(1), Op.getOperand(2));
3287   }
3288   case Intrinsic::arm_neon_vminu:
3289   case Intrinsic::arm_neon_vmaxu: {
3290     if (Op.getValueType().isFloatingPoint())
3291       return SDValue();
3292     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3293       ? ISD::UMIN : ISD::UMAX;
3294     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3295                          Op.getOperand(1), Op.getOperand(2));
3296   }
3297   case Intrinsic::arm_neon_vmins:
3298   case Intrinsic::arm_neon_vmaxs: {
3299     // v{min,max}s is overloaded between signed integers and floats.
3300     if (!Op.getValueType().isFloatingPoint()) {
3301       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3302         ? ISD::SMIN : ISD::SMAX;
3303       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3304                          Op.getOperand(1), Op.getOperand(2));
3305     }
3306     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3307       ? ISD::FMINNAN : ISD::FMAXNAN;
3308     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3309                        Op.getOperand(1), Op.getOperand(2));
3310   }
3311   }
3312 }
3313 
3314 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3315                                  const ARMSubtarget *Subtarget) {
3316   // FIXME: handle "fence singlethread" more efficiently.
3317   SDLoc dl(Op);
3318   if (!Subtarget->hasDataBarrier()) {
3319     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3320     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3321     // here.
3322     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3323            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3324     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3325                        DAG.getConstant(0, dl, MVT::i32));
3326   }
3327 
3328   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3329   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3330   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3331   if (Subtarget->isMClass()) {
3332     // Only a full system barrier exists in the M-class architectures.
3333     Domain = ARM_MB::SY;
3334   } else if (Subtarget->preferISHSTBarriers() &&
3335              Ord == AtomicOrdering::Release) {
3336     // Swift happens to implement ISHST barriers in a way that's compatible with
3337     // Release semantics but weaker than ISH so we'd be fools not to use
3338     // it. Beware: other processors probably don't!
3339     Domain = ARM_MB::ISHST;
3340   }
3341 
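  // The fence becomes a call to the @llvm.arm.dmb intrinsic with the chosen
  // domain, which is expected to select to a single DMB instruction (e.g.
  // "dmb ish", "dmb ishst" or "dmb sy" for the cases above).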
3342   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3343                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3344                      DAG.getConstant(Domain, dl, MVT::i32));
3345 }
3346 
3347 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
3348                              const ARMSubtarget *Subtarget) {
  // ARM prior to v5TE and Thumb1 do not have preload instructions.
3350   if (!(Subtarget->isThumb2() ||
3351         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
3352     // Just preserve the chain.
3353     return Op.getOperand(0);
3354 
3355   SDLoc dl(Op);
3356   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
3357   if (!isRead &&
3358       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
3359     // ARMv7 with MP extension has PLDW.
3360     return Op.getOperand(0);
3361 
3362   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3363   if (Subtarget->isThumb()) {
3364     // Invert the bits.
3365     isRead = ~isRead & 1;
3366     isData = ~isData & 1;
3367   }
3368 
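  // Emit the target preload node; depending on the operands this is expected
  // to select to PLD (data read), PLDW (data write, requires v7 with the MP
  // extension) or PLI (instruction prefetch).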
3369   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
3370                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
3371                      DAG.getConstant(isData, dl, MVT::i32));
3372 }
3373 
3374 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3375   MachineFunction &MF = DAG.getMachineFunction();
3376   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3377 
3378   // vastart just stores the address of the VarArgsFrameIndex slot into the
3379   // memory location argument.
3380   SDLoc dl(Op);
3381   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3382   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3383   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3384   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3385                       MachinePointerInfo(SV));
3386 }
3387 
3388 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
3389                                                 CCValAssign &NextVA,
3390                                                 SDValue &Root,
3391                                                 SelectionDAG &DAG,
3392                                                 const SDLoc &dl) const {
3393   MachineFunction &MF = DAG.getMachineFunction();
3394   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3395 
3396   const TargetRegisterClass *RC;
3397   if (AFI->isThumb1OnlyFunction())
3398     RC = &ARM::tGPRRegClass;
3399   else
3400     RC = &ARM::GPRRegClass;
3401 
3402   // Transform the arguments stored in physical registers into virtual ones.
3403   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3404   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3405 
3406   SDValue ArgValue2;
3407   if (NextVA.isMemLoc()) {
3408     MachineFrameInfo &MFI = MF.getFrameInfo();
3409     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
3410 
3411     // Create load node to retrieve arguments from the stack.
3412     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3413     ArgValue2 = DAG.getLoad(
3414         MVT::i32, dl, Root, FIN,
3415         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3416   } else {
3417     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3418     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3419   }
3420   if (!Subtarget->isLittle())
3421     std::swap (ArgValue, ArgValue2);
3422   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
3423 }
3424 
3425 // The remaining GPRs hold either the beginning of variable-argument
3426 // data, or the beginning of an aggregate passed by value (usually
3427 // byval).  Either way, we allocate stack slots adjacent to the data
3428 // provided by our caller, and store the unallocated registers there.
3429 // If this is a variadic function, the va_list pointer will begin with
3430 // these values; otherwise, this reassembles a (byval) structure that
3431 // was split between registers and memory.
// Return: the frame index that the registers were stored into.
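//
// For illustration: in a variadic function whose fixed arguments occupy only
// r0 and r1, the remaining r2 and r3 are spilled here to stack slots placed
// immediately below the caller-provided stack arguments, so va_arg can walk
// every argument as one contiguous area.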
3433 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
3434                                       const SDLoc &dl, SDValue &Chain,
3435                                       const Value *OrigArg,
3436                                       unsigned InRegsParamRecordIdx,
3437                                       int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases are possible:
  // Case #1. Non-var-args function, and we meet the first byval parameter.
  //          Set up the first unallocated register as the first byval register
  //          and eat all remaining registers
  //          (these two actions are performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers and
  //          initialize the stack frame.
3448 
3449   MachineFunction &MF = DAG.getMachineFunction();
3450   MachineFrameInfo &MFI = MF.getFrameInfo();
3451   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3452   unsigned RBegin, REnd;
3453   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
3454     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
3455   } else {
3456     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3457     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
3458     REnd = ARM::R4;
3459   }
3460 
3461   if (REnd != RBegin)
3462     ArgOffset = -4 * (ARM::R4 - RBegin);
3463 
3464   auto PtrVT = getPointerTy(DAG.getDataLayout());
3465   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
3466   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
3467 
3468   SmallVector<SDValue, 4> MemOps;
3469   const TargetRegisterClass *RC =
3470       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
3471 
3472   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3473     unsigned VReg = MF.addLiveIn(Reg, RC);
3474     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3475     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
3476                                  MachinePointerInfo(OrigArg, 4 * i));
3477     MemOps.push_back(Store);
3478     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3479   }
3480 
3481   if (!MemOps.empty())
3482     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3483   return FrameIndex;
3484 }
3485 
// Set up the stack frame that the va_list pointer will start from.
3487 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3488                                              const SDLoc &dl, SDValue &Chain,
3489                                              unsigned ArgOffset,
3490                                              unsigned TotalArgRegsSaveSize,
3491                                              bool ForceMutable) const {
3492   MachineFunction &MF = DAG.getMachineFunction();
3493   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3494 
  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address past the last
  // argument passed on the stack.
3500   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3501                                   CCInfo.getInRegsParamsCount(),
3502                                   CCInfo.getNextStackOffset(), 4);
3503   AFI->setVarArgsFrameIndex(FrameIndex);
3504 }
3505 
3506 SDValue ARMTargetLowering::LowerFormalArguments(
3507     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3508     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3509     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3510   MachineFunction &MF = DAG.getMachineFunction();
3511   MachineFrameInfo &MFI = MF.getFrameInfo();
3512 
3513   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3514 
3515   // Assign locations to all of the incoming arguments.
3516   SmallVector<CCValAssign, 16> ArgLocs;
3517   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3518                     *DAG.getContext(), Prologue);
3519   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
3520 
3521   SmallVector<SDValue, 16> ArgValues;
3522   SDValue ArgValue;
3523   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
3524   unsigned CurArgIdx = 0;
3525 
  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet a byval parameter.
  // We also increase this value in the case of a varargs function.
3529   AFI->setArgRegsSaveSize(0);
3530 
3531   // Calculate the amount of stack space that we need to allocate to store
3532   // byval and variadic arguments that are passed in registers.
3533   // We need to know this before we allocate the first byval or variadic
3534   // argument, as they will be allocated a stack slot below the CFA (Canonical
3535   // Frame Address, the stack pointer at entry to the function).
3536   unsigned ArgRegBegin = ARM::R4;
3537   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3538     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3539       break;
3540 
3541     CCValAssign &VA = ArgLocs[i];
3542     unsigned Index = VA.getValNo();
3543     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3544     if (!Flags.isByVal())
3545       continue;
3546 
3547     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3548     unsigned RBegin, REnd;
3549     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3550     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3551 
3552     CCInfo.nextInRegsParam();
3553   }
3554   CCInfo.rewindByValRegsInfo();
3555 
3556   int lastInsIndex = -1;
3557   if (isVarArg && MFI.hasVAStart()) {
3558     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3559     if (RegIdx != array_lengthof(GPRArgRegs))
3560       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3561   }
3562 
3563   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3564   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3565   auto PtrVT = getPointerTy(DAG.getDataLayout());
3566 
3567   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3568     CCValAssign &VA = ArgLocs[i];
3569     if (Ins[VA.getValNo()].isOrigArg()) {
3570       std::advance(CurOrigArg,
3571                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3572       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3573     }
3574     // Arguments stored in registers.
3575     if (VA.isRegLoc()) {
3576       EVT RegVT = VA.getLocVT();
3577 
3578       if (VA.needsCustom()) {
3579         // f64 and vector types are split up into multiple registers or
3580         // combinations of registers and stack slots.
3581         if (VA.getLocVT() == MVT::v2f64) {
3582           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3583                                                    Chain, DAG, dl);
3584           VA = ArgLocs[++i]; // skip ahead to next loc
3585           SDValue ArgValue2;
3586           if (VA.isMemLoc()) {
3587             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
3588             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3589             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
3590                                     MachinePointerInfo::getFixedStack(
3591                                         DAG.getMachineFunction(), FI));
3592           } else {
3593             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3594                                              Chain, DAG, dl);
3595           }
3596           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3597           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3598                                  ArgValue, ArgValue1,
3599                                  DAG.getIntPtrConstant(0, dl));
3600           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3601                                  ArgValue, ArgValue2,
3602                                  DAG.getIntPtrConstant(1, dl));
3603         } else
3604           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3605 
3606       } else {
3607         const TargetRegisterClass *RC;
3608 
3609         if (RegVT == MVT::f32)
3610           RC = &ARM::SPRRegClass;
3611         else if (RegVT == MVT::f64)
3612           RC = &ARM::DPRRegClass;
3613         else if (RegVT == MVT::v2f64)
3614           RC = &ARM::QPRRegClass;
3615         else if (RegVT == MVT::i32)
3616           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3617                                            : &ARM::GPRRegClass;
3618         else
3619           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3620 
3621         // Transform the arguments in physical registers into virtual ones.
3622         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3623         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3624       }
3625 
3626       // If this is an 8 or 16-bit value, it is really passed promoted
3627       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3628       // truncate to the right size.
3629       switch (VA.getLocInfo()) {
3630       default: llvm_unreachable("Unknown loc info!");
3631       case CCValAssign::Full: break;
3632       case CCValAssign::BCvt:
3633         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3634         break;
3635       case CCValAssign::SExt:
3636         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3637                                DAG.getValueType(VA.getValVT()));
3638         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3639         break;
3640       case CCValAssign::ZExt:
3641         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3642                                DAG.getValueType(VA.getValVT()));
3643         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3644         break;
3645       }
3646 
3647       InVals.push_back(ArgValue);
3648 
    } else { // !VA.isRegLoc()
3650       // sanity check
3651       assert(VA.isMemLoc());
3652       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3653 
3654       int index = VA.getValNo();
3655 
3656       // Some Ins[] entries become multiple ArgLoc[] entries.
3657       // Process them only once.
3658       if (index != lastInsIndex)
3659         {
3660           ISD::ArgFlagsTy Flags = Ins[index].Flags;
          // FIXME: For now, all byval parameter objects are marked mutable.
          // This can be changed with more analysis.
          // In the case of tail call optimization, mark all arguments mutable,
          // since they could be overwritten by the lowering of arguments for a
          // tail call.
3666           if (Flags.isByVal()) {
3667             assert(Ins[index].isOrigArg() &&
3668                    "Byval arguments cannot be implicit");
3669             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3670 
3671             int FrameIndex = StoreByValRegs(
3672                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
3673                 VA.getLocMemOffset(), Flags.getByValSize());
3674             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3675             CCInfo.nextInRegsParam();
3676           } else {
3677             unsigned FIOffset = VA.getLocMemOffset();
3678             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3679                                            FIOffset, true);
3680 
3681             // Create load nodes to retrieve arguments from the stack.
3682             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3683             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3684                                          MachinePointerInfo::getFixedStack(
3685                                              DAG.getMachineFunction(), FI)));
3686           }
3687           lastInsIndex = index;
3688         }
3689     }
3690   }
3691 
3692   // varargs
3693   if (isVarArg && MFI.hasVAStart())
3694     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3695                          CCInfo.getNextStackOffset(),
3696                          TotalArgRegsSaveSize);
3697 
3698   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
3699 
3700   return Chain;
3701 }
3702 
3703 /// isFloatingPointZero - Return true if this is +0.0.
3704 static bool isFloatingPointZero(SDValue Op) {
3705   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3706     return CFP->getValueAPF().isPosZero();
3707   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3708     // Maybe this has already been legalized into the constant pool?
3709     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3710       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3711       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3712         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3713           return CFP->getValueAPF().isPosZero();
3714     }
3715   } else if (Op->getOpcode() == ISD::BITCAST &&
3716              Op->getValueType(0) == MVT::f64) {
3717     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
3718     // created by LowerConstantFP().
3719     SDValue BitcastOp = Op->getOperand(0);
3720     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
3721         isNullConstant(BitcastOp->getOperand(0)))
3722       return true;
3723   }
3724   return false;
3725 }
3726 
/// Returns an appropriate ARM CMP (cmp) and the corresponding condition code
/// for the given operands.
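///
/// For example, if the RHS constant is not encodable as an ARM immediate, the
/// comparison may be rewritten against an adjacent constant: assuming 257 is
/// not encodable but 256 is, (x <u 257) becomes (x <=u 256).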
3729 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3730                                      SDValue &ARMcc, SelectionDAG &DAG,
3731                                      const SDLoc &dl) const {
3732   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3733     unsigned C = RHSC->getZExtValue();
3734     if (!isLegalICmpImmediate(C)) {
3735       // Constant does not fit, try adjusting it by one?
3736       switch (CC) {
3737       default: break;
3738       case ISD::SETLT:
3739       case ISD::SETGE:
3740         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3741           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3742           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3743         }
3744         break;
3745       case ISD::SETULT:
3746       case ISD::SETUGE:
3747         if (C != 0 && isLegalICmpImmediate(C-1)) {
3748           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3749           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3750         }
3751         break;
3752       case ISD::SETLE:
3753       case ISD::SETGT:
3754         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3755           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3756           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3757         }
3758         break;
3759       case ISD::SETULE:
3760       case ISD::SETUGT:
3761         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3762           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3763           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3764         }
3765         break;
3766       }
3767     }
3768   }
3769 
3770   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3771   ARMISD::NodeType CompareType;
3772   switch (CondCode) {
3773   default:
3774     CompareType = ARMISD::CMP;
3775     break;
3776   case ARMCC::EQ:
3777   case ARMCC::NE:
3778     // Uses only Z Flag
3779     CompareType = ARMISD::CMPZ;
3780     break;
3781   }
3782   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3783   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3784 }
3785 
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3787 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
3788                                      SelectionDAG &DAG, const SDLoc &dl) const {
3789   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3790   SDValue Cmp;
3791   if (!isFloatingPointZero(RHS))
3792     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3793   else
3794     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3795   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3796 }
3797 
3798 /// duplicateCmp - Glue values can have only one use, so this function
3799 /// duplicates a comparison node.
3800 SDValue
3801 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3802   unsigned Opc = Cmp.getOpcode();
3803   SDLoc DL(Cmp);
3804   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3805     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3806 
3807   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3808   Cmp = Cmp.getOperand(0);
3809   Opc = Cmp.getOpcode();
3810   if (Opc == ARMISD::CMPFP)
3811     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3812   else {
3813     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3814     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3815   }
3816   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3817 }
3818 
3819 std::pair<SDValue, SDValue>
3820 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
3821                                  SDValue &ARMcc) const {
3822   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
3823 
3824   SDValue Value, OverflowCmp;
3825   SDValue LHS = Op.getOperand(0);
3826   SDValue RHS = Op.getOperand(1);
3827   SDLoc dl(Op);
3828 
3829   // FIXME: We are currently always generating CMPs because we don't support
3830   // generating CMN through the backend. This is not as good as the natural
3831   // CMP case because it causes a register dependency and cannot be folded
3832   // later.
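
  // Note that ARMcc is set to the *no overflow* condition (VC for the signed
  // cases, HS for the unsigned ones). For the additions, comparing Value
  // against LHS reproduces the add's carry/overflow flags; for the
  // subtractions the original operands are compared directly. Callers such as
  // LowerXALUO then select 0 when ARMcc holds and 1 otherwise.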
3833 
3834   switch (Op.getOpcode()) {
3835   default:
3836     llvm_unreachable("Unknown overflow instruction!");
3837   case ISD::SADDO:
3838     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3839     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3840     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3841     break;
3842   case ISD::UADDO:
3843     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3844     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3845     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3846     break;
3847   case ISD::SSUBO:
3848     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3849     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3850     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3851     break;
3852   case ISD::USUBO:
3853     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3854     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3855     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3856     break;
3857   } // switch (...)
3858 
3859   return std::make_pair(Value, OverflowCmp);
3860 }
3861 
3862 SDValue
3863 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
3864   // Let legalize expand this if it isn't a legal type yet.
3865   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3866     return SDValue();
3867 
3868   SDValue Value, OverflowCmp;
3869   SDValue ARMcc;
3870   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
3871   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3872   SDLoc dl(Op);
3873   // We use 0 and 1 as false and true values.
3874   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3875   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3876   EVT VT = Op.getValueType();
3877 
3878   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
3879                                  ARMcc, CCR, OverflowCmp);
3880 
3881   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3882   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3883 }
3884 
3885 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3886   SDValue Cond = Op.getOperand(0);
3887   SDValue SelectTrue = Op.getOperand(1);
3888   SDValue SelectFalse = Op.getOperand(2);
3889   SDLoc dl(Op);
3890   unsigned Opc = Cond.getOpcode();
3891 
3892   if (Cond.getResNo() == 1 &&
3893       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3894        Opc == ISD::USUBO)) {
3895     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
3896       return SDValue();
3897 
3898     SDValue Value, OverflowCmp;
3899     SDValue ARMcc;
3900     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
3901     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3902     EVT VT = Op.getValueType();
3903 
3904     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
3905                    OverflowCmp, DAG);
3906   }
3907 
3908   // Convert:
3909   //
3910   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3911   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3912   //
3913   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3914     const ConstantSDNode *CMOVTrue =
3915       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3916     const ConstantSDNode *CMOVFalse =
3917       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3918 
3919     if (CMOVTrue && CMOVFalse) {
3920       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3921       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3922 
3923       SDValue True;
3924       SDValue False;
3925       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3926         True = SelectTrue;
3927         False = SelectFalse;
3928       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3929         True = SelectFalse;
3930         False = SelectTrue;
3931       }
3932 
3933       if (True.getNode() && False.getNode()) {
3934         EVT VT = Op.getValueType();
3935         SDValue ARMcc = Cond.getOperand(2);
3936         SDValue CCR = Cond.getOperand(3);
3937         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
3938         assert(True.getValueType() == VT);
3939         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
3940       }
3941     }
3942   }
3943 
3944   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
3945   // undefined bits before doing a full-word comparison with zero.
3946   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
3947                      DAG.getConstant(1, dl, Cond.getValueType()));
3948 
3949   return DAG.getSelectCC(dl, Cond,
3950                          DAG.getConstant(0, dl, Cond.getValueType()),
3951                          SelectTrue, SelectFalse, ISD::SETNE);
3952 }
3953 
3954 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
3955                                  bool &swpCmpOps, bool &swpVselOps) {
3956   // Start by selecting the GE condition code for opcodes that return true for
3957   // 'equality'
3958   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
3959       CC == ISD::SETULE)
3960     CondCode = ARMCC::GE;
3961 
3962   // and GT for opcodes that return false for 'equality'.
3963   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
3964            CC == ISD::SETULT)
3965     CondCode = ARMCC::GT;
3966 
3967   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
3968   // to swap the compare operands.
3969   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
3970       CC == ISD::SETULT)
3971     swpCmpOps = true;
3972 
3973   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
3974   // If we have an unordered opcode, we need to swap the operands to the VSEL
3975   // instruction (effectively negating the condition).
3976   //
3977   // This also has the effect of swapping which one of 'less' or 'greater'
3978   // returns true, so we also swap the compare operands. It also switches
3979   // whether we return true for 'equality', so we compensate by picking the
3980   // opposite condition code to our original choice.
3981   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
3982       CC == ISD::SETUGT) {
3983     swpCmpOps = !swpCmpOps;
3984     swpVselOps = !swpVselOps;
3985     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
3986   }
3987 
3988   // 'ordered' is 'anything but unordered', so use the VS condition code and
3989   // swap the VSEL operands.
3990   if (CC == ISD::SETO) {
3991     CondCode = ARMCC::VS;
3992     swpVselOps = true;
3993   }
3994 
3995   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
3996   // code and swap the VSEL operands.
3997   if (CC == ISD::SETUNE) {
3998     CondCode = ARMCC::EQ;
3999     swpVselOps = true;
4000   }
4001 }
4002 
4003 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4004                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4005                                    SDValue Cmp, SelectionDAG &DAG) const {
4006   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
4007     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4008                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4009     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4010                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4011 
4012     SDValue TrueLow = TrueVal.getValue(0);
4013     SDValue TrueHigh = TrueVal.getValue(1);
4014     SDValue FalseLow = FalseVal.getValue(0);
4015     SDValue FalseHigh = FalseVal.getValue(1);
4016 
4017     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4018                               ARMcc, CCR, Cmp);
4019     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4020                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4021 
4022     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4023   } else {
4024     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4025                        Cmp);
4026   }
4027 }
4028 
4029 static bool isGTorGE(ISD::CondCode CC) {
4030   return CC == ISD::SETGT || CC == ISD::SETGE;
4031 }
4032 
4033 static bool isLTorLE(ISD::CondCode CC) {
4034   return CC == ISD::SETLT || CC == ISD::SETLE;
4035 }
4036 
4037 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4038 // All of these conditions (and their <= and >= counterparts) will do:
4039 //          x < k ? k : x
4040 //          x > k ? x : k
4041 //          k < x ? x : k
4042 //          k > x ? k : x
4043 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4044                             const SDValue TrueVal, const SDValue FalseVal,
4045                             const ISD::CondCode CC, const SDValue K) {
4046   return (isGTorGE(CC) &&
4047           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4048          (isLTorLE(CC) &&
4049           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4050 }
4051 
4052 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4053 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4054                             const SDValue TrueVal, const SDValue FalseVal,
4055                             const ISD::CondCode CC, const SDValue K) {
4056   return (isGTorGE(CC) &&
4057           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4058          (isLTorLE(CC) &&
4059           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4060 }
4061 
4062 // Check if two chained conditionals could be converted into SSAT.
4063 //
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [~k, k] when k + 1 is a power of 2. Here are some examples:
4066 //
4067 //     x < -k ? -k : (x > k ? k : x)
4068 //     x < -k ? -k : (x < k ? x : k)
4069 //     x > -k ? (x > k ? k : x) : -k
4070 //     x < k ? (x < -k ? -k : x) : k
4071 //     etc.
4072 //
4073 // It returns true if the conversion can be done, false otherwise.
4074 // Additionally, the variable is returned in parameter V and the constant in K.
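//
// For example (illustrative), with k = 127 (so k + 1 = 128 and ~k = -128):
//
//     x < -128 ? -128 : (x > 127 ? 127 : x)
//
// clamps x to [-128, 127] and can later be matched to an 8-bit ssat.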
4075 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
4076                                     uint64_t &K) {
4077   SDValue LHS1 = Op.getOperand(0);
4078   SDValue RHS1 = Op.getOperand(1);
4079   SDValue TrueVal1 = Op.getOperand(2);
4080   SDValue FalseVal1 = Op.getOperand(3);
4081   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4082 
4083   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
4084   if (Op2.getOpcode() != ISD::SELECT_CC)
4085     return false;
4086 
4087   SDValue LHS2 = Op2.getOperand(0);
4088   SDValue RHS2 = Op2.getOperand(1);
4089   SDValue TrueVal2 = Op2.getOperand(2);
4090   SDValue FalseVal2 = Op2.getOperand(3);
4091   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
4092 
4093   // Find out which are the constants and which are the variables
4094   // in each conditional
4095   SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
4096                                                         ? &RHS1
4097                                                         : nullptr;
4098   SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
4099                                                         ? &RHS2
4100                                                         : nullptr;
4101   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
4102   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
4103   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
4104   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
4105 
4106   // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such cases, V2Tmp != V2 because the comparison operations
4108   // must work with sign-extended values but the select operations return
4109   // the original non-extended value.
4110   SDValue V2TmpReg = V2Tmp;
4111   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
4112     V2TmpReg = V2Tmp->getOperand(0);
4113 
4114   // Check that the registers and the constants have the correct values
4115   // in both conditionals
4116   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
4117       V2TmpReg != V2)
4118     return false;
4119 
4120   // Figure out which conditional is saturating the lower/upper bound.
4121   const SDValue *LowerCheckOp =
4122       isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4123           ? &Op
4124           : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4125                 ? &Op2
4126                 : nullptr;
4127   const SDValue *UpperCheckOp =
4128       isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4129           ? &Op
4130           : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4131                 ? &Op2
4132                 : nullptr;
4133 
4134   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
4135     return false;
4136 
4137   // Check that the constant in the lower-bound check is
4138   // the opposite of the constant in the upper-bound check
4139   // in 1's complement.
4140   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
4141   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
4142   int64_t PosVal = std::max(Val1, Val2);
4143 
4144   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
4145        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
4146       Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
4147 
4148     V = V2;
4149     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
4150     return true;
4151   }
4152 
4153   return false;
4154 }
4155 
4156 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
4157   EVT VT = Op.getValueType();
4158   SDLoc dl(Op);
4159 
4160   // Try to convert two saturating conditional selects into a single SSAT
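  // SatConstant has the form 2^m - 1, so countTrailingOnes(SatConstant) == m
  // and the saturation interval is [-2^m, 2^m - 1].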
4161   SDValue SatValue;
4162   uint64_t SatConstant;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
       Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant))
4165     return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
4166                        DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
4167 
4168   SDValue LHS = Op.getOperand(0);
4169   SDValue RHS = Op.getOperand(1);
4170   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4171   SDValue TrueVal = Op.getOperand(2);
4172   SDValue FalseVal = Op.getOperand(3);
4173 
4174   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4175     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4176                                                     dl);
4177 
4178     // If softenSetCCOperands only returned one value, we should compare it to
4179     // zero.
4180     if (!RHS.getNode()) {
4181       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4182       CC = ISD::SETNE;
4183     }
4184   }
4185 
4186   if (LHS.getValueType() == MVT::i32) {
4187     // Try to generate VSEL on ARMv8.
4188     // The VSEL instruction can't use all the usual ARM condition
4189     // codes: it only has two bits to select the condition code, so it's
4190     // constrained to use only GE, GT, VS and EQ.
4191     //
4192     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
4193     // swap the operands of the previous compare instruction (effectively
4194     // inverting the compare condition, swapping 'less' and 'greater') and
4195     // sometimes need to swap the operands to the VSEL (which inverts the
4196     // condition in the sense of firing whenever the previous condition didn't)
4197     if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4198                                     TrueVal.getValueType() == MVT::f64)) {
4199       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4200       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
4201           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
4202         CC = ISD::getSetCCInverse(CC, true);
4203         std::swap(TrueVal, FalseVal);
4204       }
4205     }
4206 
4207     SDValue ARMcc;
4208     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4209     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4210     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4211   }
4212 
4213   ARMCC::CondCodes CondCode, CondCode2;
4214   FPCCToARMCC(CC, CondCode, CondCode2);
4215 
4216   // Try to generate VMAXNM/VMINNM on ARMv8.
4217   if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4218                                   TrueVal.getValueType() == MVT::f64)) {
4219     bool swpCmpOps = false;
4220     bool swpVselOps = false;
4221     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
4222 
4223     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
4224         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
4225       if (swpCmpOps)
4226         std::swap(LHS, RHS);
4227       if (swpVselOps)
4228         std::swap(TrueVal, FalseVal);
4229     }
4230   }
4231 
4232   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4233   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4234   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4235   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4236   if (CondCode2 != ARMCC::AL) {
4237     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
4238     // FIXME: Needs another CMP because flag can have but one use.
4239     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
4240     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
4241   }
4242   return Result;
4243 }
4244 
4245 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4246 /// to morph to an integer compare sequence.
4247 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4248                            const ARMSubtarget *Subtarget) {
4249   SDNode *N = Op.getNode();
4250   if (!N->hasOneUse())
4251     // Otherwise it requires moving the value from fp to integer registers.
4252     return false;
4253   if (!N->getNumValues())
4254     return false;
4255   EVT VT = Op.getValueType();
4256   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4257     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4258     // vmrs are very slow, e.g. cortex-a8.
4259     return false;
4260 
4261   if (isFloatingPointZero(Op)) {
4262     SeenZero = true;
4263     return true;
4264   }
4265   return ISD::isNormalLoad(N);
4266 }
4267 
4268 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4269   if (isFloatingPointZero(Op))
4270     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4271 
4272   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4273     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4274                        Ld->getPointerInfo(), Ld->getAlignment(),
4275                        Ld->getMemOperand()->getFlags());
4276 
4277   llvm_unreachable("Unknown VFP cmp argument!");
4278 }
4279 
4280 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
4281                            SDValue &RetVal1, SDValue &RetVal2) {
4282   SDLoc dl(Op);
4283 
4284   if (isFloatingPointZero(Op)) {
4285     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
4286     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
4287     return;
4288   }
4289 
4290   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
4291     SDValue Ptr = Ld->getBasePtr();
4292     RetVal1 =
4293         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
4294                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
4295 
4296     EVT PtrType = Ptr.getValueType();
4297     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
4298     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
4299                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
4300     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
4301                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
4302                           Ld->getMemOperand()->getFlags());
4303     return;
4304   }
4305 
4306   llvm_unreachable("Unknown VFP cmp argument!");
4307 }
4308 
4309 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
4310 /// f32 and even f64 comparisons to integer ones.
4311 SDValue
4312 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
4313   SDValue Chain = Op.getOperand(0);
4314   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4315   SDValue LHS = Op.getOperand(2);
4316   SDValue RHS = Op.getOperand(3);
4317   SDValue Dest = Op.getOperand(4);
4318   SDLoc dl(Op);
4319 
4320   bool LHSSeenZero = false;
4321   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
4322   bool RHSSeenZero = false;
4323   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
4324   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
4325     // If unsafe fp math optimization is enabled and there are no other uses of
4326     // the CMP operands, and the condition code is EQ or NE, we can optimize it
4327     // to an integer comparison.
4328     if (CC == ISD::SETOEQ)
4329       CC = ISD::SETEQ;
4330     else if (CC == ISD::SETUNE)
4331       CC = ISD::SETNE;
4332 
4333     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4334     SDValue ARMcc;
4335     if (LHS.getValueType() == MVT::f32) {
4336       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4337                         bitcastf32Toi32(LHS, DAG), Mask);
4338       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4339                         bitcastf32Toi32(RHS, DAG), Mask);
4340       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4341       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4342       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4343                          Chain, Dest, ARMcc, CCR, Cmp);
4344     }
4345 
4346     SDValue LHS1, LHS2;
4347     SDValue RHS1, RHS2;
4348     expandf64Toi32(LHS, DAG, LHS1, LHS2);
4349     expandf64Toi32(RHS, DAG, RHS1, RHS2);
4350     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
4351     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
4352     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4353     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4354     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4355     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
4356     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
4357   }
4358 
4359   return SDValue();
4360 }
4361 
4362 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
4363   SDValue Chain = Op.getOperand(0);
4364   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4365   SDValue LHS = Op.getOperand(2);
4366   SDValue RHS = Op.getOperand(3);
4367   SDValue Dest = Op.getOperand(4);
4368   SDLoc dl(Op);
4369 
4370   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4371     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4372                                                     dl);
4373 
4374     // If softenSetCCOperands only returned one value, we should compare it to
4375     // zero.
4376     if (!RHS.getNode()) {
4377       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4378       CC = ISD::SETNE;
4379     }
4380   }
4381 
4382   if (LHS.getValueType() == MVT::i32) {
4383     SDValue ARMcc;
4384     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4385     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4386     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4387                        Chain, Dest, ARMcc, CCR, Cmp);
4388   }
4389 
4390   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
4391 
4392   if (getTargetMachine().Options.UnsafeFPMath &&
4393       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
4394        CC == ISD::SETNE || CC == ISD::SETUNE)) {
4395     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
4396       return Result;
4397   }
4398 
4399   ARMCC::CondCodes CondCode, CondCode2;
4400   FPCCToARMCC(CC, CondCode, CondCode2);
4401 
4402   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4403   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4404   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4405   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4406   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
4407   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4408   if (CondCode2 != ARMCC::AL) {
4409     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
4410     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
4411     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4412   }
4413   return Res;
4414 }
4415 
4416 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
4417   SDValue Chain = Op.getOperand(0);
4418   SDValue Table = Op.getOperand(1);
4419   SDValue Index = Op.getOperand(2);
4420   SDLoc dl(Op);
4421 
4422   EVT PTy = getPointerTy(DAG.getDataLayout());
4423   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
4424   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
4425   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
4426   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
4427   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
  if (Subtarget->isThumb2() ||
      (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, the code jumps into
    // the jump table, which does another jump to the destination. This also
    // makes it easier to translate it to TBB / TBH later (Thumb2 only).
4432     // FIXME: This might not work if the function is extremely large.
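    // Roughly: BR2_JT branches into the table and each table entry branches
    // again to the final destination; when the offsets are small enough this
    // can later become a TBB/TBH (e.g. "tbb [pc, r_index]") over a table of
    // byte/halfword offsets.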
4433     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
4434                        Addr, Op.getOperand(2), JTI);
4435   }
4436   if (isPositionIndependent() || Subtarget->isROPI()) {
4437     Addr =
4438         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
4439                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4440     Chain = Addr.getValue(1);
4441     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
4442     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4443   } else {
4444     Addr =
4445         DAG.getLoad(PTy, dl, Chain, Addr,
4446                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4447     Chain = Addr.getValue(1);
4448     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4449   }
4450 }
4451 
4452 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
4453   EVT VT = Op.getValueType();
4454   SDLoc dl(Op);
4455 
4456   if (Op.getValueType().getVectorElementType() == MVT::i32) {
4457     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
4458       return Op;
4459     return DAG.UnrollVectorOp(Op.getNode());
4460   }
4461 
4462   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
4463          "Invalid type for custom lowering!");
4464   if (VT != MVT::v4i16)
4465     return DAG.UnrollVectorOp(Op.getNode());
4466 
4467   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
4468   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
4469 }
4470 
4471 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
4472   EVT VT = Op.getValueType();
4473   if (VT.isVector())
4474     return LowerVectorFP_TO_INT(Op, DAG);
4475   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4476     RTLIB::Libcall LC;
4477     if (Op.getOpcode() == ISD::FP_TO_SINT)
4478       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4479                               Op.getValueType());
4480     else
4481       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4482                               Op.getValueType());
4483     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4484                        /*isSigned*/ false, SDLoc(Op)).first;
4485   }
4486 
4487   return Op;
4488 }
4489 
4490 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4491   EVT VT = Op.getValueType();
4492   SDLoc dl(Op);
4493 
4494   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
4495     if (VT.getVectorElementType() == MVT::f32)
4496       return Op;
4497     return DAG.UnrollVectorOp(Op.getNode());
4498   }
4499 
4500   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
4501          "Invalid type for custom lowering!");
4502   if (VT != MVT::v4f32)
4503     return DAG.UnrollVectorOp(Op.getNode());
4504 
4505   unsigned CastOpc;
4506   unsigned Opc;
4507   switch (Op.getOpcode()) {
4508   default: llvm_unreachable("Invalid opcode!");
4509   case ISD::SINT_TO_FP:
4510     CastOpc = ISD::SIGN_EXTEND;
4511     Opc = ISD::SINT_TO_FP;
4512     break;
4513   case ISD::UINT_TO_FP:
4514     CastOpc = ISD::ZERO_EXTEND;
4515     Opc = ISD::UINT_TO_FP;
4516     break;
4517   }
4518 
4519   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
4520   return DAG.getNode(Opc, dl, VT, Op);
4521 }
4522 
4523 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4524   EVT VT = Op.getValueType();
4525   if (VT.isVector())
4526     return LowerVectorINT_TO_FP(Op, DAG);
4527   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4528     RTLIB::Libcall LC;
4529     if (Op.getOpcode() == ISD::SINT_TO_FP)
4530       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4531                               Op.getValueType());
4532     else
4533       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4534                               Op.getValueType());
4535     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4536                        /*isSigned*/ false, SDLoc(Op)).first;
4537   }
4538 
4539   return Op;
4540 }
4541 
4542 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
4543   // Implement fcopysign with a fabs and a conditional fneg.
4544   SDValue Tmp0 = Op.getOperand(0);
4545   SDValue Tmp1 = Op.getOperand(1);
4546   SDLoc dl(Op);
4547   EVT VT = Op.getValueType();
4548   EVT SrcVT = Tmp1.getValueType();
4549   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
4550     Tmp0.getOpcode() == ARMISD::VMOVDRR;
4551   bool UseNEON = !InGPR && Subtarget->hasNEON();
4552 
4553   if (UseNEON) {
4554     // Use VBSL to copy the sign bit.
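    // Build a per-lane mask with only the sign bit set (0x80000000, shifted
    // into the high word for f64), then compute
    // (Tmp1 & Mask) | (Tmp0 & ~Mask): the sign comes from Tmp1 and the
    // magnitude from Tmp0.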
4555     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
4556     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
4557                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
4558     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
4559     if (VT == MVT::f64)
4560       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4561                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
4562                          DAG.getConstant(32, dl, MVT::i32));
4563     else /*if (VT == MVT::f32)*/
4564       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
4565     if (SrcVT == MVT::f32) {
4566       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
4567       if (VT == MVT::f64)
4568         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4569                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
4570                            DAG.getConstant(32, dl, MVT::i32));
4571     } else if (VT == MVT::f32)
4572       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
4573                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
4574                          DAG.getConstant(32, dl, MVT::i32));
4575     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
4576     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
4577 
4578     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
4579                                             dl, MVT::i32);
4580     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
4581     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
4582                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
4583 
4584     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
4585                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
4586                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
4587     if (VT == MVT::f32) {
4588       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
4589       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
4590                         DAG.getConstant(0, dl, MVT::i32));
4591     } else {
4592       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
4593     }
4594 
4595     return Res;
4596   }
4597 
4598   // Bitcast operand 1 to i32.
4599   if (SrcVT == MVT::f64)
4600     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4601                        Tmp1).getValue(1);
4602   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
4603 
4604   // Or in the signbit with integer operations.
4605   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
4606   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4607   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
4608   if (VT == MVT::f32) {
4609     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
4610                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
4611     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
4612                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
4613   }
4614 
4615   // f64: Or the high part with signbit and then combine two parts.
4616   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4617                      Tmp0);
4618   SDValue Lo = Tmp0.getValue(0);
4619   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
4620   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
4621   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
4622 }
4623 
4624 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
4625   MachineFunction &MF = DAG.getMachineFunction();
4626   MachineFrameInfo &MFI = MF.getFrameInfo();
4627   MFI.setReturnAddressIsTaken(true);
4628 
4629   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
4630     return SDValue();
4631 
4632   EVT VT = Op.getValueType();
4633   SDLoc dl(Op);
4634   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
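  // For non-zero depths, walk up via LowerFRAMEADDR and load the saved return
  // address, assuming the usual "push {fp, lr}" prologue layout where LR is
  // stored at [frame pointer + 4].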
4635   if (Depth) {
4636     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4637     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
4638     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
4639                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
4640                        MachinePointerInfo());
4641   }
4642 
4643   // Return LR, which contains the return address. Mark it an implicit live-in.
4644   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4645   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
4646 }
4647 
4648 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
4649   const ARMBaseRegisterInfo &ARI =
4650     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
4651   MachineFunction &MF = DAG.getMachineFunction();
4652   MachineFrameInfo &MFI = MF.getFrameInfo();
4653   MFI.setFrameAddressIsTaken(true);
4654 
4655   EVT VT = Op.getValueType();
4656   SDLoc dl(Op);  // FIXME probably not meaningful
4657   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4658   unsigned FrameReg = ARI.getFrameRegister(MF);
4659   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
4660   while (Depth--)
4661     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
4662                             MachinePointerInfo());
4663   return FrameAddr;
4664 }
4665 
4666 // FIXME? Maybe this could be a TableGen attribute on some registers and
4667 // this table could be generated automatically from RegInfo.
4668 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
4669                                               SelectionDAG &DAG) const {
4670   unsigned Reg = StringSwitch<unsigned>(RegName)
4671                        .Case("sp", ARM::SP)
4672                        .Default(0);
4673   if (Reg)
4674     return Reg;
4675   report_fatal_error(Twine("Invalid register name \""
4676                               + StringRef(RegName)  + "\"."));
4677 }
4678 
// The result is a 64-bit value, so split it into two 32-bit values and return
// them as a pair of values.
4681 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
4682                                 SelectionDAG &DAG) {
4683   SDLoc DL(N);
4684 
4685   // This function is only supposed to be called for i64 type destination.
4686   assert(N->getValueType(0) == MVT::i64
4687           && "ExpandREAD_REGISTER called for non-i64 type result.");
4688 
4689   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
4690                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
4691                              N->getOperand(0),
4692                              N->getOperand(1));
4693 
4694   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
4695                     Read.getValue(1)));
4696   Results.push_back(Read.getOperand(0));
4697 }
4698 
4699 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
4700 /// When \p DstVT, the destination type of \p BC, is on the vector
4701 /// register bank and the source of bitcast, \p Op, operates on the same bank,
4702 /// it might be possible to combine them, such that everything stays on the
4703 /// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
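///
/// For example (illustrative):
///   (v2f32 (bitcast (i64 (extractelt (v2i64 %src), 1))))
/// becomes
///   (v2f32 (extract_subvector (v4f32 (bitcast %src)), 2))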
4706 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
4707                                                 SelectionDAG &DAG) {
4708   SDValue Op = BC->getOperand(0);
4709   EVT DstVT = BC->getValueType(0);
4710 
4711   // The only vector instruction that can produce a scalar (remember,
4712   // since the bitcast was about to be turned into VMOVDRR, the source
4713   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
4714   // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point in forcing everything on the vector bank.
4717   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4718       !Op.hasOneUse())
4719     return SDValue();
4720 
4721   // If the index is not constant, we will introduce an additional
4722   // multiply that will stick.
4723   // Give up in that case.
4724   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
4725   if (!Index)
4726     return SDValue();
4727   unsigned DstNumElt = DstVT.getVectorNumElements();
4728 
4729   // Compute the new index.
4730   const APInt &APIntIndex = Index->getAPIntValue();
4731   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
4732   NewIndex *= APIntIndex;
4733   // Check if the new constant index fits into i32.
4734   if (NewIndex.getBitWidth() > 32)
4735     return SDValue();
4736 
4737   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
4738   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
4739   SDLoc dl(Op);
4740   SDValue ExtractSrc = Op.getOperand(0);
4741   EVT VecVT = EVT::getVectorVT(
4742       *DAG.getContext(), DstVT.getScalarType(),
4743       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
4744   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
4745   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
4746                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
4747 }
4748 
4749 /// ExpandBITCAST - If the target supports VFP, this function is called to
4750 /// expand a bit convert where either the source or destination type is i64 to
4751 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
4752 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4753 /// vectors), since the legalizer won't know what to do with that.
4754 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4755   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4756   SDLoc dl(N);
4757   SDValue Op = N->getOperand(0);
4758 
4759   // This function is only supposed to be called for i64 types, either as the
4760   // source or destination of the bit convert.
4761   EVT SrcVT = Op.getValueType();
4762   EVT DstVT = N->getValueType(0);
4763   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4764          "ExpandBITCAST called for non-i64 type");
4765 
4766   // Turn i64->f64 into VMOVDRR.
4767   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4768     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
4769     // if we can combine the bitcast with its source.
4770     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
4771       return Val;
4772 
4773     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4774                              DAG.getConstant(0, dl, MVT::i32));
4775     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4776                              DAG.getConstant(1, dl, MVT::i32));
4777     return DAG.getNode(ISD::BITCAST, dl, DstVT,
4778                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
4779   }
4780 
4781   // Turn f64->i64 into VMOVRRD.
4782   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
4783     SDValue Cvt;
4784     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
4785         SrcVT.getVectorNumElements() > 1)
4786       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4787                         DAG.getVTList(MVT::i32, MVT::i32),
4788                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
4789     else
4790       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4791                         DAG.getVTList(MVT::i32, MVT::i32), Op);
4792     // Merge the pieces into a single i64 value.
4793     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
4794   }
4795 
4796   return SDValue();
4797 }
4798 
4799 /// getZeroVector - Returns a vector of specified type with all zero elements.
4800 /// Zero vectors are used to represent vector negation and in those cases
4801 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
4802 /// not support i64 elements, so sometimes the zero vectors will need to be
4803 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
4804 /// zero vector.
4805 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4806   assert(VT.isVector() && "Expected a vector type");
4807   // The canonical modified immediate encoding of a zero vector is....0!
4808   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
4809   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
4810   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
4811   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4812 }
4813 
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which take a 2 x i32
/// value to shift plus a shift amount and return two i32 values.
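///
/// In outline, for a 64-bit right shift by n: when n < 32 the low word is
/// (Lo >> n) | (Hi << (32 - n)) and the high word is Hi >> n; when n >= 32
/// the low word is Hi shifted by (n - 32) and the high word is 0 (SRL) or
/// the sign bits (SRA). Both candidates are built and a CMOV on (n - 32) >= 0
/// selects between them.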
4816 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
4817                                                 SelectionDAG &DAG) const {
4818   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4819   EVT VT = Op.getValueType();
4820   unsigned VTBits = VT.getSizeInBits();
4821   SDLoc dl(Op);
4822   SDValue ShOpLo = Op.getOperand(0);
4823   SDValue ShOpHi = Op.getOperand(1);
4824   SDValue ShAmt  = Op.getOperand(2);
4825   SDValue ARMcc;
4826   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4827   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4828 
4829   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4830 
4831   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4832                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4833   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4834   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4835                                    DAG.getConstant(VTBits, dl, MVT::i32));
4836   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4837   SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4838   SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4839   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4840                             ISD::SETGE, ARMcc, DAG, dl);
4841   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
4842                            ARMcc, CCR, CmpLo);
4843 
4844 
4845   SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4846   SDValue HiBigShift = Opc == ISD::SRA
4847                            ? DAG.getNode(Opc, dl, VT, ShOpHi,
4848                                          DAG.getConstant(VTBits - 1, dl, VT))
4849                            : DAG.getConstant(0, dl, VT);
4850   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4851                             ISD::SETGE, ARMcc, DAG, dl);
4852   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
4853                            ARMcc, CCR, CmpHi);
4854 
4855   SDValue Ops[2] = { Lo, Hi };
4856   return DAG.getMergeValues(Ops, dl);
4857 }
4858 
4859 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4860 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
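/// Roughly, for a 64-bit value split into 32-bit halves (Lo, Hi) and a shift
/// amount n with 0 < n < 64, the intended result is:
///   Lo' = n < 32 ? Lo << n : 0
///   Hi' = n < 32 ? (Hi << n) | (Lo >> (32 - n)) : Lo << (n - 32)
/// with the two cases selected below via conditional moves.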
4861 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
4862                                                SelectionDAG &DAG) const {
4863   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4864   EVT VT = Op.getValueType();
4865   unsigned VTBits = VT.getSizeInBits();
4866   SDLoc dl(Op);
4867   SDValue ShOpLo = Op.getOperand(0);
4868   SDValue ShOpHi = Op.getOperand(1);
4869   SDValue ShAmt  = Op.getOperand(2);
4870   SDValue ARMcc;
4871   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4872 
4873   assert(Op.getOpcode() == ISD::SHL_PARTS);
4874   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4875                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4876   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4877   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4878   SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4879 
4880   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4881                                    DAG.getConstant(VTBits, dl, MVT::i32));
4882   SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4883   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4884                             ISD::SETGE, ARMcc, DAG, dl);
4885   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
4886                            ARMcc, CCR, CmpHi);
4887 
4888   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4889                           ISD::SETGE, ARMcc, DAG, dl);
4890   SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4891   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
4892                            DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
4893 
4894   SDValue Ops[2] = { Lo, Hi };
4895   return DAG.getMergeValues(Ops, dl);
4896 }
4897 
4898 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4899                                             SelectionDAG &DAG) const {
4900   // The rounding mode is in bits 23:22 of the FPSCR.
4901   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4902   // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
4903   // so that the shift and the AND get folded into a bitfield extract.
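  // For example, FPSCR.RMode == 0b01 (round towards plus infinity) gives
  // ((1 + 1) & 3) == 2, which is the FLT_ROUNDS value for upward rounding.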
4904   SDLoc dl(Op);
4905   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
4906                               DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
4907                                               MVT::i32));
4908   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
4909                                   DAG.getConstant(1U << 22, dl, MVT::i32));
4910   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4911                               DAG.getConstant(22, dl, MVT::i32));
4912   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4913                      DAG.getConstant(3, dl, MVT::i32));
4914 }
4915 
4916 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
4917                          const ARMSubtarget *ST) {
4918   SDLoc dl(N);
4919   EVT VT = N->getValueType(0);
4920   if (VT.isVector()) {
4921     assert(ST->hasNEON());
4922 
4923     // Compute the least significant set bit: LSB = X & -X
4924     SDValue X = N->getOperand(0);
4925     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
4926     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
4927 
4928     EVT ElemTy = VT.getVectorElementType();
4929 
4930     if (ElemTy == MVT::i8) {
4931       // Compute with: cttz(x) = ctpop(lsb - 1)
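      // For example, x = 0b01011000: lsb = 0b00001000, lsb - 1 = 0b00000111,
      // and ctpop(0b00000111) = 3 = cttz(x).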
4932       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4933                                 DAG.getTargetConstant(1, dl, ElemTy));
4934       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4935       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
4936     }
4937 
4938     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
4939         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
4940       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
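      // For example, for i32 elements and lsb = 0b1000:
      // (32 - 1) - ctlz(0b1000) = 31 - 28 = 3 = cttz(x).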
4941       unsigned NumBits = ElemTy.getSizeInBits();
4942       SDValue WidthMinus1 =
4943           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4944                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
4945       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
4946       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
4947     }
4948 
4949     // Compute with: cttz(x) = ctpop(lsb - 1)
4950 
4951     // Since we can only compute the number of bits in a byte with vcnt.8, we
4952     // have to gather the result with pairwise addition (vpaddl) for i16, i32,
4953     // and i64.
4954 
4955     // Compute LSB - 1.
4956     SDValue Bits;
4957     if (ElemTy == MVT::i64) {
4958       // Load constant 0xffff'ffff'ffff'ffff to register.
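      // (0x1eff is the NEON modified-immediate encoding with Op=1, Cmode=1110
      // and Imm=0xff, which expands to a vector whose bytes are all 0xff.)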
4959       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4960                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
4961       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
4962     } else {
4963       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4964                                 DAG.getTargetConstant(1, dl, ElemTy));
4965       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4966     }
4967 
4968     // Count #bits with vcnt.8.
4969     EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
4970     SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
4971     SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
4972 
4973     // Gather the #bits with vpaddl (pairwise add.)
4974     EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
4975     SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
4976         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4977         Cnt8);
4978     if (ElemTy == MVT::i16)
4979       return Cnt16;
4980 
4981     EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
4982     SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
4983         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4984         Cnt16);
4985     if (ElemTy == MVT::i32)
4986       return Cnt32;
4987 
4988     assert(ElemTy == MVT::i64);
4989     SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4990         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
4991         Cnt32);
4992     return Cnt64;
4993   }
4994 
4995   if (!ST->hasV6T2Ops())
4996     return SDValue();
4997 
4998   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
4999   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
5000 }
5001 
5002 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
5003 /// for each 16-bit element from operand, repeated.  The basic idea is to
5004 /// leverage vcnt to get the 8-bit counts, gather and add the results.
5005 ///
5006 /// Trace for v4i16:
5007 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
5008 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
5009 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
5010 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
5011 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
5012 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
5013 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
5014 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
5015 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
5016   EVT VT = N->getValueType(0);
5017   SDLoc DL(N);
5018 
5019   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5020   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
5021   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
5022   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
5023   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
5024   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
5025 }
5026 
5027 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
5028 /// bit-count for each 16-bit element from the operand.  We need slightly
5029 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
5030 /// 64/128-bit registers.
5031 ///
5032 /// Trace for v4i16:
5033 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
5034 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
5035 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
5036 /// v4i16:Extracted = [k0    k1    k2    k3    ]
5037 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
5038   EVT VT = N->getValueType(0);
5039   SDLoc DL(N);
5040 
5041   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
5042   if (VT.is64BitVector()) {
5043     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
5044     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
5045                        DAG.getIntPtrConstant(0, DL));
5046   } else {
5047     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
5048                                     BitCounts, DAG.getIntPtrConstant(0, DL));
5049     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
5050   }
5051 }
5052 
5053 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
5054 /// bit-count for each 32-bit element from the operand.  The idea here is
5055 /// to split the vector into 16-bit elements, leverage the 16-bit count
5056 /// routine, and then combine the results.
5057 ///
5058 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
5059 /// input    = [v0    v1    ] (vi: 32-bit elements)
5060 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
5061 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
5062 /// vrev: N0 = [k1 k0 k3 k2 ]
5063 ///            [k0 k1 k2 k3 ]
5064 ///       N1 =+[k1 k0 k3 k2 ]
5065 ///            [k0 k2 k1 k3 ]
5066 ///       N2 =+[k1 k3 k0 k2 ]
5067 ///            [k0    k2    k1    k3    ]
5068 /// Extended =+[k1    k3    k0    k2    ]
5069 ///            [k0    k2    ]
5070 /// Extracted=+[k1    k3    ]
5071 ///
5072 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
5073   EVT VT = N->getValueType(0);
5074   SDLoc DL(N);
5075 
5076   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5077 
5078   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
5079   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
5080   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
5081   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
5082   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
5083 
5084   if (VT.is64BitVector()) {
5085     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
5086     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
5087                        DAG.getIntPtrConstant(0, DL));
5088   } else {
5089     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
5090                                     DAG.getIntPtrConstant(0, DL));
5091     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
5092   }
5093 }
5094 
5095 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5096                           const ARMSubtarget *ST) {
5097   EVT VT = N->getValueType(0);
5098 
5099   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5100   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
5101           VT == MVT::v4i16 || VT == MVT::v8i16) &&
5102          "Unexpected type for custom ctpop lowering");
5103 
5104   if (VT.getVectorElementType() == MVT::i32)
5105     return lowerCTPOP32BitElements(N, DAG);
5106   else
5107     return lowerCTPOP16BitElements(N, DAG);
5108 }
5109 
5110 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5111                           const ARMSubtarget *ST) {
5112   EVT VT = N->getValueType(0);
5113   SDLoc dl(N);
5114 
5115   if (!VT.isVector())
5116     return SDValue();
5117 
5118   // Lower vector shifts on NEON to use VSHL.
5119   assert(ST->hasNEON() && "unexpected vector shift");
5120 
5121   // Left shifts translate directly to the vshiftu intrinsic.
5122   if (N->getOpcode() == ISD::SHL)
5123     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5124                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
5125                                        MVT::i32),
5126                        N->getOperand(0), N->getOperand(1));
5127 
5128   assert((N->getOpcode() == ISD::SRA ||
5129           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
5130 
5131   // NEON uses the same intrinsics for both left and right shifts.  For
5132   // right shifts, the shift amounts are negative, so negate the vector of
5133   // shift amounts.
5134   EVT ShiftVT = N->getOperand(1).getValueType();
5135   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
5136                                      getZeroVector(ShiftVT, DAG, dl),
5137                                      N->getOperand(1));
5138   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
5139                              Intrinsic::arm_neon_vshifts :
5140                              Intrinsic::arm_neon_vshiftu);
5141   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5142                      DAG.getConstant(vshiftInt, dl, MVT::i32),
5143                      N->getOperand(0), NegatedCount);
5144 }
5145 
5146 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5147                                 const ARMSubtarget *ST) {
5148   EVT VT = N->getValueType(0);
5149   SDLoc dl(N);
5150 
5151   // We can get here for a node like i32 = ISD::SHL i32, i64
5152   if (VT != MVT::i64)
5153     return SDValue();
5154 
5155   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
5156          "Unknown shift to lower!");
5157 
5158   // We only lower SRA, SRL of 1 here; all others use generic lowering.
5159   if (!isOneConstant(N->getOperand(1)))
5160     return SDValue();
5161 
5162   // If we are in thumb mode, we don't have RRX.
5163   if (ST->isThumb1Only()) return SDValue();
5164 
5165   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
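  // For SRL, for example, the high word becomes Hi >> 1 with bit 0 of Hi
  // captured in the carry flag, and the low word becomes
  // RRX(Lo) = (carry << 31) | (Lo >> 1).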
5166   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5167                            DAG.getConstant(0, dl, MVT::i32));
5168   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5169                            DAG.getConstant(1, dl, MVT::i32));
5170 
5171   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
5172   // captures the result into a carry flag.
5173   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
5174   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
5175 
5176   // The low part is an ARMISD::RRX operand, which shifts the carry in.
5177   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
5178 
5179   // Merge the pieces into a single i64 value.
5180   return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5181 }
5182 
5183 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5184   SDValue TmpOp0, TmpOp1;
5185   bool Invert = false;
5186   bool Swap = false;
5187   unsigned Opc = 0;
5188 
5189   SDValue Op0 = Op.getOperand(0);
5190   SDValue Op1 = Op.getOperand(1);
5191   SDValue CC = Op.getOperand(2);
5192   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
5193   EVT VT = Op.getValueType();
5194   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5195   SDLoc dl(Op);
5196 
5197   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
5198       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
5199     // Special-case integer 64-bit equality comparisons. They aren't legal,
5200     // but they can be lowered with a few vector instructions.
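    // Compare the operands as vectors of i32 and AND each lane of the result
    // with its partner lane (obtained via VREV64), so that a 64-bit element
    // compares equal only if both of its 32-bit halves do.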
5201     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
5202     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
5203     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
5204     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
5205     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
5206                               DAG.getCondCode(ISD::SETEQ));
5207     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
5208     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
5209     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
5210     if (SetCCOpcode == ISD::SETNE)
5211       Merged = DAG.getNOT(dl, Merged, CmpVT);
5212     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
5213     return Merged;
5214   }
5215 
5216   if (CmpVT.getVectorElementType() == MVT::i64)
5217     // 64-bit comparisons are not legal in general.
5218     return SDValue();
5219 
5220   if (Op1.getValueType().isFloatingPoint()) {
5221     switch (SetCCOpcode) {
5222     default: llvm_unreachable("Illegal FP comparison");
5223     case ISD::SETUNE:
5224     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
5225     case ISD::SETOEQ:
5226     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
5227     case ISD::SETOLT:
5228     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
5229     case ISD::SETOGT:
5230     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
5231     case ISD::SETOLE:
5232     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
5233     case ISD::SETOGE:
5234     case ISD::SETGE: Opc = ARMISD::VCGE; break;
5235     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
5236     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
5237     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
5238     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
5239     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
5240     case ISD::SETONE:
5241       // Expand this to (OLT | OGT).
5242       TmpOp0 = Op0;
5243       TmpOp1 = Op1;
5244       Opc = ISD::OR;
5245       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5246       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
5247       break;
5248     case ISD::SETUO:
5249       Invert = true;
5250       LLVM_FALLTHROUGH;
5251     case ISD::SETO:
5252       // Expand this to (OLT | OGE).
5253       TmpOp0 = Op0;
5254       TmpOp1 = Op1;
5255       Opc = ISD::OR;
5256       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5257       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
5258       break;
5259     }
5260   } else {
5261     // Integer comparisons.
5262     switch (SetCCOpcode) {
5263     default: llvm_unreachable("Illegal integer comparison");
5264     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
5265     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
5266     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
5267     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
5268     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
5269     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
5270     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
5271     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
5272     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
5273     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
5274     }
5275 
5276     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
5277     if (Opc == ARMISD::VCEQ) {
5278 
5279       SDValue AndOp;
5280       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5281         AndOp = Op0;
5282       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
5283         AndOp = Op1;
5284 
5285       // Ignore bitconvert.
5286       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
5287         AndOp = AndOp.getOperand(0);
5288 
5289       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
5290         Opc = ARMISD::VTST;
5291         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
5292         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
5293         Invert = !Invert;
5294       }
5295     }
5296   }
5297 
5298   if (Swap)
5299     std::swap(Op0, Op1);
5300 
5301   // If one of the operands is a constant vector zero, attempt to fold the
5302   // comparison to a specialized compare-against-zero form.
5303   SDValue SingleOp;
5304   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5305     SingleOp = Op0;
5306   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
5307     if (Opc == ARMISD::VCGE)
5308       Opc = ARMISD::VCLEZ;
5309     else if (Opc == ARMISD::VCGT)
5310       Opc = ARMISD::VCLTZ;
5311     SingleOp = Op1;
5312   }
5313 
5314   SDValue Result;
5315   if (SingleOp.getNode()) {
5316     switch (Opc) {
5317     case ARMISD::VCEQ:
5318       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
5319     case ARMISD::VCGE:
5320       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
5321     case ARMISD::VCLEZ:
5322       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
5323     case ARMISD::VCGT:
5324       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
5325     case ARMISD::VCLTZ:
5326       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
5327     default:
5328       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5329     }
5330   } else {
5331      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5332   }
5333 
5334   Result = DAG.getSExtOrTrunc(Result, dl, VT);
5335 
5336   if (Invert)
5337     Result = DAG.getNOT(dl, Result, VT);
5338 
5339   return Result;
5340 }
5341 
5342 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
5343   SDValue LHS = Op.getOperand(0);
5344   SDValue RHS = Op.getOperand(1);
5345   SDValue Carry = Op.getOperand(2);
5346   SDValue Cond = Op.getOperand(3);
5347   SDLoc DL(Op);
5348 
5349   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
5350 
5351   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
5352   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
5353   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
5354 
5355   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
5356   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
5357   SDValue ARMcc = DAG.getConstant(
5358       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
5359   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5360   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
5361                                    Cmp.getValue(1), SDValue());
5362   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
5363                      CCR, Chain.getValue(1));
5364 }
5365 
5366 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
5367 /// valid vector constant for a NEON instruction with a "modified immediate"
5368 /// operand (e.g., VMOV).  If so, return the encoded value.
5369 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
5370                                  unsigned SplatBitSize, SelectionDAG &DAG,
5371                                  const SDLoc &dl, EVT &VT, bool is128Bits,
5372                                  NEONModImmType type) {
5373   unsigned OpCmode, Imm;
5374 
5375   // SplatBitSize is set to the smallest size that splats the vector, so a
5376   // zero vector will always have SplatBitSize == 8.  However, NEON modified
5377   // immediate instructions other than VMOV do not support the 8-bit encoding
5378   // of a zero vector, and the default encoding of zero is supposed to be the
5379   // 32-bit version.
5380   if (SplatBits == 0)
5381     SplatBitSize = 32;
5382 
5383   switch (SplatBitSize) {
5384   case 8:
5385     if (type != VMOVModImm)
5386       return SDValue();
5387     // Any 1-byte value is OK.  Op=0, Cmode=1110.
5388     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
5389     OpCmode = 0xe;
5390     Imm = SplatBits;
5391     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
5392     break;
5393 
5394   case 16:
5395     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
5396     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
5397     if ((SplatBits & ~0xff) == 0) {
5398       // Value = 0x00nn: Op=x, Cmode=100x.
5399       OpCmode = 0x8;
5400       Imm = SplatBits;
5401       break;
5402     }
5403     if ((SplatBits & ~0xff00) == 0) {
5404       // Value = 0xnn00: Op=x, Cmode=101x.
5405       OpCmode = 0xa;
5406       Imm = SplatBits >> 8;
5407       break;
5408     }
5409     return SDValue();
5410 
5411   case 32:
5412     // NEON's 32-bit VMOV supports splat values where:
5413     // * only one byte is nonzero, or
5414     // * the least significant byte is 0xff and the second byte is nonzero, or
5415     // * the least significant 2 bytes are 0xff and the third is nonzero.
5416     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
5417     if ((SplatBits & ~0xff) == 0) {
5418       // Value = 0x000000nn: Op=x, Cmode=000x.
5419       OpCmode = 0;
5420       Imm = SplatBits;
5421       break;
5422     }
5423     if ((SplatBits & ~0xff00) == 0) {
5424       // Value = 0x0000nn00: Op=x, Cmode=001x.
5425       OpCmode = 0x2;
5426       Imm = SplatBits >> 8;
5427       break;
5428     }
5429     if ((SplatBits & ~0xff0000) == 0) {
5430       // Value = 0x00nn0000: Op=x, Cmode=010x.
5431       OpCmode = 0x4;
5432       Imm = SplatBits >> 16;
5433       break;
5434     }
5435     if ((SplatBits & ~0xff000000) == 0) {
5436       // Value = 0xnn000000: Op=x, Cmode=011x.
5437       OpCmode = 0x6;
5438       Imm = SplatBits >> 24;
5439       break;
5440     }
5441 
5442     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
5443     if (type == OtherModImm) return SDValue();
5444 
5445     if ((SplatBits & ~0xffff) == 0 &&
5446         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
5447       // Value = 0x0000nnff: Op=x, Cmode=1100.
5448       OpCmode = 0xc;
5449       Imm = SplatBits >> 8;
5450       break;
5451     }
5452 
5453     if ((SplatBits & ~0xffffff) == 0 &&
5454         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
5455       // Value = 0x00nnffff: Op=x, Cmode=1101.
5456       OpCmode = 0xd;
5457       Imm = SplatBits >> 16;
5458       break;
5459     }
5460 
5461     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
5462     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
5463     // VMOV.I32.  A (very) minor optimization would be to replicate the value
5464     // and fall through here to test for a valid 64-bit splat.  But, then the
5465     // caller would also need to check and handle the change in size.
5466     return SDValue();
5467 
5468   case 64: {
5469     if (type != VMOVModImm)
5470       return SDValue();
5471     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
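    // For example, the splat value 0x00ff00ff00ff00ffULL is encoded with
    // Imm = 0b01010101: bit i of Imm selects whether byte i is 0xff or 0x00.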
5472     uint64_t BitMask = 0xff;
5473     uint64_t Val = 0;
5474     unsigned ImmMask = 1;
5475     Imm = 0;
5476     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
5477       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
5478         Val |= BitMask;
5479         Imm |= ImmMask;
5480       } else if ((SplatBits & BitMask) != 0) {
5481         return SDValue();
5482       }
5483       BitMask <<= 8;
5484       ImmMask <<= 1;
5485     }
5486 
5487     if (DAG.getDataLayout().isBigEndian())
5488       // swap higher and lower 32 bit word
5489       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
5490 
5491     // Op=1, Cmode=1110.
5492     OpCmode = 0x1e;
5493     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
5494     break;
5495   }
5496 
5497   default:
5498     llvm_unreachable("unexpected size for isNEONModifiedImm");
5499   }
5500 
5501   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
5502   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
5503 }
5504 
5505 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
5506                                            const ARMSubtarget *ST) const {
5507   bool IsDouble = Op.getValueType() == MVT::f64;
5508   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
5509   const APFloat &FPVal = CFP->getValueAPF();
5510 
5511   // Prevent floating-point constants from using literal loads
5512   // when execute-only is enabled.
5513   if (ST->genExecuteOnly()) {
5514     APInt INTVal = FPVal.bitcastToAPInt();
5515     SDLoc DL(CFP);
5516     if (IsDouble) {
5517       SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
5518       SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
5519       if (!ST->isLittle())
5520         std::swap(Lo, Hi);
5521       return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
5522     } else {
5523       return DAG.getConstant(INTVal, DL, MVT::i32);
5524     }
5525   }
5526 
5527   if (!ST->hasVFP3())
5528     return SDValue();
5529 
5530   // Use the default (constant pool) lowering for double constants when we have
5531   // an SP-only FPU
5532   if (IsDouble && Subtarget->isFPOnlySP())
5533     return SDValue();
5534 
5535   // Try splatting with a VMOV.f32...
5536   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
5537 
5538   if (ImmVal != -1) {
5539     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
5540       // We have code in place to select a valid ConstantFP already, no need to
5541       // do any mangling.
5542       return Op;
5543     }
5544 
5545     // It's a float and we are trying to use NEON operations where
5546     // possible. Lower it to a splat followed by an extract.
5547     SDLoc DL(Op);
5548     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
5549     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
5550                                       NewVal);
5551     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
5552                        DAG.getConstant(0, DL, MVT::i32));
5553   }
5554 
5555   // The rest of our options are NEON only; make sure that's allowed before
5556   // proceeding.
5557   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
5558     return SDValue();
5559 
5560   EVT VMovVT;
5561   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
5562 
5563   // It wouldn't really be worth bothering for doubles except for one very
5564   // important value, which does happen to match: 0.0. So make sure we don't do
5565   // anything stupid.
5566   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
5567     return SDValue();
5568 
5569   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
5570   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
5571                                      VMovVT, false, VMOVModImm);
5572   if (NewVal != SDValue()) {
5573     SDLoc DL(Op);
5574     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
5575                                       NewVal);
5576     if (IsDouble)
5577       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5578 
5579     // It's a float: cast and extract a vector element.
5580     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5581                                        VecConstant);
5582     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5583                        DAG.getConstant(0, DL, MVT::i32));
5584   }
5585 
5586   // Finally, try a VMVN.i32
5587   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
5588                              false, VMVNModImm);
5589   if (NewVal != SDValue()) {
5590     SDLoc DL(Op);
5591     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
5592 
5593     if (IsDouble)
5594       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5595 
5596     // It's a float: cast and extract a vector element.
5597     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5598                                        VecConstant);
5599     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5600                        DAG.getConstant(0, DL, MVT::i32));
5601   }
5602 
5603   return SDValue();
5604 }
5605 
5606 // Check whether a VEXT instruction can handle the shuffle mask when both
5607 // vector sources of the shuffle are the same.
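// For example, on v4i32 the mask <2, 3, 0, 1> is a single-source VEXT with
// Imm = 2: the result starts at element 2 of the source and wraps around.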
5608 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5609   unsigned NumElts = VT.getVectorNumElements();
5610 
5611   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5612   if (M[0] < 0)
5613     return false;
5614 
5615   Imm = M[0];
5616 
5617   // If this is a VEXT shuffle, the immediate value is the index of the first
5618   // element.  The other shuffle indices must be the successive elements after
5619   // the first one.
5620   unsigned ExpectedElt = Imm;
5621   for (unsigned i = 1; i < NumElts; ++i) {
5622     // Increment the expected index.  If it wraps around, just follow it
5623     // back to index zero and keep going.
5624     ++ExpectedElt;
5625     if (ExpectedElt == NumElts)
5626       ExpectedElt = 0;
5627 
5628     if (M[i] < 0) continue; // ignore UNDEF indices
5629     if (ExpectedElt != static_cast<unsigned>(M[i]))
5630       return false;
5631   }
5632 
5633   return true;
5634 }
5635 
5636 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
5637                        bool &ReverseVEXT, unsigned &Imm) {
5638   unsigned NumElts = VT.getVectorNumElements();
5639   ReverseVEXT = false;
5640 
5641   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5642   if (M[0] < 0)
5643     return false;
5644 
5645   Imm = M[0];
5646 
5647   // If this is a VEXT shuffle, the immediate value is the index of the first
5648   // element.  The other shuffle indices must be the successive elements after
5649   // the first one.
5650   unsigned ExpectedElt = Imm;
5651   for (unsigned i = 1; i < NumElts; ++i) {
5652     // Increment the expected index.  If it wraps around, it may still be
5653     // a VEXT but the source vectors must be swapped.
5654     ExpectedElt += 1;
5655     if (ExpectedElt == NumElts * 2) {
5656       ExpectedElt = 0;
5657       ReverseVEXT = true;
5658     }
5659 
5660     if (M[i] < 0) continue; // ignore UNDEF indices
5661     if (ExpectedElt != static_cast<unsigned>(M[i]))
5662       return false;
5663   }
5664 
5665   // Adjust the index value if the source operands will be swapped.
5666   if (ReverseVEXT)
5667     Imm -= NumElts;
5668 
5669   return true;
5670 }
5671 
5672 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
5673 /// instruction with the specified blocksize.  (The order of the elements
5674 /// within each block of the vector is reversed.)
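/// For example, a VREV64.16 of v8i16 uses the mask <3, 2, 1, 0, 7, 6, 5, 4>:
/// each group of four 16-bit elements (one 64-bit block) is reversed.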
5675 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5676   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
5677          "Only possible block sizes for VREV are: 16, 32, 64");
5678 
5679   unsigned EltSz = VT.getScalarSizeInBits();
5680   if (EltSz == 64)
5681     return false;
5682 
5683   unsigned NumElts = VT.getVectorNumElements();
5684   unsigned BlockElts = M[0] + 1;
5685   // If the first shuffle index is UNDEF, be optimistic.
5686   if (M[0] < 0)
5687     BlockElts = BlockSize / EltSz;
5688 
5689   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5690     return false;
5691 
5692   for (unsigned i = 0; i < NumElts; ++i) {
5693     if (M[i] < 0) continue; // ignore UNDEF indices
5694     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5695       return false;
5696   }
5697 
5698   return true;
5699 }
5700 
5701 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5702   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5703   // range, then 0 is placed into the resulting vector. So pretty much any mask
5704   // of 8 elements can work here.
5705   return VT == MVT::v8i8 && M.size() == 8;
5706 }
5707 
5708 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5709 // checking that pairs of elements in the shuffle mask represent the same index
5710 // in each vector, incrementing the expected index by 2 at each step.
5711 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5712 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5713 //  v2={e,f,g,h}
5714 // WhichResult gives the offset for each element in the mask based on which
5715 // of the two results it belongs to.
5716 //
5717 // The transpose can be represented either as:
5718 // result1 = shufflevector v1, v2, result1_shuffle_mask
5719 // result2 = shufflevector v1, v2, result2_shuffle_mask
5720 // where v1/v2 and the shuffle masks have the same number of elements
5721 // (here WhichResult (see below) indicates which result is being checked)
5722 //
5723 // or as:
5724 // results = shufflevector v1, v2, shuffle_mask
5725 // where both results are returned in one vector and the shuffle mask has twice
5726 // as many elements as v1/v2 (here WhichResult will always be 0 if true); in
5727 // this case we check the low half and the high half of the shuffle mask as if
5728 // they were separate masks of the first form.
5729 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5730   unsigned EltSz = VT.getScalarSizeInBits();
5731   if (EltSz == 64)
5732     return false;
5733 
5734   unsigned NumElts = VT.getVectorNumElements();
5735   if (M.size() != NumElts && M.size() != NumElts*2)
5736     return false;
5737 
5738   // If the mask is twice as long as the input vector then we need to check the
5739   // upper and lower parts of the mask with a matching value for WhichResult
5740   // FIXME: A mask with only even values will be rejected in case the first
5741   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
5742   // M[0] is used to determine WhichResult
5743   for (unsigned i = 0; i < M.size(); i += NumElts) {
5744     if (M.size() == NumElts * 2)
5745       WhichResult = i / NumElts;
5746     else
5747       WhichResult = M[i] == 0 ? 0 : 1;
5748     for (unsigned j = 0; j < NumElts; j += 2) {
5749       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5750           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
5751         return false;
5752     }
5753   }
5754 
5755   if (M.size() == NumElts*2)
5756     WhichResult = 0;
5757 
5758   return true;
5759 }
5760 
5761 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
5762 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5763 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
5764 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5765   unsigned EltSz = VT.getScalarSizeInBits();
5766   if (EltSz == 64)
5767     return false;
5768 
5769   unsigned NumElts = VT.getVectorNumElements();
5770   if (M.size() != NumElts && M.size() != NumElts*2)
5771     return false;
5772 
5773   for (unsigned i = 0; i < M.size(); i += NumElts) {
5774     if (M.size() == NumElts * 2)
5775       WhichResult = i / NumElts;
5776     else
5777       WhichResult = M[i] == 0 ? 0 : 1;
5778     for (unsigned j = 0; j < NumElts; j += 2) {
5779       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5780           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
5781         return false;
5782     }
5783   }
5784 
5785   if (M.size() == NumElts*2)
5786     WhichResult = 0;
5787 
5788   return true;
5789 }
5790 
5791 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
5792 // that the mask elements are either all even and in steps of size 2 or all odd
5793 // and in steps of size 2.
5794 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
5795 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
5796 //  v2={e,f,g,h}
5797 // Requires similar checks to those of isVTRNMask with
5798 // respect to how the results are returned.
5799 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5800   unsigned EltSz = VT.getScalarSizeInBits();
5801   if (EltSz == 64)
5802     return false;
5803 
5804   unsigned NumElts = VT.getVectorNumElements();
5805   if (M.size() != NumElts && M.size() != NumElts*2)
5806     return false;
5807 
5808   for (unsigned i = 0; i < M.size(); i += NumElts) {
5809     WhichResult = M[i] == 0 ? 0 : 1;
5810     for (unsigned j = 0; j < NumElts; ++j) {
5811       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
5812         return false;
5813     }
5814   }
5815 
5816   if (M.size() == NumElts*2)
5817     WhichResult = 0;
5818 
5819   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5820   if (VT.is64BitVector() && EltSz == 32)
5821     return false;
5822 
5823   return true;
5824 }
5825 
5826 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
5827 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5828 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
5829 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5830   unsigned EltSz = VT.getScalarSizeInBits();
5831   if (EltSz == 64)
5832     return false;
5833 
5834   unsigned NumElts = VT.getVectorNumElements();
5835   if (M.size() != NumElts && M.size() != NumElts*2)
5836     return false;
5837 
5838   unsigned Half = NumElts / 2;
5839   for (unsigned i = 0; i < M.size(); i += NumElts) {
5840     WhichResult = M[i] == 0 ? 0 : 1;
5841     for (unsigned j = 0; j < NumElts; j += Half) {
5842       unsigned Idx = WhichResult;
5843       for (unsigned k = 0; k < Half; ++k) {
5844         int MIdx = M[i + j + k];
5845         if (MIdx >= 0 && (unsigned) MIdx != Idx)
5846           return false;
5847         Idx += 2;
5848       }
5849     }
5850   }
5851 
5852   if (M.size() == NumElts*2)
5853     WhichResult = 0;
5854 
5855   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5856   if (VT.is64BitVector() && EltSz == 32)
5857     return false;
5858 
5859   return true;
5860 }
5861 
5862 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
5863 // that pairs of elements of the shufflemask represent the same index in each
5864 // vector incrementing sequentially through the vectors.
5865 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
5866 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
5867 //  v2={e,f,g,h}
5868 // Requires similar checks to those of isVTRNMask with respect to how the
5869 // results are returned.
5870 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5871   unsigned EltSz = VT.getScalarSizeInBits();
5872   if (EltSz == 64)
5873     return false;
5874 
5875   unsigned NumElts = VT.getVectorNumElements();
5876   if (M.size() != NumElts && M.size() != NumElts*2)
5877     return false;
5878 
5879   for (unsigned i = 0; i < M.size(); i += NumElts) {
5880     WhichResult = M[i] == 0 ? 0 : 1;
5881     unsigned Idx = WhichResult * NumElts / 2;
5882     for (unsigned j = 0; j < NumElts; j += 2) {
5883       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5884           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
5885         return false;
5886       Idx += 1;
5887     }
5888   }
5889 
5890   if (M.size() == NumElts*2)
5891     WhichResult = 0;
5892 
5893   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5894   if (VT.is64BitVector() && EltSz == 32)
5895     return false;
5896 
5897   return true;
5898 }
5899 
5900 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
5901 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5902 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5903 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5904   unsigned EltSz = VT.getScalarSizeInBits();
5905   if (EltSz == 64)
5906     return false;
5907 
5908   unsigned NumElts = VT.getVectorNumElements();
5909   if (M.size() != NumElts && M.size() != NumElts*2)
5910     return false;
5911 
5912   for (unsigned i = 0; i < M.size(); i += NumElts) {
5913     WhichResult = M[i] == 0 ? 0 : 1;
5914     unsigned Idx = WhichResult * NumElts / 2;
5915     for (unsigned j = 0; j < NumElts; j += 2) {
5916       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5917           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
5918         return false;
5919       Idx += 1;
5920     }
5921   }
5922 
5923   if (M.size() == NumElts*2)
5924     WhichResult = 0;
5925 
5926   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5927   if (VT.is64BitVector() && EltSz == 32)
5928     return false;
5929 
5930   return true;
5931 }
5932 
5933 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
5934 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
5935 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
5936                                            unsigned &WhichResult,
5937                                            bool &isV_UNDEF) {
5938   isV_UNDEF = false;
5939   if (isVTRNMask(ShuffleMask, VT, WhichResult))
5940     return ARMISD::VTRN;
5941   if (isVUZPMask(ShuffleMask, VT, WhichResult))
5942     return ARMISD::VUZP;
5943   if (isVZIPMask(ShuffleMask, VT, WhichResult))
5944     return ARMISD::VZIP;
5945 
5946   isV_UNDEF = true;
5947   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5948     return ARMISD::VTRN;
5949   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5950     return ARMISD::VUZP;
5951   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5952     return ARMISD::VZIP;
5953 
5954   return 0;
5955 }
5956 
5957 /// \return true if this is a reverse operation on a vector.
5958 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
5959   unsigned NumElts = VT.getVectorNumElements();
5960   // Make sure the mask has the right size.
5961   if (NumElts != M.size())
5962       return false;
5963 
5964   // Look for <15, ..., 3, -1, 1, 0>.
5965   for (unsigned i = 0; i != NumElts; ++i)
5966     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
5967       return false;
5968 
5969   return true;
5970 }
5971 
5972 // If N is an integer constant that can be moved into a register in one
5973 // instruction, return an SDValue of such a constant (will become a MOV
5974 // instruction).  Otherwise return null.
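// In Thumb1 mode this means the value or its bitwise complement fits in 8
// bits; otherwise the value or its complement must be a valid ARM modified
// immediate (an 8-bit value rotated right by an even amount).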
5975 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
5976                                      const ARMSubtarget *ST, const SDLoc &dl) {
5977   uint64_t Val;
5978   if (!isa<ConstantSDNode>(N))
5979     return SDValue();
5980   Val = cast<ConstantSDNode>(N)->getZExtValue();
5981 
5982   if (ST->isThumb1Only()) {
5983     if (Val <= 255 || ~Val <= 255)
5984       return DAG.getConstant(Val, dl, MVT::i32);
5985   } else {
5986     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
5987       return DAG.getConstant(Val, dl, MVT::i32);
5988   }
5989   return SDValue();
5990 }
5991 
5992 // If this is a case we can't handle, return null and let the default
5993 // expansion code take care of it.
5994 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
5995                                              const ARMSubtarget *ST) const {
5996   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
5997   SDLoc dl(Op);
5998   EVT VT = Op.getValueType();
5999 
6000   APInt SplatBits, SplatUndef;
6001   unsigned SplatBitSize;
6002   bool HasAnyUndefs;
6003   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
6004     if (SplatUndef.isAllOnesValue())
6005       return DAG.getUNDEF(VT);
6006 
6007     if (SplatBitSize <= 64) {
6008       // Check if an immediate VMOV works.
6009       EVT VmovVT;
6010       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
6011                                       SplatUndef.getZExtValue(), SplatBitSize,
6012                                       DAG, dl, VmovVT, VT.is128BitVector(),
6013                                       VMOVModImm);
6014       if (Val.getNode()) {
6015         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
6016         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6017       }
6018 
6019       // Try an immediate VMVN.
6020       uint64_t NegatedImm = (~SplatBits).getZExtValue();
6021       Val = isNEONModifiedImm(NegatedImm,
6022                                       SplatUndef.getZExtValue(), SplatBitSize,
6023                                       DAG, dl, VmovVT, VT.is128BitVector(),
6024                                       VMVNModImm);
6025       if (Val.getNode()) {
6026         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
6027         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6028       }
6029 
6030       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
6031       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
6032         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
6033         if (ImmVal != -1) {
6034           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
6035           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
6036         }
6037       }
6038     }
6039   }
6040 
6041   // Scan through the operands to see if only one value is used.
6042   //
6043   // As an optimisation, even if more than one value is used it may be more
6044   // profitable to splat with one value and then change some lanes.
6045   //
6046   // Heuristically we decide to do this if the vector has a "dominant" value,
6047   // defined as splatted to more than half of the lanes.
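  // For example, a non-constant <x, x, x, y> is lowered as a VDUP of x
  // followed by a single INSERT_VECTOR_ELT that overwrites lane 3 with y.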
6048   unsigned NumElts = VT.getVectorNumElements();
6049   bool isOnlyLowElement = true;
6050   bool usesOnlyOneValue = true;
6051   bool hasDominantValue = false;
6052   bool isConstant = true;
6053 
6054   // Map of the number of times a particular SDValue appears in the
6055   // element list.
6056   DenseMap<SDValue, unsigned> ValueCounts;
6057   SDValue Value;
6058   for (unsigned i = 0; i < NumElts; ++i) {
6059     SDValue V = Op.getOperand(i);
6060     if (V.isUndef())
6061       continue;
6062     if (i > 0)
6063       isOnlyLowElement = false;
6064     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
6065       isConstant = false;
6066 
6067     ValueCounts.insert(std::make_pair(V, 0));
6068     unsigned &Count = ValueCounts[V];
6069 
6070     // Is this value dominant? (takes up more than half of the lanes)
6071     if (++Count > (NumElts / 2)) {
6072       hasDominantValue = true;
6073       Value = V;
6074     }
6075   }
6076   if (ValueCounts.size() != 1)
6077     usesOnlyOneValue = false;
6078   if (!Value.getNode() && !ValueCounts.empty())
6079     Value = ValueCounts.begin()->first;
6080 
6081   if (ValueCounts.empty())
6082     return DAG.getUNDEF(VT);
6083 
6084   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
6085   // Keep going if we are hitting this case.
6086   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
6087     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
6088 
6089   unsigned EltSize = VT.getScalarSizeInBits();
6090 
6091   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
6092   // i32 and try again.
6093   if (hasDominantValue && EltSize <= 32) {
6094     if (!isConstant) {
6095       SDValue N;
6096 
6097       // If we are VDUPing a value that comes directly from a vector, that will
6098       // cause an unnecessary move to and from a GPR, where instead we could
6099       // just use VDUPLANE. We can only do this if the lane being extracted
6100       // is at a constant index, as the VDUP from lane instructions only have
6101       // constant-index forms.
6102       ConstantSDNode *constIndex;
6103       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6104           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
6105         // We need to create a new undef vector to use for the VDUPLANE if the
6106         // size of the vector from which we get the value is different than the
6107         // size of the vector that we need to create. We will insert the element
6108         // such that the register coalescer will remove unnecessary copies.
6109         if (VT != Value->getOperand(0).getValueType()) {
6110           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
6111                              VT.getVectorNumElements();
6112           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6113                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
6114                         Value, DAG.getConstant(index, dl, MVT::i32)),
6115                            DAG.getConstant(index, dl, MVT::i32));
6116         } else
6117           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6118                         Value->getOperand(0), Value->getOperand(1));
6119       } else
6120         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
6121 
6122       if (!usesOnlyOneValue) {
6123         // The dominant value was splatted as 'N', but we now have to insert
6124         // all differing elements.
6125         for (unsigned I = 0; I < NumElts; ++I) {
6126           if (Op.getOperand(I) == Value)
6127             continue;
6128           SmallVector<SDValue, 3> Ops;
6129           Ops.push_back(N);
6130           Ops.push_back(Op.getOperand(I));
6131           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
6132           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
6133         }
6134       }
6135       return N;
6136     }
6137     if (VT.getVectorElementType().isFloatingPoint()) {
6138       SmallVector<SDValue, 8> Ops;
6139       for (unsigned i = 0; i < NumElts; ++i)
6140         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6141                                   Op.getOperand(i)));
6142       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
6143       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
6144       Val = LowerBUILD_VECTOR(Val, DAG, ST);
6145       if (Val.getNode())
6146         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6147     }
6148     if (usesOnlyOneValue) {
6149       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
6150       if (isConstant && Val.getNode())
6151         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
6152     }
6153   }
6154 
6155   // If all elements are constants and the case above didn't get hit, fall back
6156   // to the default expansion, which will generate a load from the constant
6157   // pool.
6158   if (isConstant)
6159     return SDValue();
6160 
6161   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
6162   if (NumElts >= 4) {
6163     SDValue shuffle = ReconstructShuffle(Op, DAG);
6164     if (shuffle != SDValue())
6165       return shuffle;
6166   }
6167 
6168   if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
6169     // If we haven't found an efficient lowering, try splitting a 128-bit vector
6170     // into two 64-bit vectors; we might discover a better way to lower it.
6171     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
6172     EVT ExtVT = VT.getVectorElementType();
6173     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
6174     SDValue Lower =
6175         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
6176     if (Lower.getOpcode() == ISD::BUILD_VECTOR)
6177       Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
6178     SDValue Upper = DAG.getBuildVector(
6179         HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
6180     if (Upper.getOpcode() == ISD::BUILD_VECTOR)
6181       Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
6182     if (Lower && Upper)
6183       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
6184   }
6185 
6186   // Vectors with 32- or 64-bit elements can be built by directly assigning
6187   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
6188   // will be legalized.
6189   if (EltSize >= 32) {
6190     // Do the expansion with floating-point types, since that is what the VFP
6191     // registers are defined to use, and since i64 is not legal.
6192     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6193     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6194     SmallVector<SDValue, 8> Ops;
6195     for (unsigned i = 0; i < NumElts; ++i)
6196       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
6197     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6198     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6199   }
6200 
6201   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
6202   // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, the default is
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target); for everything else it is
  // materialization element by element on the stack followed by a load.
6207   if (!isConstant && !usesOnlyOneValue) {
6208     SDValue Vec = DAG.getUNDEF(VT);
6209     for (unsigned i = 0 ; i < NumElts; ++i) {
6210       SDValue V = Op.getOperand(i);
6211       if (V.isUndef())
6212         continue;
6213       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
6214       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
6215     }
6216     return Vec;
6217   }
6218 
6219   return SDValue();
6220 }
6221 
6222 // Gather data to see if the operation can be modelled as a
6223 // shuffle in combination with VEXTs.
6224 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
6225                                               SelectionDAG &DAG) const {
6226   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
6227   SDLoc dl(Op);
6228   EVT VT = Op.getValueType();
6229   unsigned NumElts = VT.getVectorNumElements();
6230 
6231   struct ShuffleSourceInfo {
6232     SDValue Vec;
6233     unsigned MinElt = std::numeric_limits<unsigned>::max();
6234     unsigned MaxElt = 0;
6235 
6236     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
6237     // be compatible with the shuffle we intend to construct. As a result
6238     // ShuffleVec will be some sliding window into the original Vec.
6239     SDValue ShuffleVec;
6240 
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
6243     int WindowBase = 0;
6244     int WindowScale = 1;
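    // For example, if ShuffleVec ends up being the extracted high half of a
    // source twice as wide as the output, WindowBase is -NumSrcElts, so
    // source element NumSrcElts lands in lane 0 of ShuffleVec.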
6245 
6246     ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
6247 
6248     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
6249   };
6250 
6251   // First gather all vectors used as an immediate source for this BUILD_VECTOR
6252   // node.
6253   SmallVector<ShuffleSourceInfo, 2> Sources;
6254   for (unsigned i = 0; i < NumElts; ++i) {
6255     SDValue V = Op.getOperand(i);
6256     if (V.isUndef())
6257       continue;
6258     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
6259       // A shuffle can only come from building a vector from various
6260       // elements of other vectors.
6261       return SDValue();
6262     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
6263       // Furthermore, shuffles require a constant mask, whereas extractelts
6264       // accept variable indices.
6265       return SDValue();
6266     }
6267 
6268     // Add this element source to the list if it's not already there.
6269     SDValue SourceVec = V.getOperand(0);
6270     auto Source = llvm::find(Sources, SourceVec);
6271     if (Source == Sources.end())
6272       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
6273 
6274     // Update the minimum and maximum lane number seen.
6275     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
6276     Source->MinElt = std::min(Source->MinElt, EltNo);
6277     Source->MaxElt = std::max(Source->MaxElt, EltNo);
6278   }
6279 
6280   // Currently only do something sane when at most two source vectors
6281   // are involved.
6282   if (Sources.size() > 2)
6283     return SDValue();
6284 
  // Find the smallest element size among the result and the two sources, and
  // use it as the element size for the shuffle_vector we build.
6287   EVT SmallestEltTy = VT.getVectorElementType();
6288   for (auto &Source : Sources) {
6289     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
6290     if (SrcEltTy.bitsLT(SmallestEltTy))
6291       SmallestEltTy = SrcEltTy;
6292   }
6293   unsigned ResMultiplier =
6294       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
6295   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
6296   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
6297 
6298   // If the source vector is too wide or too narrow, we may nevertheless be able
6299   // to construct a compatible shuffle either by concatenating it with UNDEF or
6300   // extracting a suitable range of elements.
6301   for (auto &Src : Sources) {
6302     EVT SrcVT = Src.ShuffleVec.getValueType();
6303 
6304     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
6305       continue;
6306 
6307     // This stage of the search produces a source with the same element type as
6308     // the original, but with a total width matching the BUILD_VECTOR output.
6309     EVT EltVT = SrcVT.getVectorElementType();
6310     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
6311     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
6312 
6313     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
6314       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
6315         return SDValue();
      // The smaller vector can be padded out with UNDEF for free, so widen it
      // to the output size before shuffling.
6318       Src.ShuffleVec =
6319           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
6320                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
6321       continue;
6322     }
6323 
6324     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
6325       return SDValue();
6326 
6327     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // The span is too large for a VEXT to cope with.
6329       return SDValue();
6330     }
6331 
6332     if (Src.MinElt >= NumSrcElts) {
6333       // The extraction can just take the second half
6334       Src.ShuffleVec =
6335           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6336                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6337       Src.WindowBase = -NumSrcElts;
6338     } else if (Src.MaxElt < NumSrcElts) {
6339       // The extraction can just take the first half
6340       Src.ShuffleVec =
6341           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6342                       DAG.getConstant(0, dl, MVT::i32));
6343     } else {
6344       // An actual VEXT is needed
6345       SDValue VEXTSrc1 =
6346           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6347                       DAG.getConstant(0, dl, MVT::i32));
6348       SDValue VEXTSrc2 =
6349           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6350                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6351 
6352       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
6353                                    VEXTSrc2,
6354                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
6355       Src.WindowBase = -Src.MinElt;
6356     }
6357   }
6358 
6359   // Another possible incompatibility occurs from the vector element types. We
6360   // can fix this by bitcasting the source vectors to the same type we intend
6361   // for the shuffle.
6362   for (auto &Src : Sources) {
6363     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
6364     if (SrcEltTy == SmallestEltTy)
6365       continue;
6366     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
6367     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
6368     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
6369     Src.WindowBase *= Src.WindowScale;
6370   }
6371 
6372   // Final sanity check before we try to actually produce a shuffle.
6373   DEBUG(
6374     for (auto Src : Sources)
6375       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
6376   );
6377 
6378   // The stars all align, our next step is to produce the mask for the shuffle.
6379   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
6380   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
6381   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
6382     SDValue Entry = Op.getOperand(i);
6383     if (Entry.isUndef())
6384       continue;
6385 
6386     auto Src = llvm::find(Sources, Entry.getOperand(0));
6387     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
6388 
6389     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only the low std::min(SrcBits, DestBits) bits of each element
    // actually get defined in this segment.
6392     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
6393     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
6394                                VT.getScalarSizeInBits());
6395     int LanesDefined = BitsDefined / BitsPerShuffleLane;
6396 
6397     // This source is expected to fill ResMultiplier lanes of the final shuffle,
6398     // starting at the appropriate offset.
6399     int *LaneMask = &Mask[i * ResMultiplier];
6400 
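    // Mask entries index into the concatenation of the two shuffle operands,
    // so lanes taken from the second source are biased by NumElts.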
6401     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
6402     ExtractBase += NumElts * (Src - Sources.begin());
6403     for (int j = 0; j < LanesDefined; ++j)
6404       LaneMask[j] = ExtractBase + j;
6405   }
6406 
6407   // Final check before we try to produce nonsense...
6408   if (!isShuffleMaskLegal(Mask, ShuffleVT))
6409     return SDValue();
6410 
6411   // We can't handle more than two sources. This should have already
6412   // been checked before this point.
6413   assert(Sources.size() <= 2 && "Too many sources!");
6414 
6415   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
6416   for (unsigned i = 0; i < Sources.size(); ++i)
6417     ShuffleOps[i] = Sources[i].ShuffleVec;
6418 
6419   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
6420                                          ShuffleOps[1], Mask);
6421   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
6422 }
6423 
6424 /// isShuffleMaskLegal - Targets can use this to indicate that they only
6425 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6426 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6427 /// are assumed to be legal.
6428 bool
6429 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
6430                                       EVT VT) const {
6431   if (VT.getVectorNumElements() == 4 &&
6432       (VT.is128BitVector() || VT.is64BitVector())) {
6433     unsigned PFIndexes[4];
6434     for (unsigned i = 0; i != 4; ++i) {
6435       if (M[i] < 0)
6436         PFIndexes[i] = 8;
6437       else
6438         PFIndexes[i] = M[i];
6439     }
6440 
6441     // Compute the index in the perfect shuffle table.
6442     unsigned PFTableIndex =
6443       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6444     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6445     unsigned Cost = (PFEntry >> 30);
6446 
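    // If the table can produce this mask cheaply, treat it as legal; the
    // shuffle lowering will synthesize it via GeneratePerfectShuffle.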
6447     if (Cost <= 4)
6448       return true;
6449   }
6450 
6451   bool ReverseVEXT, isV_UNDEF;
6452   unsigned Imm, WhichResult;
6453 
6454   unsigned EltSize = VT.getScalarSizeInBits();
6455   return (EltSize >= 32 ||
6456           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
6457           isVREVMask(M, VT, 64) ||
6458           isVREVMask(M, VT, 32) ||
6459           isVREVMask(M, VT, 16) ||
6460           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
6461           isVTBLMask(M, VT) ||
6462           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
6463           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
6464 }
6465 
6466 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
6467 /// the specified operations to build the shuffle.
6468 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
6469                                       SDValue RHS, SelectionDAG &DAG,
6470                                       const SDLoc &dl) {
6471   unsigned OpNum = (PFEntry >> 26) & 0x0F;
6472   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
6473   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
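  // LHSID and RHSID are themselves indices into the perfect-shuffle table:
  // each one packs a 4-element mask as a base-9 number, with 8 denoting an
  // undef lane, mirroring the PFTableIndex computation in the callers.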
6474 
6475   enum {
6476     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
6477     OP_VREV,
6478     OP_VDUP0,
6479     OP_VDUP1,
6480     OP_VDUP2,
6481     OP_VDUP3,
6482     OP_VEXT1,
6483     OP_VEXT2,
6484     OP_VEXT3,
6485     OP_VUZPL, // VUZP, left result
6486     OP_VUZPR, // VUZP, right result
6487     OP_VZIPL, // VZIP, left result
6488     OP_VZIPR, // VZIP, right result
6489     OP_VTRNL, // VTRN, left result
6490     OP_VTRNR  // VTRN, right result
6491   };
6492 
6493   if (OpNum == OP_COPY) {
6494     if (LHSID == (1*9+2)*9+3) return LHS;
6495     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
6496     return RHS;
6497   }
6498 
6499   SDValue OpLHS, OpRHS;
6500   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
6501   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
6502   EVT VT = OpLHS.getValueType();
6503 
6504   switch (OpNum) {
6505   default: llvm_unreachable("Unknown shuffle opcode!");
6506   case OP_VREV:
    // VREV divides the vector in half and swaps within each half.
6508     if (VT.getVectorElementType() == MVT::i32 ||
6509         VT.getVectorElementType() == MVT::f32)
6510       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
6511     // vrev <4 x i16> -> VREV32
6512     if (VT.getVectorElementType() == MVT::i16)
6513       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
6514     // vrev <4 x i8> -> VREV16
6515     assert(VT.getVectorElementType() == MVT::i8);
6516     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
6517   case OP_VDUP0:
6518   case OP_VDUP1:
6519   case OP_VDUP2:
6520   case OP_VDUP3:
6521     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6522                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
6523   case OP_VEXT1:
6524   case OP_VEXT2:
6525   case OP_VEXT3:
6526     return DAG.getNode(ARMISD::VEXT, dl, VT,
6527                        OpLHS, OpRHS,
6528                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
6529   case OP_VUZPL:
6530   case OP_VUZPR:
6531     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
6532                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
6533   case OP_VZIPL:
6534   case OP_VZIPR:
6535     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
6536                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
6537   case OP_VTRNL:
6538   case OP_VTRNR:
6539     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
6540                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
6541   }
6542 }
6543 
6544 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
6545                                        ArrayRef<int> ShuffleMask,
6546                                        SelectionDAG &DAG) {
6547   // Check to see if we can use the VTBL instruction.
6548   SDValue V1 = Op.getOperand(0);
6549   SDValue V2 = Op.getOperand(1);
6550   SDLoc DL(Op);
6551 
6552   SmallVector<SDValue, 8> VTBLMask;
6553   for (ArrayRef<int>::iterator
6554          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
6555     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
6556 
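  // VTBL1 indexes a single 64-bit table register with each mask byte, while
  // VTBL2 indexes the concatenation of two. Out-of-range mask bytes (such as
  // the -1 used for undef lanes) produce zero, which is harmless here.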
6557   if (V2.getNode()->isUndef())
6558     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
6559                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6560 
6561   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
6562                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6563 }
6564 
6565 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
6566                                                       SelectionDAG &DAG) {
6567   SDLoc DL(Op);
6568   SDValue OpLHS = Op.getOperand(0);
6569   EVT VT = OpLHS.getValueType();
6570 
6571   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
6572          "Expect an v8i16/v16i8 type");
6573   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: after the VREV64 the vector is <7, 6, ..., 0, 15, 14,
  // ..., 8>. The VEXT below then moves the first 8 bytes into the top double
  // word and the last 8 bytes into the bottom double word, completing the
  // reversal. The v8i16 case is similar.
6577   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
6578   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
6579                      DAG.getConstant(ExtractNum, DL, MVT::i32));
6580 }
6581 
6582 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
6583   SDValue V1 = Op.getOperand(0);
6584   SDValue V2 = Op.getOperand(1);
6585   SDLoc dl(Op);
6586   EVT VT = Op.getValueType();
6587   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
6588 
6589   // Convert shuffles that are directly supported on NEON to target-specific
6590   // DAG nodes, instead of keeping them as shuffles and matching them again
6591   // during code selection.  This is more efficient and avoids the possibility
6592   // of inconsistencies between legalization and selection.
6593   // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same type so that they get CSEd properly.
6595   ArrayRef<int> ShuffleMask = SVN->getMask();
6596 
6597   unsigned EltSize = VT.getScalarSizeInBits();
6598   if (EltSize <= 32) {
6599     if (SVN->isSplat()) {
6600       int Lane = SVN->getSplatIndex();
      // If this is an undef splat, generate it via "just" vdup, if possible.
6602       if (Lane == -1) Lane = 0;
6603 
6604       // Test if V1 is a SCALAR_TO_VECTOR.
6605       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
6606         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
6607       }
6608       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
6609       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
6610       // reaches it).
6611       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
6612           !isa<ConstantSDNode>(V1.getOperand(0))) {
6613         bool IsScalarToVector = true;
6614         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
6615           if (!V1.getOperand(i).isUndef()) {
6616             IsScalarToVector = false;
6617             break;
6618           }
6619         if (IsScalarToVector)
6620           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
6621       }
6622       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
6623                          DAG.getConstant(Lane, dl, MVT::i32));
6624     }
6625 
6626     bool ReverseVEXT;
6627     unsigned Imm;
6628     if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
6629       if (ReverseVEXT)
6630         std::swap(V1, V2);
6631       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
6632                          DAG.getConstant(Imm, dl, MVT::i32));
6633     }
6634 
6635     if (isVREVMask(ShuffleMask, VT, 64))
6636       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
6637     if (isVREVMask(ShuffleMask, VT, 32))
6638       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
6639     if (isVREVMask(ShuffleMask, VT, 16))
6640       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
6641 
6642     if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
6643       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
6644                          DAG.getConstant(Imm, dl, MVT::i32));
6645     }
6646 
6647     // Check for Neon shuffles that modify both input vectors in place.
6648     // If both results are used, i.e., if there are two shuffles with the same
6649     // source operands and with masks corresponding to both results of one of
6650     // these operations, DAG memoization will ensure that a single node is
6651     // used for both shuffles.
6652     unsigned WhichResult;
6653     bool isV_UNDEF;
6654     if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6655             ShuffleMask, VT, WhichResult, isV_UNDEF)) {
6656       if (isV_UNDEF)
6657         V2 = V1;
6658       return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
6659           .getValue(WhichResult);
6660     }
6661 
6662     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
6663     // shuffles that produce a result larger than their operands with:
6664     //   shuffle(concat(v1, undef), concat(v2, undef))
6665     // ->
6666     //   shuffle(concat(v1, v2), undef)
6667     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
6668     //
6669     // This is useful in the general case, but there are special cases where
6670     // native shuffles produce larger results: the two-result ops.
6671     //
6672     // Look through the concat when lowering them:
6673     //   shuffle(concat(v1, v2), undef)
6674     // ->
6675     //   concat(VZIP(v1, v2):0, :1)
6676     //
6677     if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
6678       SDValue SubV1 = V1->getOperand(0);
6679       SDValue SubV2 = V1->getOperand(1);
6680       EVT SubVT = SubV1.getValueType();
6681 
6682       // We expect these to have been canonicalized to -1.
6683       assert(llvm::all_of(ShuffleMask, [&](int i) {
6684         return i < (int)VT.getVectorNumElements();
6685       }) && "Unexpected shuffle index into UNDEF operand!");
6686 
6687       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6688               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
6689         if (isV_UNDEF)
6690           SubV2 = SubV1;
6691         assert((WhichResult == 0) &&
6692                "In-place shuffle of concat can only have one result!");
6693         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
6694                                   SubV1, SubV2);
6695         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
6696                            Res.getValue(1));
6697       }
6698     }
6699   }
6700 
6701   // If the shuffle is not directly supported and it has 4 elements, use
6702   // the PerfectShuffle-generated table to synthesize it from other shuffles.
6703   unsigned NumElts = VT.getVectorNumElements();
6704   if (NumElts == 4) {
6705     unsigned PFIndexes[4];
6706     for (unsigned i = 0; i != 4; ++i) {
6707       if (ShuffleMask[i] < 0)
6708         PFIndexes[i] = 8;
6709       else
6710         PFIndexes[i] = ShuffleMask[i];
6711     }
6712 
6713     // Compute the index in the perfect shuffle table.
6714     unsigned PFTableIndex =
6715       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6716     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6717     unsigned Cost = (PFEntry >> 30);
6718 
6719     if (Cost <= 4)
6720       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
6721   }
6722 
6723   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
6724   if (EltSize >= 32) {
6725     // Do the expansion with floating-point types, since that is what the VFP
6726     // registers are defined to use, and since i64 is not legal.
6727     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6728     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6729     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
6730     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
6731     SmallVector<SDValue, 8> Ops;
6732     for (unsigned i = 0; i < NumElts; ++i) {
6733       if (ShuffleMask[i] < 0)
6734         Ops.push_back(DAG.getUNDEF(EltVT));
6735       else
6736         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6737                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
6738                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
6739                                                   dl, MVT::i32)));
6740     }
6741     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6742     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6743   }
6744 
6745   if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
6746     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
6747 
6748   if (VT == MVT::v8i8)
6749     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
6750       return NewOp;
6751 
6752   return SDValue();
6753 }
6754 
6755 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6756   // INSERT_VECTOR_ELT is legal only for immediate indexes.
6757   SDValue Lane = Op.getOperand(2);
6758   if (!isa<ConstantSDNode>(Lane))
6759     return SDValue();
6760 
6761   return Op;
6762 }
6763 
6764 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6765   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
6766   SDValue Lane = Op.getOperand(1);
6767   if (!isa<ConstantSDNode>(Lane))
6768     return SDValue();
6769 
6770   SDValue Vec = Op.getOperand(0);
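  // When the result is i32 but the lane is narrower, extract it with
  // VGETLANEu, which zero-extends the element to fill the i32 result.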
6771   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
6772     SDLoc dl(Op);
6773     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
6774   }
6775 
6776   return Op;
6777 }
6778 
6779 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6780   // The only time a CONCAT_VECTORS operation can have legal types is when
6781   // two 64-bit vectors are concatenated to a 128-bit vector.
6782   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
6783          "unexpected CONCAT_VECTORS");
6784   SDLoc dl(Op);
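  // Bitcast each 64-bit operand to f64 and insert it as a lane of a v2f64,
  // then bitcast the result back; this keeps both halves in D registers.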
6785   SDValue Val = DAG.getUNDEF(MVT::v2f64);
6786   SDValue Op0 = Op.getOperand(0);
6787   SDValue Op1 = Op.getOperand(1);
6788   if (!Op0.isUndef())
6789     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6790                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
6791                       DAG.getIntPtrConstant(0, dl));
6792   if (!Op1.isUndef())
6793     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6794                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
6795                       DAG.getIntPtrConstant(1, dl));
6796   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
6797 }
6798 
6799 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
6800 /// element has been zero/sign-extended, depending on the isSigned parameter,
6801 /// from an integer type half its size.
6802 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
6803                                    bool isSigned) {
6804   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
6805   EVT VT = N->getValueType(0);
6806   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
6807     SDNode *BVN = N->getOperand(0).getNode();
6808     if (BVN->getValueType(0) != MVT::v4i32 ||
6809         BVN->getOpcode() != ISD::BUILD_VECTOR)
6810       return false;
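    // Each i64 element of the result is split across two consecutive v4i32
    // elements; which one holds the low 32 bits depends on endianness.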
6811     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6812     unsigned HiElt = 1 - LoElt;
6813     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
6814     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
6815     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
6816     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
6817     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
6818       return false;
6819     if (isSigned) {
6820       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
6821           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
6822         return true;
6823     } else {
6824       if (Hi0->isNullValue() && Hi1->isNullValue())
6825         return true;
6826     }
6827     return false;
6828   }
6829 
6830   if (N->getOpcode() != ISD::BUILD_VECTOR)
6831     return false;
6832 
6833   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
6834     SDNode *Elt = N->getOperand(i).getNode();
6835     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
6836       unsigned EltSize = VT.getScalarSizeInBits();
6837       unsigned HalfSize = EltSize / 2;
6838       if (isSigned) {
6839         if (!isIntN(HalfSize, C->getSExtValue()))
6840           return false;
6841       } else {
6842         if (!isUIntN(HalfSize, C->getZExtValue()))
6843           return false;
6844       }
6845       continue;
6846     }
6847     return false;
6848   }
6849 
6850   return true;
6851 }
6852 
6853 /// isSignExtended - Check if a node is a vector value that is sign-extended
6854 /// or a constant BUILD_VECTOR with sign-extended elements.
6855 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
6856   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
6857     return true;
6858   if (isExtendedBUILD_VECTOR(N, DAG, true))
6859     return true;
6860   return false;
6861 }
6862 
6863 /// isZeroExtended - Check if a node is a vector value that is zero-extended
6864 /// or a constant BUILD_VECTOR with zero-extended elements.
6865 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
6866   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
6867     return true;
6868   if (isExtendedBUILD_VECTOR(N, DAG, false))
6869     return true;
6870   return false;
6871 }
6872 
6873 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
6874   if (OrigVT.getSizeInBits() >= 64)
6875     return OrigVT;
6876 
6877   assert(OrigVT.isSimple() && "Expecting a simple value type");
6878 
6879   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
6880   switch (OrigSimpleTy) {
6881   default: llvm_unreachable("Unexpected Vector Type");
6882   case MVT::v2i8:
6883   case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
6887   }
6888 }
6889 
6890 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
6891 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D
/// register.
6893 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
6894                                             const EVT &OrigTy,
6895                                             const EVT &ExtTy,
6896                                             unsigned ExtOpcode) {
6897   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
6898   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
6899   // 64-bits we need to insert a new extension so that it will be 64-bits.
6900   assert(ExtTy.is128BitVector() && "Unexpected extension size");
6901   if (OrigTy.getSizeInBits() >= 64)
6902     return N;
6903 
6904   // Must extend size to at least 64 bits to be used as an operand for VMULL.
6905   EVT NewVT = getExtensionTo64Bits(OrigTy);
6906 
6907   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
6908 }
6909 
6910 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
6911 /// does not do any sign/zero extension. If the original vector is less
6912 /// than 64 bits, an appropriate extension will be added after the load to
6913 /// reach a total size of 64 bits. We have to add the extension separately
6914 /// because ARM does not have a sign/zero extending load for vectors.
6915 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
6916   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
6917 
6918   // The load already has the right type.
6919   if (ExtendedTy == LD->getMemoryVT())
6920     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
6921                        LD->getBasePtr(), LD->getPointerInfo(),
6922                        LD->getAlignment(), LD->getMemOperand()->getFlags());
6923 
6924   // We need to create a zextload/sextload. We cannot just create a load
  // followed by a sext/zext node because LowerMUL is also run during normal
6926   // operation legalization where we can't create illegal types.
6927   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
6928                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
6929                         LD->getMemoryVT(), LD->getAlignment(),
6930                         LD->getMemOperand()->getFlags());
6931 }
6932 
6933 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
6934 /// extending load, or BUILD_VECTOR with extended elements, return the
6935 /// unextended value. The unextended vector should be 64 bits so that it can
6936 /// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits, we add an extension to resize
6938 /// the vector to 64 bits.
6939 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
6940   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
6941     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
6942                                         N->getOperand(0)->getValueType(0),
6943                                         N->getValueType(0),
6944                                         N->getOpcode());
6945 
6946   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
6947     return SkipLoadExtensionForVMULL(LD, DAG);
6948 
6949   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
6950   // have been legalized as a BITCAST from v4i32.
6951   if (N->getOpcode() == ISD::BITCAST) {
6952     SDNode *BVN = N->getOperand(0).getNode();
6953     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
6954            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
6955     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6956     return DAG.getBuildVector(
6957         MVT::v2i32, SDLoc(N),
6958         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
6959   }
6960   // Construct a new BUILD_VECTOR with elements truncated to half the size.
6961   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
6962   EVT VT = N->getValueType(0);
6963   unsigned EltSize = VT.getScalarSizeInBits() / 2;
6964   unsigned NumElts = VT.getVectorNumElements();
6965   MVT TruncVT = MVT::getIntegerVT(EltSize);
6966   SmallVector<SDValue, 8> Ops;
6967   SDLoc dl(N);
6968   for (unsigned i = 0; i != NumElts; ++i) {
6969     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
6970     const APInt &CInt = C->getAPIntValue();
6971     // Element types smaller than 32 bits are not legal, so use i32 elements.
6972     // The values are implicitly truncated so sext vs. zext doesn't matter.
6973     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
6974   }
6975   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
6976 }
6977 
6978 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
6979   unsigned Opcode = N->getOpcode();
6980   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6981     SDNode *N0 = N->getOperand(0).getNode();
6982     SDNode *N1 = N->getOperand(1).getNode();
6983     return N0->hasOneUse() && N1->hasOneUse() &&
6984       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
6985   }
6986   return false;
6987 }
6988 
6989 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
6990   unsigned Opcode = N->getOpcode();
6991   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6992     SDNode *N0 = N->getOperand(0).getNode();
6993     SDNode *N1 = N->getOperand(1).getNode();
6994     return N0->hasOneUse() && N1->hasOneUse() &&
6995       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
6996   }
6997   return false;
6998 }
6999 
7000 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
7001   // Multiplications are only custom-lowered for 128-bit vectors so that
7002   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
7003   EVT VT = Op.getValueType();
7004   assert(VT.is128BitVector() && VT.isInteger() &&
7005          "unexpected type for custom-lowering ISD::MUL");
7006   SDNode *N0 = Op.getOperand(0).getNode();
7007   SDNode *N1 = Op.getOperand(1).getNode();
7008   unsigned NewOpc = 0;
7009   bool isMLA = false;
7010   bool isN0SExt = isSignExtended(N0, DAG);
7011   bool isN1SExt = isSignExtended(N1, DAG);
7012   if (isN0SExt && isN1SExt)
7013     NewOpc = ARMISD::VMULLs;
7014   else {
7015     bool isN0ZExt = isZeroExtended(N0, DAG);
7016     bool isN1ZExt = isZeroExtended(N1, DAG);
7017     if (isN0ZExt && isN1ZExt)
7018       NewOpc = ARMISD::VMULLu;
7019     else if (isN1SExt || isN1ZExt) {
7020       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
7021       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
7022       if (isN1SExt && isAddSubSExt(N0, DAG)) {
7023         NewOpc = ARMISD::VMULLs;
7024         isMLA = true;
7025       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
7026         NewOpc = ARMISD::VMULLu;
7027         isMLA = true;
7028       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
7029         std::swap(N0, N1);
7030         NewOpc = ARMISD::VMULLu;
7031         isMLA = true;
7032       }
7033     }
7034 
7035     if (!NewOpc) {
7036       if (VT == MVT::v2i64)
7037         // Fall through to expand this.  It is not legal.
7038         return SDValue();
7039       else
7040         // Other vector multiplications are legal.
7041         return Op;
7042     }
7043   }
7044 
7045   // Legalize to a VMULL instruction.
7046   SDLoc DL(Op);
7047   SDValue Op0;
7048   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
7049   if (!isMLA) {
7050     Op0 = SkipExtensionForVMULL(N0, DAG);
7051     assert(Op0.getValueType().is64BitVector() &&
7052            Op1.getValueType().is64BitVector() &&
7053            "unexpected types for extended operands to VMULL");
7054     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
7055   }
7056 
  // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
7059   //   vmull q0, d4, d6
7060   //   vmlal q0, d5, d6
7061   // is faster than
7062   //   vaddl q0, d4, d5
7063   //   vmovl q1, d6
7064   //   vmul  q0, q0, q1
7065   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
7066   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
7067   EVT Op1VT = Op1.getValueType();
7068   return DAG.getNode(N0->getOpcode(), DL, VT,
7069                      DAG.getNode(NewOpc, DL, VT,
7070                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
7071                      DAG.getNode(NewOpc, DL, VT,
7072                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
7073 }
7074 
7075 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
7076                               SelectionDAG &DAG) {
7077   // TODO: Should this propagate fast-math-flags?
7078 
7079   // Convert to float
7080   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
7081   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
7082   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
7083   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
7084   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
7085   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
7086   // Get reciprocal estimate.
7087   // float4 recip = vrecpeq_f32(yf);
7088   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7089                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7090                    Y);
7091   // Because char has a smaller range than uchar, we can actually get away
7092   // without any newton steps.  This requires that we use a weird bias
7093   // of 0xb000, however (again, this has been exhaustively tested).
7094   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
7095   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
7096   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
7097   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
7098   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
7099   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
7100   // Convert back to short.
7101   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
7102   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
7103   return X;
7104 }
7105 
7106 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
7107                                SelectionDAG &DAG) {
7108   // TODO: Should this propagate fast-math-flags?
7109 
7110   SDValue N2;
7111   // Convert to float.
7112   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
7113   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
7114   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
7115   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
7116   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7117   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7118 
7119   // Use reciprocal estimate and one refinement step.
7120   // float4 recip = vrecpeq_f32(yf);
7121   // recip *= vrecpsq_f32(yf, recip);
7122   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7123                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7124                    N1);
7125   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7126                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7127                    N1, N2);
7128   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7129   // Because short has a smaller range than ushort, we can actually get away
7130   // with only a single newton step.  This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
7132   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
7133   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7134   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7135   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
7136   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7137   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7138   // Convert back to integer and return.
7139   // return vmovn_s32(vcvt_s32_f32(result));
7140   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7141   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7142   return N0;
7143 }
7144 
7145 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
7146   EVT VT = Op.getValueType();
7147   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7148          "unexpected type for custom-lowering ISD::SDIV");
7149 
7150   SDLoc dl(Op);
7151   SDValue N0 = Op.getOperand(0);
7152   SDValue N1 = Op.getOperand(1);
7153   SDValue N2, N3;
7154 
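  // For v8i8, sign-extend to v8i16, divide the low and high v4i16 halves
  // separately, concatenate the results, and truncate back down to v8i8.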
7155   if (VT == MVT::v8i8) {
7156     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
7157     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
7158 
7159     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7160                      DAG.getIntPtrConstant(4, dl));
7161     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7162                      DAG.getIntPtrConstant(4, dl));
7163     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7164                      DAG.getIntPtrConstant(0, dl));
7165     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7166                      DAG.getIntPtrConstant(0, dl));
7167 
7168     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
7169     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
7170 
7171     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7172     N0 = LowerCONCAT_VECTORS(N0, DAG);
7173 
7174     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
7175     return N0;
7176   }
7177   return LowerSDIV_v4i16(N0, N1, dl, DAG);
7178 }
7179 
7180 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
7181   // TODO: Should this propagate fast-math-flags?
7182   EVT VT = Op.getValueType();
7183   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7184          "unexpected type for custom-lowering ISD::UDIV");
7185 
7186   SDLoc dl(Op);
7187   SDValue N0 = Op.getOperand(0);
7188   SDValue N1 = Op.getOperand(1);
7189   SDValue N2, N3;
7190 
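  // For v8i8, zero-extend to v8i16 and divide the two v4i16 halves with the
  // signed v4i16 helper (zero-extended u8 values are well within its range),
  // then narrow back to v8i8 with a saturating signed-to-unsigned VQMOVN.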
7191   if (VT == MVT::v8i8) {
7192     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
7193     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
7194 
7195     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7196                      DAG.getIntPtrConstant(4, dl));
7197     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7198                      DAG.getIntPtrConstant(4, dl));
7199     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7200                      DAG.getIntPtrConstant(0, dl));
7201     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7202                      DAG.getIntPtrConstant(0, dl));
7203 
7204     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
7205     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
7206 
7207     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7208     N0 = LowerCONCAT_VECTORS(N0, DAG);
7209 
7210     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
7211                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
7212                                      MVT::i32),
7213                      N0);
7214     return N0;
7215   }
7216 
  // v4i16 udiv ... Convert to float.
7218   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
7219   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
7220   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
7221   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
7222   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7223   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7224 
7225   // Use reciprocal estimate and two refinement steps.
7226   // float4 recip = vrecpeq_f32(yf);
7227   // recip *= vrecpsq_f32(yf, recip);
7228   // recip *= vrecpsq_f32(yf, recip);
7229   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7230                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7231                    BN1);
7232   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7233                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7234                    BN1, N2);
7235   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7236   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7237                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7238                    BN1, N2);
7239   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7240   // Simply multiplying by the reciprocal estimate can leave us a few ulps
7241   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
7242   // and that it will never cause us to return an answer too large).
7243   // float4 result = as_float4(as_int4(xf*recip) + 2);
7244   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7245   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7246   N1 = DAG.getConstant(2, dl, MVT::v4i32);
7247   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7248   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7249   // Convert back to integer and return.
7250   // return vmovn_u32(vcvt_s32_f32(result));
7251   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7252   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7253   return N0;
7254 }
7255 
7256 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
7257   EVT VT = Op.getNode()->getValueType(0);
7258   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
7259 
7260   unsigned Opc;
7261   bool ExtraOp = false;
7262   switch (Op.getOpcode()) {
7263   default: llvm_unreachable("Invalid code");
7264   case ISD::ADDC: Opc = ARMISD::ADDC; break;
7265   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
7266   case ISD::SUBC: Opc = ARMISD::SUBC; break;
7267   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
7268   }
7269 
7270   if (!ExtraOp)
7271     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7272                        Op.getOperand(1));
7273   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7274                      Op.getOperand(1), Op.getOperand(2));
7275 }
7276 
7277 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
7278   assert(Subtarget->isTargetDarwin());
7279 
  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // whose return values are passed via sret.
7282   SDLoc dl(Op);
7283   SDValue Arg = Op.getOperand(0);
7284   EVT ArgVT = Arg.getValueType();
7285   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
7286   auto PtrVT = getPointerTy(DAG.getDataLayout());
7287 
7288   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7289   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7290 
7291   // Pair of floats / doubles used to pass the result.
7292   Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
7293   auto &DL = DAG.getDataLayout();
7294 
7295   ArgListTy Args;
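  // Under the APCS ABI the {sin, cos} result cannot be returned in registers,
  // so pass a stack slot via sret and load the two fields back after the call.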
7296   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
7297   SDValue SRet;
7298   if (ShouldUseSRet) {
7299     // Create stack object for sret.
7300     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
7301     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
7302     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
7303     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
7304 
7305     ArgListEntry Entry;
7306     Entry.Node = SRet;
7307     Entry.Ty = RetTy->getPointerTo();
7308     Entry.isSExt = false;
7309     Entry.isZExt = false;
7310     Entry.isSRet = true;
7311     Args.push_back(Entry);
7312     RetTy = Type::getVoidTy(*DAG.getContext());
7313   }
7314 
7315   ArgListEntry Entry;
7316   Entry.Node = Arg;
7317   Entry.Ty = ArgTy;
7318   Entry.isSExt = false;
7319   Entry.isZExt = false;
7320   Args.push_back(Entry);
7321 
7322   const char *LibcallName =
7323       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
7324   RTLIB::Libcall LC =
7325       (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
7326   CallingConv::ID CC = getLibcallCallingConv(LC);
7327   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
7328 
7329   TargetLowering::CallLoweringInfo CLI(DAG);
7330   CLI.setDebugLoc(dl)
7331       .setChain(DAG.getEntryNode())
7332       .setCallee(CC, RetTy, Callee, std::move(Args))
7333       .setDiscardResult(ShouldUseSRet);
7334   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7335 
7336   if (!ShouldUseSRet)
7337     return CallResult.first;
7338 
7339   SDValue LoadSin =
7340       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
7341 
7342   // Address of cos field.
7343   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
7344                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
7345   SDValue LoadCos =
7346       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
7347 
7348   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
7349   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
7350                      LoadSin.getValue(0), LoadCos.getValue(0));
7351 }
7352 
7353 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
7354                                                   bool Signed,
7355                                                   SDValue &Chain) const {
7356   EVT VT = Op.getValueType();
7357   assert((VT == MVT::i32 || VT == MVT::i64) &&
7358          "unexpected type for custom lowering DIV");
7359   SDLoc dl(Op);
7360 
7361   const auto &DL = DAG.getDataLayout();
7362   const auto &TLI = DAG.getTargetLoweringInfo();
7363 
7364   const char *Name = nullptr;
7365   if (Signed)
7366     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
7367   else
7368     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
7369 
7370   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
7371 
7372   ARMTargetLowering::ArgListTy Args;
7373 
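  // The __rt_{s,u}div helpers take the divisor as their first argument, so
  // push operand 1 (the divisor) before operand 0 (the dividend).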
7374   for (auto AI : {1, 0}) {
7375     ArgListEntry Arg;
7376     Arg.Node = Op.getOperand(AI);
7377     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
7378     Args.push_back(Arg);
7379   }
7380 
7381   CallLoweringInfo CLI(DAG);
7382   CLI.setDebugLoc(dl)
7383     .setChain(Chain)
7384     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
7385                ES, std::move(Args));
7386 
7387   return LowerCallTo(CLI).first;
7388 }
7389 
7390 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
7391                                             bool Signed) const {
7392   assert(Op.getValueType() == MVT::i32 &&
7393          "unexpected type for custom lowering DIV");
7394   SDLoc dl(Op);
7395 
7396   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
7397                                DAG.getEntryNode(), Op.getOperand(1));
7398 
7399   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7400 }
7401 
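/// WinDBZCheckDenominator - Chain a WIN__DBZCHK node that checks the divisor
/// for zero before the division libcall. For 64-bit divisions the two 32-bit
/// halves are ORed together so a single 32-bit check covers the whole value.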
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
                                      SDValue InChain) {
7403   SDLoc DL(N);
7404   SDValue Op = N->getOperand(1);
7405   if (N->getValueType(0) == MVT::i32)
7406     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
7407   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7408                            DAG.getConstant(0, DL, MVT::i32));
7409   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7410                            DAG.getConstant(1, DL, MVT::i32));
7411   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
7412                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
7413 }
7414 
7415 void ARMTargetLowering::ExpandDIV_Windows(
7416     SDValue Op, SelectionDAG &DAG, bool Signed,
7417     SmallVectorImpl<SDValue> &Results) const {
7418   const auto &DL = DAG.getDataLayout();
7419   const auto &TLI = DAG.getTargetLoweringInfo();
7420 
7421   assert(Op.getValueType() == MVT::i64 &&
7422          "unexpected type for custom lowering DIV");
7423   SDLoc dl(Op);
7424 
  SDValue DBZCHK =
      WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
7426 
7427   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7428 
7429   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
7430   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
7431                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
7432   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
7433 
7434   Results.push_back(Lower);
7435   Results.push_back(Upper);
7436 }
7437 
7438 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
7439   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
7440     // Acquire/Release load/store is not legal for targets without a dmb or
7441     // equivalent available.
7442     return SDValue();
7443 
7444   // Monotonic load/store is legal for all targets.
7445   return Op;
7446 }
7447 
7448 static void ReplaceREADCYCLECOUNTER(SDNode *N,
7449                                     SmallVectorImpl<SDValue> &Results,
7450                                     SelectionDAG &DAG,
7451                                     const ARMSubtarget *Subtarget) {
7452   SDLoc DL(N);
7453   // Under Power Management extensions, the cycle-count is:
7454   //    mrc p15, #0, <Rt>, c9, c13, #0
7455   SDValue Ops[] = { N->getOperand(0), // Chain
7456                     DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
7457                     DAG.getConstant(15, DL, MVT::i32),
7458                     DAG.getConstant(0, DL, MVT::i32),
7459                     DAG.getConstant(9, DL, MVT::i32),
7460                     DAG.getConstant(13, DL, MVT::i32),
7461                     DAG.getConstant(0, DL, MVT::i32)
7462   };
7463 
7464   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
7465                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
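  // Only the low 32 bits of the cycle count are available through this MRC,
  // so pair the value with a zero high word to form the i64 result.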
7466   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
7467                                 DAG.getConstant(0, DL, MVT::i32)));
7468   Results.push_back(Cycles32.getValue(1));
7469 }
7470 
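/// createGPRPairNode - Bundle the low and high 32-bit halves of a 64-bit
/// value into an untyped GPRPair register sequence, as expected by the
/// CMP_SWAP_64 pseudo-instruction below.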
7471 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
7472   SDLoc dl(V.getNode());
7473   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
7474   SDValue VHi = DAG.getAnyExtOrTrunc(
7475       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
7476       dl, MVT::i32);
7477   SDValue RegClass =
7478       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
7479   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
7480   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
7481   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
7482   return SDValue(
7483       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
7484 }
7485 
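/// Expand a 64-bit ATOMIC_CMP_SWAP into the CMP_SWAP_64 pseudo, which takes
/// its comparison and new values as GPRPairs, then split the untyped result
/// back into two i32 values plus the output chain.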
7486 static void ReplaceCMP_SWAP_64Results(SDNode *N,
7487                                       SmallVectorImpl<SDValue> &Results,
7488                                       SelectionDAG &DAG) {
7489   assert(N->getValueType(0) == MVT::i64 &&
7490          "AtomicCmpSwap on types less than 64 should be legal");
7491   SDValue Ops[] = {N->getOperand(1),
7492                    createGPRPairNode(DAG, N->getOperand(2)),
7493                    createGPRPairNode(DAG, N->getOperand(3)),
7494                    N->getOperand(0)};
7495   SDNode *CmpSwap = DAG.getMachineNode(
7496       ARM::CMP_SWAP_64, SDLoc(N),
7497       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
7498 
7499   MachineFunction &MF = DAG.getMachineFunction();
7500   MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
7501   MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
7502   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
7503 
7504   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
7505                                                SDValue(CmpSwap, 0)));
7506   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
7507                                                SDValue(CmpSwap, 0)));
7508   Results.push_back(SDValue(CmpSwap, 2));
7509 }
7510 
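/// Lower FPOWI for MSVCRT targets by converting the integer exponent to
/// floating point and emitting a call to powf/pow.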
7511 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
7512                           SelectionDAG &DAG) {
7513   const auto &TLI = DAG.getTargetLoweringInfo();
7514 
7515   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
7516          "Custom lowering is MSVCRT specific!");
7517 
7518   SDLoc dl(Op);
7519   SDValue Val = Op.getOperand(0);
7520   MVT Ty = Val->getSimpleValueType(0);
7521   SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
7522   SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
7523                                          TLI.getPointerTy(DAG.getDataLayout()));
7524 
7525   TargetLowering::ArgListTy Args;
7526   TargetLowering::ArgListEntry Entry;
7527 
7528   Entry.Node = Val;
7529   Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
7530   Entry.isZExt = true;
7531   Args.push_back(Entry);
7532 
7533   Entry.Node = Exponent;
7534   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
7535   Entry.isZExt = true;
7536   Args.push_back(Entry);
7537 
7538   Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
7539 
7540   // The in-chain to the call is the entry node.  If we are emitting a
7541   // tail call, the chain will be mutated if the node has a non-entry input
7542   // chain.
7543   SDValue InChain = DAG.getEntryNode();
7544   SDValue TCChain = InChain;
7545 
7546   const auto *F = DAG.getMachineFunction().getFunction();
7547   bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
7548               F->getReturnType() == LCRTy;
7549   if (IsTC)
7550     InChain = TCChain;
7551 
7552   TargetLowering::CallLoweringInfo CLI(DAG);
7553   CLI.setDebugLoc(dl)
7554       .setChain(InChain)
7555       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
7556       .setTailCall(IsTC);
7557   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
7558 
7559   // Return the chain (the DAG root) if the call was emitted as a tail call.
7560   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
7561 }
7562 
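/// Dispatch the operations marked as Custom to their ARM-specific lowering
/// routines.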
7563 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7564   switch (Op.getOpcode()) {
7565   default: llvm_unreachable("Don't know how to custom lower this!");
7566   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
7567   case ISD::ConstantPool:
7568     if (Subtarget->genExecuteOnly())
7569       llvm_unreachable("execute-only should not generate constant pools");
7570     return LowerConstantPool(Op, DAG);
7571   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
7572   case ISD::GlobalAddress:
7573     switch (Subtarget->getTargetTriple().getObjectFormat()) {
7574     default: llvm_unreachable("unknown object format");
7575     case Triple::COFF:
7576       return LowerGlobalAddressWindows(Op, DAG);
7577     case Triple::ELF:
7578       return LowerGlobalAddressELF(Op, DAG);
7579     case Triple::MachO:
7580       return LowerGlobalAddressDarwin(Op, DAG);
7581     }
7582   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
7583   case ISD::SELECT:        return LowerSELECT(Op, DAG);
7584   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
7585   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
7586   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
7587   case ISD::VASTART:       return LowerVASTART(Op, DAG);
7588   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
7589   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
7590   case ISD::SINT_TO_FP:
7591   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
7592   case ISD::FP_TO_SINT:
7593   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
7594   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
7595   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
7596   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
7597   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
7598   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
7599   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
7600   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
7601                                                                Subtarget);
7602   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
7603   case ISD::SHL:
7604   case ISD::SRL:
7605   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
7606   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
7607   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
7608   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
7609   case ISD::SRL_PARTS:
7610   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
7611   case ISD::CTTZ:
7612   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
7613   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
7614   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
7615   case ISD::SETCCE:        return LowerSETCCE(Op, DAG);
7616   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
7617   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
7618   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
7619   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
7620   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7621   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
7622   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
7623   case ISD::MUL:           return LowerMUL(Op, DAG);
7624   case ISD::SDIV:
7625     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
7626       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
7627     return LowerSDIV(Op, DAG);
7628   case ISD::UDIV:
7629     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
7630       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
7631     return LowerUDIV(Op, DAG);
7632   case ISD::ADDC:
7633   case ISD::ADDE:
7634   case ISD::SUBC:
7635   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
7636   case ISD::SADDO:
7637   case ISD::UADDO:
7638   case ISD::SSUBO:
7639   case ISD::USUBO:
7640     return LowerXALUO(Op, DAG);
7641   case ISD::ATOMIC_LOAD:
7642   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
7643   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
7644   case ISD::SDIVREM:
7645   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
7646   case ISD::DYNAMIC_STACKALLOC:
7647     if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
7648       return LowerDYNAMIC_STACKALLOC(Op, DAG);
7649     llvm_unreachable("Don't know how to custom lower this!");
7650   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
7651   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
7652   case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
7653   case ARMISD::WIN__DBZCHK: return SDValue();
7654   }
7655 }
7656 
7657 /// ReplaceNodeResults - Replace the results of a node with an illegal result
7658 /// type with new values built out of custom code.
7659 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
7660                                            SmallVectorImpl<SDValue> &Results,
7661                                            SelectionDAG &DAG) const {
7662   SDValue Res;
7663   switch (N->getOpcode()) {
7664   default:
7665     llvm_unreachable("Don't know how to custom expand this!");
7666   case ISD::READ_REGISTER:
7667     ExpandREAD_REGISTER(N, Results, DAG);
7668     break;
7669   case ISD::BITCAST:
7670     Res = ExpandBITCAST(N, DAG);
7671     break;
7672   case ISD::SRL:
7673   case ISD::SRA:
7674     Res = Expand64BitShift(N, DAG, Subtarget);
7675     break;
7676   case ISD::SREM:
7677   case ISD::UREM:
7678     Res = LowerREM(N, DAG);
7679     break;
7680   case ISD::SDIVREM:
7681   case ISD::UDIVREM:
7682     Res = LowerDivRem(SDValue(N, 0), DAG);
7683     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
7684     Results.push_back(Res.getValue(0));
7685     Results.push_back(Res.getValue(1));
7686     return;
7687   case ISD::READCYCLECOUNTER:
7688     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
7689     return;
7690   case ISD::UDIV:
7691   case ISD::SDIV:
7692     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
7693     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
7694                              Results);
7695   case ISD::ATOMIC_CMP_SWAP:
7696     ReplaceCMP_SWAP_64Results(N, Results, DAG);
7697     return;
7698   }
7699   if (Res.getNode())
7700     Results.push_back(Res);
7701 }
7702 
7703 //===----------------------------------------------------------------------===//
7704 //                           ARM Scheduler Hooks
7705 //===----------------------------------------------------------------------===//
7706 
7707 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
7708 /// registers the function context.
7709 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
7710                                                MachineBasicBlock *MBB,
7711                                                MachineBasicBlock *DispatchBB,
7712                                                int FI) const {
7713   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
7714          "ROPI/RWPI not currently supported with SjLj");
7715   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7716   DebugLoc dl = MI.getDebugLoc();
7717   MachineFunction *MF = MBB->getParent();
7718   MachineRegisterInfo *MRI = &MF->getRegInfo();
7719   MachineConstantPool *MCP = MF->getConstantPool();
7720   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
7721   const Function *F = MF->getFunction();
7722 
7723   bool isThumb = Subtarget->isThumb();
7724   bool isThumb2 = Subtarget->isThumb2();
7725 
7726   unsigned PCLabelId = AFI->createPICLabelUId();
7727   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
7728   ARMConstantPoolValue *CPV =
7729     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
7730   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
7731 
7732   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
7733                                            : &ARM::GPRRegClass;
7734 
7735   // Grab constant pool and fixed stack memory operands.
7736   MachineMemOperand *CPMMO =
7737       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
7738                                MachineMemOperand::MOLoad, 4, 4);
7739 
7740   MachineMemOperand *FIMMOSt =
7741       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
7742                                MachineMemOperand::MOStore, 4, 4);
7743 
7744   // Load the address of the dispatch MBB into the jump buffer.
7745   if (isThumb2) {
7746     // Incoming value: jbuf
7747     //   ldr.n  r5, LCPI1_1
7748     //   orr    r5, r5, #1
7749     //   add    r5, pc
7750     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
7751     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7752     BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
7753         .addConstantPoolIndex(CPI)
7754         .addMemOperand(CPMMO)
7755         .add(predOps(ARMCC::AL));
7756     // Set the low bit because of thumb mode.
7757     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7758     BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
7759         .addReg(NewVReg1, RegState::Kill)
7760         .addImm(0x01)
7761         .add(predOps(ARMCC::AL))
7762         .add(condCodeOp());
7763     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7764     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
7765       .addReg(NewVReg2, RegState::Kill)
7766       .addImm(PCLabelId);
7767     BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
7768         .addReg(NewVReg3, RegState::Kill)
7769         .addFrameIndex(FI)
7770         .addImm(36) // &jbuf[1] :: pc
7771         .addMemOperand(FIMMOSt)
7772         .add(predOps(ARMCC::AL));
7773   } else if (isThumb) {
7774     // Incoming value: jbuf
7775     //   ldr.n  r1, LCPI1_4
7776     //   add    r1, pc
7777     //   mov    r2, #1
7778     //   orrs   r1, r2
7779     //   add    r2, $jbuf, #+4 ; &jbuf[1]
7780     //   str    r1, [r2]
7781     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7782     BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
7783         .addConstantPoolIndex(CPI)
7784         .addMemOperand(CPMMO)
7785         .add(predOps(ARMCC::AL));
7786     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7787     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
7788       .addReg(NewVReg1, RegState::Kill)
7789       .addImm(PCLabelId);
7790     // Set the low bit because of thumb mode.
7791     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7792     BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
7793         .addReg(ARM::CPSR, RegState::Define)
7794         .addImm(1)
7795         .add(predOps(ARMCC::AL));
7796     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7797     BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
7798         .addReg(ARM::CPSR, RegState::Define)
7799         .addReg(NewVReg2, RegState::Kill)
7800         .addReg(NewVReg3, RegState::Kill)
7801         .add(predOps(ARMCC::AL));
7802     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7803     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
7804             .addFrameIndex(FI)
7805             .addImm(36); // &jbuf[1] :: pc
7806     BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
7807         .addReg(NewVReg4, RegState::Kill)
7808         .addReg(NewVReg5, RegState::Kill)
7809         .addImm(0)
7810         .addMemOperand(FIMMOSt)
7811         .add(predOps(ARMCC::AL));
7812   } else {
7813     // Incoming value: jbuf
7814     //   ldr  r1, LCPI1_1
7815     //   add  r1, pc, r1
7816     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
7817     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7818     BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
7819         .addConstantPoolIndex(CPI)
7820         .addImm(0)
7821         .addMemOperand(CPMMO)
7822         .add(predOps(ARMCC::AL));
7823     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7824     BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
7825         .addReg(NewVReg1, RegState::Kill)
7826         .addImm(PCLabelId)
7827         .add(predOps(ARMCC::AL));
7828     BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
7829         .addReg(NewVReg2, RegState::Kill)
7830         .addFrameIndex(FI)
7831         .addImm(36) // &jbuf[1] :: pc
7832         .addMemOperand(FIMMOSt)
7833         .add(predOps(ARMCC::AL));
7834   }
7835 }
7836 
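/// EmitSjLjDispatchBlock - Build the SjLj exception dispatch block: load the
/// call site index from the function context, trap if it exceeds the number
/// of landing pads, and otherwise branch through an inline jump table to the
/// matching landing pad.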
7837 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
7838                                               MachineBasicBlock *MBB) const {
7839   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7840   DebugLoc dl = MI.getDebugLoc();
7841   MachineFunction *MF = MBB->getParent();
7842   MachineRegisterInfo *MRI = &MF->getRegInfo();
7843   MachineFrameInfo &MFI = MF->getFrameInfo();
7844   int FI = MFI.getFunctionContextIndex();
7845 
7846   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
7847                                                         : &ARM::GPRnopcRegClass;
7848 
7849   // Get a mapping of the call site numbers to all of the landing pads they're
7850   // associated with.
7851   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
7852   unsigned MaxCSNum = 0;
7853   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
7854        ++BB) {
7855     if (!BB->isEHPad()) continue;
7856 
7857     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
7858     // pad.
7859     for (MachineBasicBlock::iterator
7860            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
7861       if (!II->isEHLabel()) continue;
7862 
7863       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
7864       if (!MF->hasCallSiteLandingPad(Sym)) continue;
7865 
7866       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
7867       for (SmallVectorImpl<unsigned>::iterator
7868              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
7869            CSI != CSE; ++CSI) {
7870         CallSiteNumToLPad[*CSI].push_back(&*BB);
7871         MaxCSNum = std::max(MaxCSNum, *CSI);
7872       }
7873       break;
7874     }
7875   }
7876 
7877   // Get an ordered list of the machine basic blocks for the jump table.
7878   std::vector<MachineBasicBlock*> LPadList;
7879   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
7880   LPadList.reserve(CallSiteNumToLPad.size());
7881   for (unsigned I = 1; I <= MaxCSNum; ++I) {
7882     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
7883     for (SmallVectorImpl<MachineBasicBlock*>::iterator
7884            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
7885       LPadList.push_back(*II);
7886       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
7887     }
7888   }
7889 
7890   assert(!LPadList.empty() &&
7891          "No landing pad destinations for the dispatch jump table!");
7892 
7893   // Create the jump table and associated information.
7894   MachineJumpTableInfo *JTI =
7895     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
7896   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
7897 
7898   // Create the MBBs for the dispatch code.
7899 
7900   // Shove the dispatch's address into the return slot in the function context.
7901   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
7902   DispatchBB->setIsEHPad();
7903 
7904   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7905   unsigned trap_opcode;
7906   if (Subtarget->isThumb())
7907     trap_opcode = ARM::tTRAP;
7908   else
7909     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
7910 
7911   BuildMI(TrapBB, dl, TII->get(trap_opcode));
7912   DispatchBB->addSuccessor(TrapBB);
7913 
7914   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
7915   DispatchBB->addSuccessor(DispContBB);
7916 
7917   // Insert the newly created MBBs into the function.
7918   MF->insert(MF->end(), DispatchBB);
7919   MF->insert(MF->end(), DispContBB);
7920   MF->insert(MF->end(), TrapBB);
7921 
7922   // Insert code into the entry block that creates and registers the function
7923   // context.
7924   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
7925 
7926   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
7927       MachinePointerInfo::getFixedStack(*MF, FI),
7928       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
7929 
7930   MachineInstrBuilder MIB;
7931   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
7932 
7933   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
7934   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
7935 
7936   // Add a register mask with no preserved registers.  This results in all
7937   // registers being marked as clobbered. This can't work if the dispatch block
7938   // is in a Thumb1 function and is linked with ARM code which uses the FP
7939   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
7940   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
7941 
7942   bool IsPositionIndependent = isPositionIndependent();
7943   unsigned NumLPads = LPadList.size();
7944   if (Subtarget->isThumb2()) {
7945     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7946     BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
7947         .addFrameIndex(FI)
7948         .addImm(4)
7949         .addMemOperand(FIMMOLd)
7950         .add(predOps(ARMCC::AL));
7951 
7952     if (NumLPads < 256) {
7953       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
7954           .addReg(NewVReg1)
7955           .addImm(LPadList.size())
7956           .add(predOps(ARMCC::AL));
7957     } else {
7958       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7959       BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
7960           .addImm(NumLPads & 0xFFFF)
7961           .add(predOps(ARMCC::AL));
7962 
7963       unsigned VReg2 = VReg1;
7964       if ((NumLPads & 0xFFFF0000) != 0) {
7965         VReg2 = MRI->createVirtualRegister(TRC);
7966         BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
7967             .addReg(VReg1)
7968             .addImm(NumLPads >> 16)
7969             .add(predOps(ARMCC::AL));
7970       }
7971 
7972       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
7973           .addReg(NewVReg1)
7974           .addReg(VReg2)
7975           .add(predOps(ARMCC::AL));
7976     }
7977 
7978     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
7979       .addMBB(TrapBB)
7980       .addImm(ARMCC::HI)
7981       .addReg(ARM::CPSR);
7982 
7983     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7984     BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
7985         .addJumpTableIndex(MJTI)
7986         .add(predOps(ARMCC::AL));
7987 
7988     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7989     BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
7990         .addReg(NewVReg3, RegState::Kill)
7991         .addReg(NewVReg1)
7992         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
7993         .add(predOps(ARMCC::AL))
7994         .add(condCodeOp());
7995 
7996     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
7997       .addReg(NewVReg4, RegState::Kill)
7998       .addReg(NewVReg1)
7999       .addJumpTableIndex(MJTI);
8000   } else if (Subtarget->isThumb()) {
8001     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
8002     BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
8003         .addFrameIndex(FI)
8004         .addImm(1)
8005         .addMemOperand(FIMMOLd)
8006         .add(predOps(ARMCC::AL));
8007 
8008     if (NumLPads < 256) {
8009       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
8010           .addReg(NewVReg1)
8011           .addImm(NumLPads)
8012           .add(predOps(ARMCC::AL));
8013     } else {
8014       MachineConstantPool *ConstantPool = MF->getConstantPool();
8015       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8016       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
8017 
8018       // MachineConstantPool wants an explicit alignment.
8019       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8020       if (Align == 0)
8021         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8022       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8023 
8024       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8025       BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
8026           .addReg(VReg1, RegState::Define)
8027           .addConstantPoolIndex(Idx)
8028           .add(predOps(ARMCC::AL));
8029       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
8030           .addReg(NewVReg1)
8031           .addReg(VReg1)
8032           .add(predOps(ARMCC::AL));
8033     }
8034 
8035     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
8036       .addMBB(TrapBB)
8037       .addImm(ARMCC::HI)
8038       .addReg(ARM::CPSR);
8039 
8040     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
8041     BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
8042         .addReg(ARM::CPSR, RegState::Define)
8043         .addReg(NewVReg1)
8044         .addImm(2)
8045         .add(predOps(ARMCC::AL));
8046 
8047     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
8048     BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
8049         .addJumpTableIndex(MJTI)
8050         .add(predOps(ARMCC::AL));
8051 
8052     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
8053     BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
8054         .addReg(ARM::CPSR, RegState::Define)
8055         .addReg(NewVReg2, RegState::Kill)
8056         .addReg(NewVReg3)
8057         .add(predOps(ARMCC::AL));
8058 
8059     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
8060         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
8061 
8062     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
8063     BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
8064         .addReg(NewVReg4, RegState::Kill)
8065         .addImm(0)
8066         .addMemOperand(JTMMOLd)
8067         .add(predOps(ARMCC::AL));
8068 
8069     unsigned NewVReg6 = NewVReg5;
8070     if (IsPositionIndependent) {
8071       NewVReg6 = MRI->createVirtualRegister(TRC);
8072       BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
8073           .addReg(ARM::CPSR, RegState::Define)
8074           .addReg(NewVReg5, RegState::Kill)
8075           .addReg(NewVReg3)
8076           .add(predOps(ARMCC::AL));
8077     }
8078 
8079     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
8080       .addReg(NewVReg6, RegState::Kill)
8081       .addJumpTableIndex(MJTI);
8082   } else {
8083     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
8084     BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
8085         .addFrameIndex(FI)
8086         .addImm(4)
8087         .addMemOperand(FIMMOLd)
8088         .add(predOps(ARMCC::AL));
8089 
8090     if (NumLPads < 256) {
8091       BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
8092           .addReg(NewVReg1)
8093           .addImm(NumLPads)
8094           .add(predOps(ARMCC::AL));
8095     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
8096       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8097       BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
8098           .addImm(NumLPads & 0xFFFF)
8099           .add(predOps(ARMCC::AL));
8100 
8101       unsigned VReg2 = VReg1;
8102       if ((NumLPads & 0xFFFF0000) != 0) {
8103         VReg2 = MRI->createVirtualRegister(TRC);
8104         BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
8105             .addReg(VReg1)
8106             .addImm(NumLPads >> 16)
8107             .add(predOps(ARMCC::AL));
8108       }
8109 
8110       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8111           .addReg(NewVReg1)
8112           .addReg(VReg2)
8113           .add(predOps(ARMCC::AL));
8114     } else {
8115       MachineConstantPool *ConstantPool = MF->getConstantPool();
8116       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8117       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
8118 
8119       // MachineConstantPool wants an explicit alignment.
8120       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8121       if (Align == 0)
8122         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8123       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8124 
8125       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8126       BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
8127           .addReg(VReg1, RegState::Define)
8128           .addConstantPoolIndex(Idx)
8129           .addImm(0)
8130           .add(predOps(ARMCC::AL));
8131       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8132           .addReg(NewVReg1)
8133           .addReg(VReg1, RegState::Kill)
8134           .add(predOps(ARMCC::AL));
8135     }
8136 
8137     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
8138       .addMBB(TrapBB)
8139       .addImm(ARMCC::HI)
8140       .addReg(ARM::CPSR);
8141 
8142     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
8143     BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
8144         .addReg(NewVReg1)
8145         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
8146         .add(predOps(ARMCC::AL))
8147         .add(condCodeOp());
8148     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
8149     BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
8150         .addJumpTableIndex(MJTI)
8151         .add(predOps(ARMCC::AL));
8152 
8153     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
8154         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
8155     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
8156     BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
8157         .addReg(NewVReg3, RegState::Kill)
8158         .addReg(NewVReg4)
8159         .addImm(0)
8160         .addMemOperand(JTMMOLd)
8161         .add(predOps(ARMCC::AL));
8162 
8163     if (IsPositionIndependent) {
8164       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
8165         .addReg(NewVReg5, RegState::Kill)
8166         .addReg(NewVReg4)
8167         .addJumpTableIndex(MJTI);
8168     } else {
8169       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
8170         .addReg(NewVReg5, RegState::Kill)
8171         .addJumpTableIndex(MJTI);
8172     }
8173   }
8174 
8175   // Add the jump table entries as successors to the MBB.
8176   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
8177   for (std::vector<MachineBasicBlock*>::iterator
8178          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
8179     MachineBasicBlock *CurMBB = *I;
8180     if (SeenMBBs.insert(CurMBB).second)
8181       DispContBB->addSuccessor(CurMBB);
8182   }
8183 
8184   // N.B. the order the invoke BBs are processed in doesn't matter here.
8185   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
8186   SmallVector<MachineBasicBlock*, 64> MBBLPads;
8187   for (MachineBasicBlock *BB : InvokeBBs) {
8188 
8189     // Remove the landing pad successor from the invoke block and replace it
8190     // with the new dispatch block.
8191     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
8192                                                   BB->succ_end());
8193     while (!Successors.empty()) {
8194       MachineBasicBlock *SMBB = Successors.pop_back_val();
8195       if (SMBB->isEHPad()) {
8196         BB->removeSuccessor(SMBB);
8197         MBBLPads.push_back(SMBB);
8198       }
8199     }
8200 
8201     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
8202     BB->normalizeSuccProbs();
8203 
8204     // Find the invoke call and mark all of the callee-saved registers as
8205     // 'implicitly defined' so that they're spilled. This prevents code from
8206     // moving instructions to before the EH block, where they will never be
8207     // executed.
8208     for (MachineBasicBlock::reverse_iterator
8209            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
8210       if (!II->isCall()) continue;
8211 
8212       DenseMap<unsigned, bool> DefRegs;
8213       for (MachineInstr::mop_iterator
8214              OI = II->operands_begin(), OE = II->operands_end();
8215            OI != OE; ++OI) {
8216         if (!OI->isReg()) continue;
8217         DefRegs[OI->getReg()] = true;
8218       }
8219 
8220       MachineInstrBuilder MIB(*MF, &*II);
8221 
8222       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
8223         unsigned Reg = SavedRegs[i];
8224         if (Subtarget->isThumb2() &&
8225             !ARM::tGPRRegClass.contains(Reg) &&
8226             !ARM::hGPRRegClass.contains(Reg))
8227           continue;
8228         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
8229           continue;
8230         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
8231           continue;
8232         if (!DefRegs[Reg])
8233           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
8234       }
8235 
8236       break;
8237     }
8238   }
8239 
8240   // Mark all former landing pads as non-landing pads. The dispatch is the only
8241   // landing pad now.
8242   for (SmallVectorImpl<MachineBasicBlock*>::iterator
8243          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
8244     (*I)->setIsEHPad(false);
8245 
8246   // The instruction is gone now.
8247   MI.eraseFromParent();
8248 }
8249 
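/// Return the successor of MBB other than Succ; MBB is expected to have
/// exactly two successors.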
8250 static
8251 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
8252   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
8253        E = MBB->succ_end(); I != E; ++I)
8254     if (*I != Succ)
8255       return *I;
8256   llvm_unreachable("Expecting a BB with two successors!");
8257 }
8258 
8259 /// Return the load opcode for a given load size. If the load size is 8 or
8260 /// more, a NEON opcode is returned.
8261 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
8262   if (LdSize >= 8)
8263     return LdSize == 16 ? ARM::VLD1q32wb_fixed
8264                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
8265   if (IsThumb1)
8266     return LdSize == 4 ? ARM::tLDRi
8267                        : LdSize == 2 ? ARM::tLDRHi
8268                                      : LdSize == 1 ? ARM::tLDRBi : 0;
8269   if (IsThumb2)
8270     return LdSize == 4 ? ARM::t2LDR_POST
8271                        : LdSize == 2 ? ARM::t2LDRH_POST
8272                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
8273   return LdSize == 4 ? ARM::LDR_POST_IMM
8274                      : LdSize == 2 ? ARM::LDRH_POST
8275                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
8276 }
8277 
8278 /// Return the store opcode for a given store size. If the store size is 8 or
8279 /// more, a NEON opcode is returned.
8280 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
8281   if (StSize >= 8)
8282     return StSize == 16 ? ARM::VST1q32wb_fixed
8283                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
8284   if (IsThumb1)
8285     return StSize == 4 ? ARM::tSTRi
8286                        : StSize == 2 ? ARM::tSTRHi
8287                                      : StSize == 1 ? ARM::tSTRBi : 0;
8288   if (IsThumb2)
8289     return StSize == 4 ? ARM::t2STR_POST
8290                        : StSize == 2 ? ARM::t2STRH_POST
8291                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
8292   return StSize == 4 ? ARM::STR_POST_IMM
8293                      : StSize == 2 ? ARM::STRH_POST
8294                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
8295 }
8296 
8297 /// Emit a post-increment load operation with the given size. The instructions
8298 /// will be added to BB at Pos.
8299 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8300                        const TargetInstrInfo *TII, const DebugLoc &dl,
8301                        unsigned LdSize, unsigned Data, unsigned AddrIn,
8302                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8303   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
8304   assert(LdOpc != 0 && "Should have a load opcode");
8305   if (LdSize >= 8) {
8306     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8307         .addReg(AddrOut, RegState::Define)
8308         .addReg(AddrIn)
8309         .addImm(0)
8310         .add(predOps(ARMCC::AL));
8311   } else if (IsThumb1) {
8312     // load + update AddrIn
8313     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8314         .addReg(AddrIn)
8315         .addImm(0)
8316         .add(predOps(ARMCC::AL));
8317     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
8318         .add(t1CondCodeOp())
8319         .addReg(AddrIn)
8320         .addImm(LdSize)
8321         .add(predOps(ARMCC::AL));
8322   } else if (IsThumb2) {
8323     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8324         .addReg(AddrOut, RegState::Define)
8325         .addReg(AddrIn)
8326         .addImm(LdSize)
8327         .add(predOps(ARMCC::AL));
8328   } else { // arm
8329     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8330         .addReg(AddrOut, RegState::Define)
8331         .addReg(AddrIn)
8332         .addReg(0)
8333         .addImm(LdSize)
8334         .add(predOps(ARMCC::AL));
8335   }
8336 }
8337 
8338 /// Emit a post-increment store operation with the given size. The instructions
8339 /// will be added to BB at Pos.
8340 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8341                        const TargetInstrInfo *TII, const DebugLoc &dl,
8342                        unsigned StSize, unsigned Data, unsigned AddrIn,
8343                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8344   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
8345   assert(StOpc != 0 && "Should have a store opcode");
8346   if (StSize >= 8) {
8347     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8348         .addReg(AddrIn)
8349         .addImm(0)
8350         .addReg(Data)
8351         .add(predOps(ARMCC::AL));
8352   } else if (IsThumb1) {
8353     // store + update AddrIn
8354     BuildMI(*BB, Pos, dl, TII->get(StOpc))
8355         .addReg(Data)
8356         .addReg(AddrIn)
8357         .addImm(0)
8358         .add(predOps(ARMCC::AL));
8359     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
8360         .add(t1CondCodeOp())
8361         .addReg(AddrIn)
8362         .addImm(StSize)
8363         .add(predOps(ARMCC::AL));
8364   } else if (IsThumb2) {
8365     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8366         .addReg(Data)
8367         .addReg(AddrIn)
8368         .addImm(StSize)
8369         .add(predOps(ARMCC::AL));
8370   } else { // arm
8371     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8372         .addReg(Data)
8373         .addReg(AddrIn)
8374         .addReg(0)
8375         .addImm(StSize)
8376         .add(predOps(ARMCC::AL));
8377   }
8378 }
8379 
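/// EmitStructByval - Expand the struct-byval copy pseudo into either an
/// unrolled sequence of post-incremented loads and stores or a copy loop,
/// depending on the copy size and alignment.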
8380 MachineBasicBlock *
8381 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
8382                                    MachineBasicBlock *BB) const {
8383   // This pseudo instruction has 3 operands: dst, src, size
8384   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
8385   // Otherwise, we will generate unrolled scalar copies.
8386   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8387   const BasicBlock *LLVM_BB = BB->getBasicBlock();
8388   MachineFunction::iterator It = ++BB->getIterator();
8389 
8390   unsigned dest = MI.getOperand(0).getReg();
8391   unsigned src = MI.getOperand(1).getReg();
8392   unsigned SizeVal = MI.getOperand(2).getImm();
8393   unsigned Align = MI.getOperand(3).getImm();
8394   DebugLoc dl = MI.getDebugLoc();
8395 
8396   MachineFunction *MF = BB->getParent();
8397   MachineRegisterInfo &MRI = MF->getRegInfo();
8398   unsigned UnitSize = 0;
8399   const TargetRegisterClass *TRC = nullptr;
8400   const TargetRegisterClass *VecTRC = nullptr;
8401 
8402   bool IsThumb1 = Subtarget->isThumb1Only();
8403   bool IsThumb2 = Subtarget->isThumb2();
8404   bool IsThumb = Subtarget->isThumb();
8405 
8406   if (Align & 1) {
8407     UnitSize = 1;
8408   } else if (Align & 2) {
8409     UnitSize = 2;
8410   } else {
8411     // Check whether we can use NEON instructions.
8412     if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
8413         Subtarget->hasNEON()) {
8414       if ((Align % 16 == 0) && SizeVal >= 16)
8415         UnitSize = 16;
8416       else if ((Align % 8 == 0) && SizeVal >= 8)
8417         UnitSize = 8;
8418     }
8419     // Can't use NEON instructions.
8420     if (UnitSize == 0)
8421       UnitSize = 4;
8422   }
8423 
8424   // Select the correct opcode and register class for unit size load/store
8425   bool IsNeon = UnitSize >= 8;
8426   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
8427   if (IsNeon)
8428     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
8429                             : UnitSize == 8 ? &ARM::DPRRegClass
8430                                             : nullptr;
8431 
8432   unsigned BytesLeft = SizeVal % UnitSize;
8433   unsigned LoopSize = SizeVal - BytesLeft;
8434 
8435   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
8436     // Use LDR and STR to copy.
8437     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
8438     // [destOut] = STR_POST(scratch, destIn, UnitSize)
8439     unsigned srcIn = src;
8440     unsigned destIn = dest;
8441     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
8442       unsigned srcOut = MRI.createVirtualRegister(TRC);
8443       unsigned destOut = MRI.createVirtualRegister(TRC);
8444       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8445       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
8446                  IsThumb1, IsThumb2);
8447       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
8448                  IsThumb1, IsThumb2);
8449       srcIn = srcOut;
8450       destIn = destOut;
8451     }
8452 
8453     // Handle the leftover bytes with LDRB and STRB.
8454     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
8455     // [destOut] = STRB_POST(scratch, destIn, 1)
8456     for (unsigned i = 0; i < BytesLeft; i++) {
8457       unsigned srcOut = MRI.createVirtualRegister(TRC);
8458       unsigned destOut = MRI.createVirtualRegister(TRC);
8459       unsigned scratch = MRI.createVirtualRegister(TRC);
8460       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
8461                  IsThumb1, IsThumb2);
8462       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
8463                  IsThumb1, IsThumb2);
8464       srcIn = srcOut;
8465       destIn = destOut;
8466     }
8467     MI.eraseFromParent(); // The instruction is gone now.
8468     return BB;
8469   }
8470 
8471   // Expand the pseudo op to a loop.
8472   // thisMBB:
8473   //   ...
8474   //   movw varEnd, # --> with thumb2
8475   //   movt varEnd, #
8476   //   ldrcp varEnd, idx --> without thumb2
8477   //   fallthrough --> loopMBB
8478   // loopMBB:
8479   //   PHI varPhi, varEnd, varLoop
8480   //   PHI srcPhi, src, srcLoop
8481   //   PHI destPhi, dst, destLoop
8482   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
8483   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8484   //   subs varLoop, varPhi, #UnitSize
8485   //   bne loopMBB
8486   //   fallthrough --> exitMBB
8487   // exitMBB:
8488   //   epilogue to handle left-over bytes
8489   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8490   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8491   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8492   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8493   MF->insert(It, loopMBB);
8494   MF->insert(It, exitMBB);
8495 
8496   // Transfer the remainder of BB and its successor edges to exitMBB.
8497   exitMBB->splice(exitMBB->begin(), BB,
8498                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
8499   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
8500 
8501   // Load an immediate to varEnd.
8502   unsigned varEnd = MRI.createVirtualRegister(TRC);
8503   if (Subtarget->useMovt(*MF)) {
8504     unsigned Vtmp = varEnd;
8505     if ((LoopSize & 0xFFFF0000) != 0)
8506       Vtmp = MRI.createVirtualRegister(TRC);
8507     BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
8508         .addImm(LoopSize & 0xFFFF)
8509         .add(predOps(ARMCC::AL));
8510 
8511     if ((LoopSize & 0xFFFF0000) != 0)
8512       BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
8513           .addReg(Vtmp)
8514           .addImm(LoopSize >> 16)
8515           .add(predOps(ARMCC::AL));
8516   } else {
8517     MachineConstantPool *ConstantPool = MF->getConstantPool();
8518     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8519     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
8520 
8521     // MachineConstantPool wants an explicit alignment.
8522     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8523     if (Align == 0)
8524       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8525     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8526 
8527     if (IsThumb)
8528       BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
8529           .addReg(varEnd, RegState::Define)
8530           .addConstantPoolIndex(Idx)
8531           .add(predOps(ARMCC::AL));
8532     else
8533       BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
8534           .addReg(varEnd, RegState::Define)
8535           .addConstantPoolIndex(Idx)
8536           .addImm(0)
8537           .add(predOps(ARMCC::AL));
8538   }
8539   BB->addSuccessor(loopMBB);
8540 
8541   // Generate the loop body:
8542   //   varPhi = PHI(varLoop, varEnd)
8543   //   srcPhi = PHI(srcLoop, src)
8544   //   destPhi = PHI(destLoop, dst)
8545   MachineBasicBlock *entryBB = BB;
8546   BB = loopMBB;
8547   unsigned varLoop = MRI.createVirtualRegister(TRC);
8548   unsigned varPhi = MRI.createVirtualRegister(TRC);
8549   unsigned srcLoop = MRI.createVirtualRegister(TRC);
8550   unsigned srcPhi = MRI.createVirtualRegister(TRC);
8551   unsigned destLoop = MRI.createVirtualRegister(TRC);
8552   unsigned destPhi = MRI.createVirtualRegister(TRC);
8553 
8554   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
8555     .addReg(varLoop).addMBB(loopMBB)
8556     .addReg(varEnd).addMBB(entryBB);
8557   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
8558     .addReg(srcLoop).addMBB(loopMBB)
8559     .addReg(src).addMBB(entryBB);
8560   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
8561     .addReg(destLoop).addMBB(loopMBB)
8562     .addReg(dest).addMBB(entryBB);
8563 
8564   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
8565   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8566   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8567   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
8568              IsThumb1, IsThumb2);
8569   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
8570              IsThumb1, IsThumb2);
8571 
8572   // Decrement loop variable by UnitSize.
8573   if (IsThumb1) {
8574     BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
8575         .add(t1CondCodeOp())
8576         .addReg(varPhi)
8577         .addImm(UnitSize)
8578         .add(predOps(ARMCC::AL));
8579   } else {
8580     MachineInstrBuilder MIB =
8581         BuildMI(*BB, BB->end(), dl,
8582                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
8583     MIB.addReg(varPhi)
8584         .addImm(UnitSize)
8585         .add(predOps(ARMCC::AL))
8586         .add(condCodeOp());
8587     MIB->getOperand(5).setReg(ARM::CPSR);
8588     MIB->getOperand(5).setIsDef(true);
8589   }
8590   BuildMI(*BB, BB->end(), dl,
8591           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
8592       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
8593 
8594   // loopMBB can loop back to loopMBB or fall through to exitMBB.
8595   BB->addSuccessor(loopMBB);
8596   BB->addSuccessor(exitMBB);
8597 
8598   // Add epilogue to handle BytesLeft.
8599   BB = exitMBB;
8600   auto StartOfExit = exitMBB->begin();
8601 
8602   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8603   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8604   unsigned srcIn = srcLoop;
8605   unsigned destIn = destLoop;
8606   for (unsigned i = 0; i < BytesLeft; i++) {
8607     unsigned srcOut = MRI.createVirtualRegister(TRC);
8608     unsigned destOut = MRI.createVirtualRegister(TRC);
8609     unsigned scratch = MRI.createVirtualRegister(TRC);
8610     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
8611                IsThumb1, IsThumb2);
8612     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
8613                IsThumb1, IsThumb2);
8614     srcIn = srcOut;
8615     destIn = destOut;
8616   }
8617 
8618   MI.eraseFromParent(); // The instruction is gone now.
8619   return BB;
8620 }
8621 
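/// EmitLowered__chkstk - Expand the Windows stack probe: call __chkstk with
/// the allocation size in words in R4 and subtract the returned byte count
/// from SP.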
8622 MachineBasicBlock *
8623 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
8624                                        MachineBasicBlock *MBB) const {
8625   const TargetMachine &TM = getTargetMachine();
8626   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
8627   DebugLoc DL = MI.getDebugLoc();
8628 
8629   assert(Subtarget->isTargetWindows() &&
8630          "__chkstk is only supported on Windows");
8631   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
8632 
8633   // __chkstk takes the number of words to allocate on the stack in R4, and
8634   // returns the stack adjustment in number of bytes in R4.  This will not
8635   // clobber any other registers (other than the obvious lr).
8636   //
8637   // Although, technically, IP should be considered a register which may be
8638   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
8639   // thumb-2 environment, so there is no interworking required.  As a result, we
8640   // do not expect a veneer to be emitted by the linker, clobbering IP.
8641   //
8642   // Each module receives its own copy of __chkstk, so no import thunk is
8643   // required, again, ensuring that IP is not clobbered.
8644   //
8645   // Finally, although some linkers may theoretically provide a trampoline for
8646   // out of range calls (which is quite common due to a 32M range limitation of
8647   // branches for Thumb), we can generate the long-call version via
8648   // -mcmodel=large, alleviating the need for the trampoline which may clobber
8649   // IP.
8650 
8651   switch (TM.getCodeModel()) {
8652   case CodeModel::Small:
8653   case CodeModel::Medium:
8654   case CodeModel::Default:
8655   case CodeModel::Kernel:
8656     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
8657         .add(predOps(ARMCC::AL))
8658         .addExternalSymbol("__chkstk")
8659         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8660         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8661         .addReg(ARM::R12,
8662                 RegState::Implicit | RegState::Define | RegState::Dead);
8663     break;
8664   case CodeModel::Large:
8665   case CodeModel::JITDefault: {
8666     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
8667     unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
8668 
8669     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
8670       .addExternalSymbol("__chkstk");
8671     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
8672         .add(predOps(ARMCC::AL))
8673         .addReg(Reg, RegState::Kill)
8674         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8675         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8676         .addReg(ARM::R12,
8677                 RegState::Implicit | RegState::Define | RegState::Dead);
8678     break;
8679   }
8680   }
8681 
8682   BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
8683       .addReg(ARM::SP, RegState::Kill)
8684       .addReg(ARM::R4, RegState::Kill)
8685       .setMIFlags(MachineInstr::FrameSetup)
8686       .add(predOps(ARMCC::AL))
8687       .add(condCodeOp());
8688 
8689   MI.eraseFromParent();
8690   return MBB;
8691 }
8692 
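/// EmitLowered__dbzchk - Expand the WIN__DBZCHK pseudo: compare the checked
/// value against zero and branch to a block that executes the __brkdiv0 trap
/// when it is zero.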
8693 MachineBasicBlock *
8694 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
8695                                        MachineBasicBlock *MBB) const {
8696   DebugLoc DL = MI.getDebugLoc();
8697   MachineFunction *MF = MBB->getParent();
8698   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8699 
8700   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
8701   MF->insert(++MBB->getIterator(), ContBB);
8702   ContBB->splice(ContBB->begin(), MBB,
8703                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
8704   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
8705   MBB->addSuccessor(ContBB);
8706 
8707   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
8708   BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
8709   MF->push_back(TrapBB);
8710   MBB->addSuccessor(TrapBB);
8711 
8712   BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
8713       .addReg(MI.getOperand(0).getReg())
8714       .addImm(0)
8715       .add(predOps(ARMCC::AL));
8716   BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
8717       .addMBB(TrapBB)
8718       .addImm(ARMCC::EQ)
8719       .addReg(ARM::CPSR);
8720 
8721   MI.eraseFromParent();
8722   return ContBB;
8723 }
8724 
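/// Expand pseudo instructions that were marked as needing a custom inserter
/// after instruction selection.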
8725 MachineBasicBlock *
8726 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
8727                                                MachineBasicBlock *BB) const {
8728   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8729   DebugLoc dl = MI.getDebugLoc();
8730   bool isThumb2 = Subtarget->isThumb2();
8731   switch (MI.getOpcode()) {
8732   default: {
8733     MI.print(errs());
8734     llvm_unreachable("Unexpected instr type to insert");
8735   }
8736 
8737   // Thumb1 post-indexed loads are really just single-register LDMs.
8738   case ARM::tLDR_postidx: {
8739     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
8740         .add(MI.getOperand(1))  // Rn_wb
8741         .add(MI.getOperand(2))  // Rn
8742         .add(MI.getOperand(3))  // PredImm
8743         .add(MI.getOperand(4))  // PredReg
8744         .add(MI.getOperand(0)); // Rt
8745     MI.eraseFromParent();
8746     return BB;
8747   }
8748 
8749   // The Thumb2 pre-indexed stores have the same MI operands, they just
8750   // define them differently in the .td files from the isel patterns, so
8751   // they need pseudos.
8752   case ARM::t2STR_preidx:
8753     MI.setDesc(TII->get(ARM::t2STR_PRE));
8754     return BB;
8755   case ARM::t2STRB_preidx:
8756     MI.setDesc(TII->get(ARM::t2STRB_PRE));
8757     return BB;
8758   case ARM::t2STRH_preidx:
8759     MI.setDesc(TII->get(ARM::t2STRH_PRE));
8760     return BB;
8761 
8762   case ARM::STRi_preidx:
8763   case ARM::STRBi_preidx: {
8764     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
8765                                                          : ARM::STRB_PRE_IMM;
8766     // Decode the offset.
8767     unsigned Offset = MI.getOperand(4).getImm();
8768     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
8769     Offset = ARM_AM::getAM2Offset(Offset);
8770     if (isSub)
8771       Offset = -Offset;
8772 
8773     MachineMemOperand *MMO = *MI.memoperands_begin();
8774     BuildMI(*BB, MI, dl, TII->get(NewOpc))
8775         .add(MI.getOperand(0)) // Rn_wb
8776         .add(MI.getOperand(1)) // Rt
8777         .add(MI.getOperand(2)) // Rn
8778         .addImm(Offset)        // offset (skip GPR==zero_reg)
8779         .add(MI.getOperand(5)) // pred
8780         .add(MI.getOperand(6))
8781         .addMemOperand(MMO);
8782     MI.eraseFromParent();
8783     return BB;
8784   }
8785   case ARM::STRr_preidx:
8786   case ARM::STRBr_preidx:
8787   case ARM::STRH_preidx: {
8788     unsigned NewOpc;
8789     switch (MI.getOpcode()) {
8790     default: llvm_unreachable("unexpected opcode!");
8791     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
8792     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
8793     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
8794     }
8795     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
8796     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
8797       MIB.add(MI.getOperand(i));
8798     MI.eraseFromParent();
8799     return BB;
8800   }
8801 
8802   case ARM::tMOVCCr_pseudo: {
8803     // To "insert" a SELECT_CC instruction, we actually have to insert the
8804     // diamond control-flow pattern.  The incoming instruction knows the
8805     // destination vreg to set, the condition code register to branch on, the
8806     // true/false values to select between, and a branch opcode to use.
8807     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8808     MachineFunction::iterator It = ++BB->getIterator();
8809 
8810     //  thisMBB:
8811     //  ...
8812     //   TrueVal = ...
8813     //   cmpTY ccX, r1, r2
8814     //   bCC copy1MBB
8815     //   fallthrough --> copy0MBB
8816     MachineBasicBlock *thisMBB  = BB;
8817     MachineFunction *F = BB->getParent();
8818     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8819     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
8820     F->insert(It, copy0MBB);
8821     F->insert(It, sinkMBB);
8822 
8823     // Transfer the remainder of BB and its successor edges to sinkMBB.
8824     sinkMBB->splice(sinkMBB->begin(), BB,
8825                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
8826     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
8827 
8828     BB->addSuccessor(copy0MBB);
8829     BB->addSuccessor(sinkMBB);
8830 
8831     BuildMI(BB, dl, TII->get(ARM::tBcc))
8832         .addMBB(sinkMBB)
8833         .addImm(MI.getOperand(3).getImm())
8834         .addReg(MI.getOperand(4).getReg());
8835 
8836     //  copy0MBB:
8837     //   %FalseValue = ...
8838     //   # fallthrough to sinkMBB
8839     BB = copy0MBB;
8840 
8841     // Update machine-CFG edges
8842     BB->addSuccessor(sinkMBB);
8843 
8844     //  sinkMBB:
8845     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8846     //  ...
8847     BB = sinkMBB;
8848     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
8849         .addReg(MI.getOperand(1).getReg())
8850         .addMBB(copy0MBB)
8851         .addReg(MI.getOperand(2).getReg())
8852         .addMBB(thisMBB);
8853 
8854     MI.eraseFromParent(); // The pseudo instruction is gone now.
8855     return BB;
8856   }
8857 
8858   case ARM::BCCi64:
8859   case ARM::BCCZi64: {
8860     // If there is an unconditional branch to the other successor, remove it.
8861     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
8862 
8863     // Compare both parts that make up the double comparison separately for
8864     // equality.
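    // That is, check (LHS.lo == RHS.lo) && (LHS.hi == RHS.hi): the first CMP
    // below executes unconditionally, and the second is predicated on EQ, so
    // CPSR only ends up EQ if both halves compared equal.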
8865     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
8866 
8867     unsigned LHS1 = MI.getOperand(1).getReg();
8868     unsigned LHS2 = MI.getOperand(2).getReg();
8869     if (RHSisZero) {
8870       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8871           .addReg(LHS1)
8872           .addImm(0)
8873           .add(predOps(ARMCC::AL));
8874       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8875         .addReg(LHS2).addImm(0)
8876         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8877     } else {
8878       unsigned RHS1 = MI.getOperand(3).getReg();
8879       unsigned RHS2 = MI.getOperand(4).getReg();
8880       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8881           .addReg(LHS1)
8882           .addReg(RHS1)
8883           .add(predOps(ARMCC::AL));
8884       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8885         .addReg(LHS2).addReg(RHS2)
8886         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8887     }
8888 
8889     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
8890     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
8891     if (MI.getOperand(0).getImm() == ARMCC::NE)
8892       std::swap(destMBB, exitMBB);
8893 
8894     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
8895       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
8896     if (isThumb2)
8897       BuildMI(BB, dl, TII->get(ARM::t2B))
8898           .addMBB(exitMBB)
8899           .add(predOps(ARMCC::AL));
8900     else
8901       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
8902 
8903     MI.eraseFromParent(); // The pseudo instruction is gone now.
8904     return BB;
8905   }
8906 
8907   case ARM::Int_eh_sjlj_setjmp:
8908   case ARM::Int_eh_sjlj_setjmp_nofp:
8909   case ARM::tInt_eh_sjlj_setjmp:
8910   case ARM::t2Int_eh_sjlj_setjmp:
8911   case ARM::t2Int_eh_sjlj_setjmp_nofp:
8912     return BB;
8913 
8914   case ARM::Int_eh_sjlj_setup_dispatch:
8915     EmitSjLjDispatchBlock(MI, BB);
8916     return BB;
8917 
8918   case ARM::ABS:
8919   case ARM::t2ABS: {
8920     // To insert an ABS instruction, we have to insert the
8921     // diamond control-flow pattern.  The incoming instruction knows the
8922     // source vreg to test against 0, the destination vreg to set,
8923     // the condition code register to branch on, the
8924     // true/false values to select between, and a branch opcode to use.
8925     // It transforms
8926     //     V1 = ABS V0
8927     // into
8928     //     V2 = MOVS V0
8929     //     BCC                      (branch to SinkBB if V0 >= 0)
8930     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
8931     //     SinkBB: V1 = PHI(V2, V3)
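    // After if-conversion the compare, branch and RSB typically collapse into
    // a compare followed by a predicated "rsbmi", i.e. a branchless abs.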
8932     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8933     MachineFunction::iterator BBI = ++BB->getIterator();
8934     MachineFunction *Fn = BB->getParent();
8935     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
8936     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
8937     Fn->insert(BBI, RSBBB);
8938     Fn->insert(BBI, SinkBB);
8939 
8940     unsigned int ABSSrcReg = MI.getOperand(1).getReg();
8941     unsigned int ABSDstReg = MI.getOperand(0).getReg();
8942     bool ABSSrcKill = MI.getOperand(1).isKill();
8943     bool isThumb2 = Subtarget->isThumb2();
8944     MachineRegisterInfo &MRI = Fn->getRegInfo();
8945     // In Thumb mode, S must not be specified if the source register is SP or
8946     // PC, or if the destination register is SP, so restrict the register class.
8947     unsigned NewRsbDstReg =
8948       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
8949 
8950     // Transfer the remainder of BB and its successor edges to SinkBB.
8951     SinkBB->splice(SinkBB->begin(), BB,
8952                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
8953     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
8954 
8955     BB->addSuccessor(RSBBB);
8956     BB->addSuccessor(SinkBB);
8957 
8958     // RSBBB falls through to SinkBB.
8959     RSBBB->addSuccessor(SinkBB);
8960 
8961     // insert a cmp at the end of BB
8962     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8963         .addReg(ABSSrcReg)
8964         .addImm(0)
8965         .add(predOps(ARMCC::AL));
8966 
8967     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
8968     BuildMI(BB, dl,
8969       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
8970       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
8971 
8972     // insert rsbri in RSBBB
8973     // Note: BCC and rsbri will be converted into predicated rsbmi
8974     // by if-conversion pass
8975     BuildMI(*RSBBB, RSBBB->begin(), dl,
8976             TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
8977         .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
8978         .addImm(0)
8979         .add(predOps(ARMCC::AL))
8980         .add(condCodeOp());
8981 
8982     // insert PHI in SinkBB,
8983     // reuse ABSDstReg to not change uses of ABS instruction
8984     BuildMI(*SinkBB, SinkBB->begin(), dl,
8985       TII->get(ARM::PHI), ABSDstReg)
8986       .addReg(NewRsbDstReg).addMBB(RSBBB)
8987       .addReg(ABSSrcReg).addMBB(BB);
8988 
8989     // remove ABS instruction
8990     MI.eraseFromParent();
8991 
8992     // return last added BB
8993     return SinkBB;
8994   }
8995   case ARM::COPY_STRUCT_BYVAL_I32:
8996     ++NumLoopByVals;
8997     return EmitStructByval(MI, BB);
8998   case ARM::WIN__CHKSTK:
8999     return EmitLowered__chkstk(MI, BB);
9000   case ARM::WIN__DBZCHK:
9001     return EmitLowered__dbzchk(MI, BB);
9002   }
9003 }
9004 
9005 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers
9006 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
9007 /// instead of as a custom inserter because we need the use list from the SDNode.
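/// For example, a MEMCPY whose scratch-count operand is 4 gets four fresh
/// virtual registers (tGPR on Thumb1, GPR otherwise) appended as dead defs,
/// which the LDM/STM expansion then uses as the copy buffer.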
9008 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
9009                                     MachineInstr &MI, const SDNode *Node) {
9010   bool isThumb1 = Subtarget->isThumb1Only();
9011 
9012   DebugLoc DL = MI.getDebugLoc();
9013   MachineFunction *MF = MI.getParent()->getParent();
9014   MachineRegisterInfo &MRI = MF->getRegInfo();
9015   MachineInstrBuilder MIB(*MF, MI);
9016 
9017   // If the new dst/src is unused mark it as dead.
9018   if (!Node->hasAnyUseOfValue(0)) {
9019     MI.getOperand(0).setIsDead(true);
9020   }
9021   if (!Node->hasAnyUseOfValue(1)) {
9022     MI.getOperand(1).setIsDead(true);
9023   }
9024 
9025   // The MEMCPY both defines and kills the scratch registers.
9026   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
9027     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
9028                                                          : &ARM::GPRRegClass);
9029     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
9030   }
9031 }
9032 
9033 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9034                                                       SDNode *Node) const {
9035   if (MI.getOpcode() == ARM::MEMCPY) {
9036     attachMEMCPYScratchRegs(Subtarget, MI, Node);
9037     return;
9038   }
9039 
9040   const MCInstrDesc *MCID = &MI.getDesc();
9041   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
9042   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
9043   // operand is still set to noreg. If needed, set the optional operand's
9044   // register to CPSR, and remove the redundant implicit def.
9045   //
9046   // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
9047 
9048   // Rename pseudo opcodes.
9049   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
9050   if (NewOpc) {
9051     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
9052     MCID = &TII->get(NewOpc);
9053 
9054     assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
9055            "converted opcode should be the same except for cc_out");
9056 
9057     MI.setDesc(*MCID);
9058 
9059     // Add the optional cc_out operand
9060     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
9061   }
9062   unsigned ccOutIdx = MCID->getNumOperands() - 1;
9063 
9064   // Any ARM instruction that sets the 's' bit should specify an optional
9065   // "cc_out" operand in the last operand position.
9066   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
9067     assert(!NewOpc && "Optional cc_out operand required");
9068     return;
9069   }
9070   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
9071   // since we already have an optional CPSR def.
9072   bool definesCPSR = false;
9073   bool deadCPSR = false;
9074   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
9075        ++i) {
9076     const MachineOperand &MO = MI.getOperand(i);
9077     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
9078       definesCPSR = true;
9079       if (MO.isDead())
9080         deadCPSR = true;
9081       MI.RemoveOperand(i);
9082       break;
9083     }
9084   }
9085   if (!definesCPSR) {
9086     assert(!NewOpc && "Optional cc_out operand required");
9087     return;
9088   }
9089   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
9090   if (deadCPSR) {
9091     assert(!MI.getOperand(ccOutIdx).getReg() &&
9092            "expect uninitialized optional cc_out operand");
9093     return;
9094   }
9095 
9096   // If this instruction was defined with an optional CPSR def and its dag node
9097   // had a live implicit CPSR def, then activate the optional CPSR def.
9098   MachineOperand &MO = MI.getOperand(ccOutIdx);
9099   MO.setReg(ARM::CPSR);
9100   MO.setIsDef(true);
9101 }
9102 
9103 //===----------------------------------------------------------------------===//
9104 //                           ARM Optimization Hooks
9105 //===----------------------------------------------------------------------===//
9106 
9107 // Helper function that checks if N is a null or all ones constant.
9108 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
9109   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
9110 }
9111 
9112 // Return true if N is conditionally 0 or all ones.
9113 // Detects these expressions where cc is an i1 value:
9114 //
9115 //   (select cc 0, y)   [AllOnes=0]
9116 //   (select cc y, 0)   [AllOnes=0]
9117 //   (zext cc)          [AllOnes=0]
9118 //   (sext cc)          [AllOnes=0/1]
9119 //   (select cc -1, y)  [AllOnes=1]
9120 //   (select cc y, -1)  [AllOnes=1]
9121 //
9122 // Invert is set when N is the null/all ones constant when CC is false.
9123 // OtherOp is set to the alternative value of N.
9124 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
9125                                        SDValue &CC, bool &Invert,
9126                                        SDValue &OtherOp,
9127                                        SelectionDAG &DAG) {
9128   switch (N->getOpcode()) {
9129   default: return false;
9130   case ISD::SELECT: {
9131     CC = N->getOperand(0);
9132     SDValue N1 = N->getOperand(1);
9133     SDValue N2 = N->getOperand(2);
9134     if (isZeroOrAllOnes(N1, AllOnes)) {
9135       Invert = false;
9136       OtherOp = N2;
9137       return true;
9138     }
9139     if (isZeroOrAllOnes(N2, AllOnes)) {
9140       Invert = true;
9141       OtherOp = N1;
9142       return true;
9143     }
9144     return false;
9145   }
9146   case ISD::ZERO_EXTEND:
9147     // (zext cc) can never be the all ones value.
9148     if (AllOnes)
9149       return false;
9150     LLVM_FALLTHROUGH;
9151   case ISD::SIGN_EXTEND: {
9152     SDLoc dl(N);
9153     EVT VT = N->getValueType(0);
9154     CC = N->getOperand(0);
9155     if (CC.getValueType() != MVT::i1)
9156       return false;
9157     Invert = !AllOnes;
9158     if (AllOnes)
9159       // When looking for an AllOnes constant, N is an sext, and the 'other'
9160       // value is 0.
9161       OtherOp = DAG.getConstant(0, dl, VT);
9162     else if (N->getOpcode() == ISD::ZERO_EXTEND)
9163       // When looking for a 0 constant, N can be zext or sext.
9164       OtherOp = DAG.getConstant(1, dl, VT);
9165     else
9166       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
9167                                 VT);
9168     return true;
9169   }
9170   }
9171 }
9172 
9173 // Combine a constant select operand into its use:
9174 //
9175 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
9176 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
9177 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
9178 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
9179 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
9180 //
9181 // The transform is rejected if the select doesn't have a constant operand that
9182 // is null, or all ones when AllOnes is set.
9183 //
9184 // Also recognize sext/zext from i1:
9185 //
9186 //   (add (zext cc), x) -> (select cc (add x, 1), x)
9187 //   (add (sext cc), x) -> (select cc (add x, -1), x)
9188 //
9189 // These transformations eventually create predicated instructions.
9190 //
9191 // @param N       The node to transform.
9192 // @param Slct    The N operand that is a select.
9193 // @param OtherOp The other N operand (x above).
9194 // @param DCI     Context.
9195 // @param AllOnes Require the select constant to be all ones instead of null.
9196 // @returns The new node, or SDValue() on failure.
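//
// For example, with AllOnes=false:
//
//   (add (select cc, 0, 3), x) -> (select cc, x, (add x, 3))
//
// and the resulting select can later be turned into a predicated add.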
9197 static
9198 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
9199                             TargetLowering::DAGCombinerInfo &DCI,
9200                             bool AllOnes = false) {
9201   SelectionDAG &DAG = DCI.DAG;
9202   EVT VT = N->getValueType(0);
9203   SDValue NonConstantVal;
9204   SDValue CCOp;
9205   bool SwapSelectOps;
9206   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
9207                                   NonConstantVal, DAG))
9208     return SDValue();
9209 
9210   // Slct is now known to be the desired identity constant when CC is true.
9211   SDValue TrueVal = OtherOp;
9212   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
9213                                  OtherOp, NonConstantVal);
9214   // Unless SwapSelectOps says CC should be false.
9215   if (SwapSelectOps)
9216     std::swap(TrueVal, FalseVal);
9217 
9218   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
9219                      CCOp, TrueVal, FalseVal);
9220 }
9221 
9222 // Attempt combineSelectAndUse on each operand of a commutative operator N.
9223 static
9224 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
9225                                        TargetLowering::DAGCombinerInfo &DCI) {
9226   SDValue N0 = N->getOperand(0);
9227   SDValue N1 = N->getOperand(1);
9228   if (N0.getNode()->hasOneUse())
9229     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
9230       return Result;
9231   if (N1.getNode()->hasOneUse())
9232     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
9233       return Result;
9234   return SDValue();
9235 }
9236 
9237 static bool IsVUZPShuffleNode(SDNode *N) {
9238   // VUZP shuffle node.
9239   if (N->getOpcode() == ARMISD::VUZP)
9240     return true;
9241 
9242   // "VUZP" on i32 is an alias for VTRN.
9243   if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
9244     return true;
9245 
9246   return false;
9247 }
9248 
9249 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
9250                                  TargetLowering::DAGCombinerInfo &DCI,
9251                                  const ARMSubtarget *Subtarget) {
9252   // Look for ADD(VUZP.0, VUZP.1).
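  // VPADD sums adjacent lane pairs of its concatenated inputs, so adding the
  // even-lane half of an unzip to the odd-lane half elementwise is the same
  // as pairwise-adding the original pair of vectors.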
9253   if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
9254       N0 == N1)
9255     return SDValue();
9256 
9257   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
9258   if (!N->getValueType(0).is64BitVector())
9259     return SDValue();
9260 
9261   // Generate vpadd.
9262   SelectionDAG &DAG = DCI.DAG;
9263   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9264   SDLoc dl(N);
9265   SDNode *Unzip = N0.getNode();
9266   EVT VT = N->getValueType(0);
9267 
9268   SmallVector<SDValue, 8> Ops;
9269   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
9270                                 TLI.getPointerTy(DAG.getDataLayout())));
9271   Ops.push_back(Unzip->getOperand(0));
9272   Ops.push_back(Unzip->getOperand(1));
9273 
9274   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
9275 }
9276 
9277 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9278                                       TargetLowering::DAGCombinerInfo &DCI,
9279                                       const ARMSubtarget *Subtarget) {
9280   // Check for two extended operands.
9281   if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
9282         N1.getOpcode() == ISD::SIGN_EXTEND) &&
9283       !(N0.getOpcode() == ISD::ZERO_EXTEND &&
9284         N1.getOpcode() == ISD::ZERO_EXTEND))
9285     return SDValue();
9286 
9287   SDValue N00 = N0.getOperand(0);
9288   SDValue N10 = N1.getOperand(0);
9289 
9290   // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
9291   if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
9292       N00 == N10)
9293     return SDValue();
9294 
9295   // We only recognize Q register paddl here; this can't be reached until
9296   // after type legalization.
9297   if (!N00.getValueType().is64BitVector() ||
9298       !N0.getValueType().is128BitVector())
9299     return SDValue();
9300 
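  // What remains is add(ext(VUZP(x, y).0), ext(VUZP(x, y).1)) widened into a
  // Q register, which is exactly what VPADDL computes; e.g. vpaddl.s8 adds
  // adjacent signed byte pairs into halfword lanes.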
9301   // Generate vpaddl.
9302   SelectionDAG &DAG = DCI.DAG;
9303   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9304   SDLoc dl(N);
9305   EVT VT = N->getValueType(0);
9306 
9307   SmallVector<SDValue, 8> Ops;
9308   // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
9309   unsigned Opcode;
9310   if (N0.getOpcode() == ISD::SIGN_EXTEND)
9311     Opcode = Intrinsic::arm_neon_vpaddls;
9312   else
9313     Opcode = Intrinsic::arm_neon_vpaddlu;
9314   Ops.push_back(DAG.getConstant(Opcode, dl,
9315                                 TLI.getPointerTy(DAG.getDataLayout())));
9316   EVT ElemTy = N00.getValueType().getVectorElementType();
9317   unsigned NumElts = VT.getVectorNumElements();
9318   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
9319   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
9320                                N00.getOperand(0), N00.getOperand(1));
9321   Ops.push_back(Concat);
9322 
9323   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
9324 }
9325 
9326 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
9327 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
9328 // much easier to match.
9329 static SDValue
9330 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9331                                TargetLowering::DAGCombinerInfo &DCI,
9332                                const ARMSubtarget *Subtarget) {
9333   // Only perform this optimization after legalization and only if NEON is
9334   // available. We also expect both operands to be BUILD_VECTORs.
9335   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
9336       || N0.getOpcode() != ISD::BUILD_VECTOR
9337       || N1.getOpcode() != ISD::BUILD_VECTOR)
9338     return SDValue();
9339 
9340   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
9341   EVT VT = N->getValueType(0);
9342   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
9343     return SDValue();
9344 
9345   // Check that the vector operands are of the right form.
9346   // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT
9347   // operands, where N is the size of the formed vector.
9348   // Each EXTRACT_VECTOR_ELT should have the same input vector and an odd or
9349   // even index such that we have a pairwise add pattern.
9350 
9351   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9352   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9353     return SDValue();
9354   SDValue Vec = N0->getOperand(0)->getOperand(0);
9355   SDNode *V = Vec.getNode();
9356   unsigned nextIndex = 0;
9357 
9358   // For each operand of the ADD that is a BUILD_VECTOR,
9359   // check to see if each of its operands is an EXTRACT_VECTOR_ELT with
9360   // the same vector and an appropriate index.
9361   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9362     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9363         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9364 
9365       SDValue ExtVec0 = N0->getOperand(i);
9366       SDValue ExtVec1 = N1->getOperand(i);
9367 
9368       // First operand is the vector; verify it's the same.
9369       if (V != ExtVec0->getOperand(0).getNode() ||
9370           V != ExtVec1->getOperand(0).getNode())
9371         return SDValue();
9372 
9373       // Second is the constant; verify it's correct.
9374       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9375       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9376 
9377       // For the constants, N0 should use the even indices and N1 the odd ones.
9378       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9379           || C1->getZExtValue() != nextIndex+1)
9380         return SDValue();
9381 
9382       // Increment index.
9383       nextIndex+=2;
9384     } else
9385       return SDValue();
9386   }
9387 
9388   // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
9389   if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
9390     return SDValue();
9391 
9392   // Create VPADDL node.
9393   SelectionDAG &DAG = DCI.DAG;
9394   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9395 
9396   SDLoc dl(N);
9397 
9398   // Build operand list.
9399   SmallVector<SDValue, 8> Ops;
9400   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
9401                                 TLI.getPointerTy(DAG.getDataLayout())));
9402 
9403   // Input is the vector.
9404   Ops.push_back(Vec);
9405 
9406   // Get widened type and narrowed type.
9407   MVT widenType;
9408   unsigned numElem = VT.getVectorNumElements();
9409 
9410   EVT inputLaneType = Vec.getValueType().getVectorElementType();
9411   switch (inputLaneType.getSimpleVT().SimpleTy) {
9412     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
9413     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
9414     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
9415     default:
9416       llvm_unreachable("Invalid vector element type for padd optimization.");
9417   }
9418 
9419   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
9420   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
9421   return DAG.getNode(ExtOp, dl, VT, tmp);
9422 }
9423 
9424 static SDValue findMUL_LOHI(SDValue V) {
9425   if (V->getOpcode() == ISD::UMUL_LOHI ||
9426       V->getOpcode() == ISD::SMUL_LOHI)
9427     return V;
9428   return SDValue();
9429 }
9430 
9431 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
9432                                      TargetLowering::DAGCombinerInfo &DCI,
9433                                      const ARMSubtarget *Subtarget) {
9434   // Look for multiply add opportunities.
9435   // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
9436   // each add node consumes a value from ISD::UMUL_LOHI and there is
9437   // a glue link from the first add to the second add.
9438   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
9439   // a S/UMLAL instruction.
9440   //                  loAdd   UMUL_LOHI
9441   //                   \    / :lo    \ :hi
9442   //                    \  /          \          [no multiline comment]
9443   //                     ADDC         |  hiAdd
9444   //                        \ :glue  /  /
9445   //                         \      /  /
9446   //                          ADDE
9447   //
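  // For example, 64-bit code such as "acc += (uint64_t)a * b" legalizes into
  // this shape, and the whole cluster becomes a single UMLAL (or SMLAL) that
  // multiplies and accumulates into an RdLo:RdHi register pair.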
9448   assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
9449   SDValue AddcOp0 = AddcNode->getOperand(0);
9450   SDValue AddcOp1 = AddcNode->getOperand(1);
9451 
9452   // Check if the two operands are from the same mul_lohi node.
9453   if (AddcOp0.getNode() == AddcOp1.getNode())
9454     return SDValue();
9455 
9456   assert(AddcNode->getNumValues() == 2 &&
9457          AddcNode->getValueType(0) == MVT::i32 &&
9458          "Expect ADDC with two result values. First: i32");
9459 
9460   // Check that we have a glued ADDC node.
9461   if (AddcNode->getValueType(1) != MVT::Glue)
9462     return SDValue();
9463 
9464   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
9465   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
9466       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
9467       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
9468       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
9469     return SDValue();
9470 
9471   // Look for the glued ADDE.
9472   SDNode* AddeNode = AddcNode->getGluedUser();
9473   if (!AddeNode)
9474     return SDValue();
9475 
9476   // Make sure it is really an ADDE.
9477   if (AddeNode->getOpcode() != ISD::ADDE)
9478     return SDValue();
9479 
9480   assert(AddeNode->getNumOperands() == 3 &&
9481          AddeNode->getOperand(2).getValueType() == MVT::Glue &&
9482          "ADDE node has the wrong inputs");
9483 
9484   // Check for the triangle shape.
9485   SDValue AddeOp0 = AddeNode->getOperand(0);
9486   SDValue AddeOp1 = AddeNode->getOperand(1);
9487 
9488   // Make sure that the ADDE operands are not coming from the same node.
9489   if (AddeOp0.getNode() == AddeOp1.getNode())
9490     return SDValue();
9491 
9492   // Find the MUL_LOHI node walking up ADDE's operands.
9493   bool IsLeftOperandMUL = false;
9494   SDValue MULOp = findMUL_LOHI(AddeOp0);
9495   if (MULOp == SDValue())
9496     MULOp = findMUL_LOHI(AddeOp1);
9497   else
9498     IsLeftOperandMUL = true;
9499   if (MULOp == SDValue())
9500     return SDValue();
9501 
9502   // Figure out the right opcode.
9503   unsigned Opc = MULOp->getOpcode();
9504   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
9505 
9506   // Figure out the high and low input values to the MLAL node.
9507   SDValue* HiAdd = nullptr;
9508   SDValue* LoMul = nullptr;
9509   SDValue* LowAdd = nullptr;
9510 
9511   // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
9512   if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
9513     return SDValue();
9514 
9515   if (IsLeftOperandMUL)
9516     HiAdd = &AddeOp1;
9517   else
9518     HiAdd = &AddeOp0;
9519 
9520 
9521   // Ensure that LoMul and LowAdd are taken from the correct ISD::SMUL_LOHI
9522   // node, i.e. the one whose low result is fed to the ADDC we are checking.
9523 
9524   if (AddcOp0 == MULOp.getValue(0)) {
9525     LoMul = &AddcOp0;
9526     LowAdd = &AddcOp1;
9527   }
9528   if (AddcOp1 == MULOp.getValue(0)) {
9529     LoMul = &AddcOp1;
9530     LowAdd = &AddcOp0;
9531   }
9532 
9533   if (!LoMul)
9534     return SDValue();
9535 
9536   // Create the merged node.
9537   SelectionDAG &DAG = DCI.DAG;
9538 
9539   // Build operand list.
9540   SmallVector<SDValue, 8> Ops;
9541   Ops.push_back(LoMul->getOperand(0));
9542   Ops.push_back(LoMul->getOperand(1));
9543   Ops.push_back(*LowAdd);
9544   Ops.push_back(*HiAdd);
9545 
9546   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
9547                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9548 
9549   // Replace the ADD nodes' uses with the MLAL node's values.
9550   SDValue HiMLALResult(MLALNode.getNode(), 1);
9551   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
9552 
9553   SDValue LoMLALResult(MLALNode.getNode(), 0);
9554   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
9555 
9556   // Return original node to notify the driver to stop replacing.
9557   SDValue resNode(AddcNode, 0);
9558   return resNode;
9559 }
9560 
9561 static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
9562                                       TargetLowering::DAGCombinerInfo &DCI,
9563                                       const ARMSubtarget *Subtarget) {
9564   // UMAAL is similar to UMLAL except that it adds two unsigned values.
9565   // While trying to combine for the other MLAL nodes, first search for the
9566   // chance to use UMAAL. Check if Addc uses another addc node which can first
9567   // be combined into a UMLAL. The other pattern, where AddcNode is combined
9568   // into a UMLAL and is then used by another addc, is handled in ISelDAGToDAG.
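  // UMAAL computes RdLo:RdHi = Rn * Rm + RdLo + RdHi, i.e. a 32x32->64
  // multiply plus two independent 32-bit addends, so a UMLAL whose high-part
  // addend is zero plus one further 32-bit add can be folded into it.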
9569 
9570   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
9571       (Subtarget->isThumb() && !Subtarget->hasThumb2()))
9572     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9573 
9574   SDNode *PrevAddc = nullptr;
9575   if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
9576     PrevAddc = AddcNode->getOperand(0).getNode();
9577   else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
9578     PrevAddc = AddcNode->getOperand(1).getNode();
9579 
9580   // If there is no addc chain, just fall back to searching for any MLAL.
9581   if (PrevAddc == nullptr)
9582     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9583 
9584   // Try to convert the addc operand to an MLAL and if that fails try to
9585   // combine AddcNode.
9586   SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
9587   if (MLAL != SDValue(PrevAddc, 0))
9588     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9589 
9590   // Find the converted UMAAL or quit if it doesn't exist.
9591   SDNode *UmlalNode = nullptr;
9592   SDValue AddHi;
9593   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
9594     UmlalNode = AddcNode->getOperand(0).getNode();
9595     AddHi = AddcNode->getOperand(1);
9596   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
9597     UmlalNode = AddcNode->getOperand(1).getNode();
9598     AddHi = AddcNode->getOperand(0);
9599   } else {
9600     return SDValue();
9601   }
9602 
9603   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
9604   // the ADDC as well as Zero.
9605   auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
9606 
9607   if (!Zero || Zero->getZExtValue() != 0)
9608     return SDValue();
9609 
9610   // Check that we have a glued ADDC node.
9611   if (AddcNode->getValueType(1) != MVT::Glue)
9612     return SDValue();
9613 
9614   // Look for the glued ADDE.
9615   SDNode* AddeNode = AddcNode->getGluedUser();
9616   if (!AddeNode)
9617     return SDValue();
9618 
9619   if ((AddeNode->getOperand(0).getNode() == Zero &&
9620        AddeNode->getOperand(1).getNode() == UmlalNode) ||
9621       (AddeNode->getOperand(0).getNode() == UmlalNode &&
9622        AddeNode->getOperand(1).getNode() == Zero)) {
9623 
9624     SelectionDAG &DAG = DCI.DAG;
9625     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
9626                       UmlalNode->getOperand(2), AddHi };
9627     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
9628                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9629 
9630     // Replace the ADD nodes' uses with the UMAAL node's values.
9631     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
9632     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
9633 
9634     // Return original node to notify the driver to stop replacing.
9635     return SDValue(AddcNode, 0);
9636   }
9637   return SDValue();
9638 }
9639 
9640 /// PerformADDCCombine - Target-specific dag combine transform from
9641 /// ISD::ADDC, ISD::ADDE, and ISD::UMUL_LOHI/ISD::SMUL_LOHI to MLAL, or from
9642 /// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL.
9643 static SDValue PerformADDCCombine(SDNode *N,
9644                                  TargetLowering::DAGCombinerInfo &DCI,
9645                                  const ARMSubtarget *Subtarget) {
9646   if (Subtarget->isThumb1Only()) return SDValue();
9647 
9648   // Only perform the checks after legalize when the pattern is available.
9649   if (DCI.isBeforeLegalize()) return SDValue();
9650 
9651   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
9652 }
9653 
9654 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
9655 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
9656 /// called with the default operands, and if that fails, with commuted
9657 /// operands.
9658 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
9659                                           TargetLowering::DAGCombinerInfo &DCI,
9660                                           const ARMSubtarget *Subtarget){
9661   // Attempt to create vpadd for this add.
9662   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
9663     return Result;
9664 
9665   // Attempt to create vpaddl for this add.
9666   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
9667     return Result;
9668   if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
9669                                                       Subtarget))
9670     return Result;
9671 
9672   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
9673   if (N0.getNode()->hasOneUse())
9674     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
9675       return Result;
9676   return SDValue();
9677 }
9678 
9679 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
9680 ///
9681 static SDValue PerformADDCombine(SDNode *N,
9682                                  TargetLowering::DAGCombinerInfo &DCI,
9683                                  const ARMSubtarget *Subtarget) {
9684   SDValue N0 = N->getOperand(0);
9685   SDValue N1 = N->getOperand(1);
9686 
9687   // First try with the default operand order.
9688   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
9689     return Result;
9690 
9691   // If that didn't work, try again with the operands commuted.
9692   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
9693 }
9694 
9695 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
9696 ///
9697 static SDValue PerformSUBCombine(SDNode *N,
9698                                  TargetLowering::DAGCombinerInfo &DCI) {
9699   SDValue N0 = N->getOperand(0);
9700   SDValue N1 = N->getOperand(1);
9701 
9702   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
9703   if (N1.getNode()->hasOneUse())
9704     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
9705       return Result;
9706 
9707   return SDValue();
9708 }
9709 
9710 /// PerformVMULCombine
9711 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
9712 /// special multiplier accumulator forwarding.
9713 ///   vmul d3, d0, d2
9714 ///   vmla d3, d1, d2
9715 /// is faster than
9716 ///   vadd d3, d0, d1
9717 ///   vmul d3, d3, d2
9718 //  However, for (A + B) * (A + B),
9719 //    vadd d2, d0, d1
9720 //    vmul d3, d0, d2
9721 //    vmla d3, d1, d2
9722 //  is slower than
9723 //    vadd d2, d0, d1
9724 //    vmul d3, d2, d2
9725 static SDValue PerformVMULCombine(SDNode *N,
9726                                   TargetLowering::DAGCombinerInfo &DCI,
9727                                   const ARMSubtarget *Subtarget) {
9728   if (!Subtarget->hasVMLxForwarding())
9729     return SDValue();
9730 
9731   SelectionDAG &DAG = DCI.DAG;
9732   SDValue N0 = N->getOperand(0);
9733   SDValue N1 = N->getOperand(1);
9734   unsigned Opcode = N0.getOpcode();
9735   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9736       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
9737     Opcode = N1.getOpcode();
9738     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9739         Opcode != ISD::FADD && Opcode != ISD::FSUB)
9740       return SDValue();
9741     std::swap(N0, N1);
9742   }
9743 
9744   if (N0 == N1)
9745     return SDValue();
9746 
9747   EVT VT = N->getValueType(0);
9748   SDLoc DL(N);
9749   SDValue N00 = N0->getOperand(0);
9750   SDValue N01 = N0->getOperand(1);
9751   return DAG.getNode(Opcode, DL, VT,
9752                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
9753                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
9754 }
9755 
9756 static SDValue PerformMULCombine(SDNode *N,
9757                                  TargetLowering::DAGCombinerInfo &DCI,
9758                                  const ARMSubtarget *Subtarget) {
9759   SelectionDAG &DAG = DCI.DAG;
9760 
9761   if (Subtarget->isThumb1Only())
9762     return SDValue();
9763 
9764   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9765     return SDValue();
9766 
9767   EVT VT = N->getValueType(0);
9768   if (VT.is64BitVector() || VT.is128BitVector())
9769     return PerformVMULCombine(N, DCI, Subtarget);
9770   if (VT != MVT::i32)
9771     return SDValue();
9772 
9773   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9774   if (!C)
9775     return SDValue();
9776 
9777   int64_t MulAmt = C->getSExtValue();
9778   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
9779 
9780   ShiftAmt = ShiftAmt & (32 - 1);
9781   SDValue V = N->getOperand(0);
9782   SDLoc DL(N);
9783 
9784   SDValue Res;
9785   MulAmt >>= ShiftAmt;
9786 
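  // MulAmt is now the odd part of the original constant; the trailing zeros
  // stripped off above are reapplied as a final left shift below.
  // E.g. x * 10 = (x * 5) << 1 = ((x << 2) + x) << 1.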
9787   if (MulAmt >= 0) {
9788     if (isPowerOf2_32(MulAmt - 1)) {
9789       // (mul x, 2^N + 1) => (add (shl x, N), x)
9790       Res = DAG.getNode(ISD::ADD, DL, VT,
9791                         V,
9792                         DAG.getNode(ISD::SHL, DL, VT,
9793                                     V,
9794                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
9795                                                     MVT::i32)));
9796     } else if (isPowerOf2_32(MulAmt + 1)) {
9797       // (mul x, 2^N - 1) => (sub (shl x, N), x)
9798       Res = DAG.getNode(ISD::SUB, DL, VT,
9799                         DAG.getNode(ISD::SHL, DL, VT,
9800                                     V,
9801                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
9802                                                     MVT::i32)),
9803                         V);
9804     } else
9805       return SDValue();
9806   } else {
9807     uint64_t MulAmtAbs = -MulAmt;
9808     if (isPowerOf2_32(MulAmtAbs + 1)) {
9809       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
9810       Res = DAG.getNode(ISD::SUB, DL, VT,
9811                         V,
9812                         DAG.getNode(ISD::SHL, DL, VT,
9813                                     V,
9814                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
9815                                                     MVT::i32)));
9816     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
9817       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
9818       Res = DAG.getNode(ISD::ADD, DL, VT,
9819                         V,
9820                         DAG.getNode(ISD::SHL, DL, VT,
9821                                     V,
9822                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
9823                                                     MVT::i32)));
9824       Res = DAG.getNode(ISD::SUB, DL, VT,
9825                         DAG.getConstant(0, DL, MVT::i32), Res);
9826 
9827     } else
9828       return SDValue();
9829   }
9830 
9831   if (ShiftAmt != 0)
9832     Res = DAG.getNode(ISD::SHL, DL, VT,
9833                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
9834 
9835   // Do not add new nodes to DAG combiner worklist.
9836   DCI.CombineTo(N, Res, false);
9837   return SDValue();
9838 }
9839 
9840 static SDValue PerformANDCombine(SDNode *N,
9841                                  TargetLowering::DAGCombinerInfo &DCI,
9842                                  const ARMSubtarget *Subtarget) {
9843   // Attempt to use immediate-form VBIC
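  // That is, (and x, C) where ~C is representable as a NEON modified immediate
  // becomes a VBIC (vector bit clear) of that immediate, clearing exactly the
  // bits that C would mask off.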
9844   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9845   SDLoc dl(N);
9846   EVT VT = N->getValueType(0);
9847   SelectionDAG &DAG = DCI.DAG;
9848 
9849   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9850     return SDValue();
9851 
9852   APInt SplatBits, SplatUndef;
9853   unsigned SplatBitSize;
9854   bool HasAnyUndefs;
9855   if (BVN &&
9856       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9857     if (SplatBitSize <= 64) {
9858       EVT VbicVT;
9859       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
9860                                       SplatUndef.getZExtValue(), SplatBitSize,
9861                                       DAG, dl, VbicVT, VT.is128BitVector(),
9862                                       OtherModImm);
9863       if (Val.getNode()) {
9864         SDValue Input =
9865           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
9866         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
9867         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
9868       }
9869     }
9870   }
9871 
9872   if (!Subtarget->isThumb1Only()) {
9873     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
9874     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
9875       return Result;
9876   }
9877 
9878   return SDValue();
9879 }
9880 
9881 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
9882 static SDValue PerformORCombine(SDNode *N,
9883                                 TargetLowering::DAGCombinerInfo &DCI,
9884                                 const ARMSubtarget *Subtarget) {
9885   // Attempt to use immediate-form VORR
9886   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9887   SDLoc dl(N);
9888   EVT VT = N->getValueType(0);
9889   SelectionDAG &DAG = DCI.DAG;
9890 
9891   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9892     return SDValue();
9893 
9894   APInt SplatBits, SplatUndef;
9895   unsigned SplatBitSize;
9896   bool HasAnyUndefs;
9897   if (BVN && Subtarget->hasNEON() &&
9898       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9899     if (SplatBitSize <= 64) {
9900       EVT VorrVT;
9901       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
9902                                       SplatUndef.getZExtValue(), SplatBitSize,
9903                                       DAG, dl, VorrVT, VT.is128BitVector(),
9904                                       OtherModImm);
9905       if (Val.getNode()) {
9906         SDValue Input =
9907           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
9908         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
9909         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
9910       }
9911     }
9912   }
9913 
9914   if (!Subtarget->isThumb1Only()) {
9915     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
9916     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
9917       return Result;
9918   }
9919 
9920   // The code below optimizes (or (and X, Y), Z).
9921   // The AND operand needs to have a single user to make these optimizations
9922   // profitable.
9923   SDValue N0 = N->getOperand(0);
9924   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
9925     return SDValue();
9926   SDValue N1 = N->getOperand(1);
9927 
9928   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
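  // VBSL with a constant mask A selects each bit from B where the
  // corresponding bit of A is set and from C where it is clear, which is
  // exactly this OR of two complementary masked values.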
9929   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
9930       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
9931     APInt SplatUndef;
9932     unsigned SplatBitSize;
9933     bool HasAnyUndefs;
9934 
9935     APInt SplatBits0, SplatBits1;
9936     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
9937     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
9938     // Ensure that the second operand of both ands are constants
9939     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
9940                                       HasAnyUndefs) && !HasAnyUndefs) {
9941         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
9942                                           HasAnyUndefs) && !HasAnyUndefs) {
9943             // Ensure that the bit width of the constants are the same and that
9944             // the splat arguments are logical inverses as per the pattern we
9945             // are trying to simplify.
9946             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
9947                 SplatBits0 == ~SplatBits1) {
9948                 // Canonicalize the vector type to make instruction selection
9949                 // simpler.
9950                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
9951                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
9952                                              N0->getOperand(1),
9953                                              N0->getOperand(0),
9954                                              N1->getOperand(0));
9955                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
9956             }
9957         }
9958     }
9959   }
9960 
9961   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
9962   // reasonable.
9963 
9964   // BFI is only available on V6T2+
9965   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
9966     return SDValue();
9967 
9968   SDLoc DL(N);
9969   // 1) or (and A, mask), val => ARMbfi A, val, mask
9970   //      iff (val & mask) == val
9971   //
9972   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
9973   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
9974   //          && mask == ~mask2
9975   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
9976   //          && ~mask == mask2
9977   //  (i.e., copy a bitfield value into another bitfield of the same width)
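  // For example, with pattern (1): (or (and A, 0xffff00ff), 0x00005600) becomes
  // a single BFI that inserts the 8-bit field 0x56 into bits 8..15 of A.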
9978 
9979   if (VT != MVT::i32)
9980     return SDValue();
9981 
9982   SDValue N00 = N0.getOperand(0);
9983 
9984   // The value and the mask need to be constants so we can verify this is
9985   // actually a bitfield set. If the mask is 0xffff, we can do better
9986   // via a movt instruction, so don't use BFI in that case.
9987   SDValue MaskOp = N0.getOperand(1);
9988   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
9989   if (!MaskC)
9990     return SDValue();
9991   unsigned Mask = MaskC->getZExtValue();
9992   if (Mask == 0xffff)
9993     return SDValue();
9994   SDValue Res;
9995   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
9996   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9997   if (N1C) {
9998     unsigned Val = N1C->getZExtValue();
9999     if ((Val & ~Mask) != Val)
10000       return SDValue();
10001 
10002     if (ARM::isBitFieldInvertedMask(Mask)) {
10003       Val >>= countTrailingZeros(~Mask);
10004 
10005       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
10006                         DAG.getConstant(Val, DL, MVT::i32),
10007                         DAG.getConstant(Mask, DL, MVT::i32));
10008 
10009       // Do not add new nodes to DAG combiner worklist.
10010       DCI.CombineTo(N, Res, false);
10011       return SDValue();
10012     }
10013   } else if (N1.getOpcode() == ISD::AND) {
10014     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
10015     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
10016     if (!N11C)
10017       return SDValue();
10018     unsigned Mask2 = N11C->getZExtValue();
10019 
10020     // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern
10021     // to match as-is.
10022     if (ARM::isBitFieldInvertedMask(Mask) &&
10023         (Mask == ~Mask2)) {
10024       // The pack halfword instruction works better for masks that fit it,
10025       // so use that when it's available.
10026       if (Subtarget->hasT2ExtractPack() &&
10027           (Mask == 0xffff || Mask == 0xffff0000))
10028         return SDValue();
10029       // 2a
10030       unsigned amt = countTrailingZeros(Mask2);
10031       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
10032                         DAG.getConstant(amt, DL, MVT::i32));
10033       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
10034                         DAG.getConstant(Mask, DL, MVT::i32));
10035       // Do not add new nodes to DAG combiner worklist.
10036       DCI.CombineTo(N, Res, false);
10037       return SDValue();
10038     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
10039                (~Mask == Mask2)) {
10040       // The pack halfword instruction works better for masks that fit it,
10041       // so use that when it's available.
10042       if (Subtarget->hasT2ExtractPack() &&
10043           (Mask2 == 0xffff || Mask2 == 0xffff0000))
10044         return SDValue();
10045       // 2b
10046       unsigned lsb = countTrailingZeros(Mask);
10047       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
10048                         DAG.getConstant(lsb, DL, MVT::i32));
10049       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
10050                         DAG.getConstant(Mask2, DL, MVT::i32));
10051       // Do not add new nodes to DAG combiner worklist.
10052       DCI.CombineTo(N, Res, false);
10053       return SDValue();
10054     }
10055   }
10056 
10057   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
10058       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
10059       ARM::isBitFieldInvertedMask(~Mask)) {
10060     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
10061     // where lsb(mask) == #shamt and masked bits of B are known zero.
10062     SDValue ShAmt = N00.getOperand(1);
10063     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
10064     unsigned LSB = countTrailingZeros(Mask);
10065     if (ShAmtC != LSB)
10066       return SDValue();
10067 
10068     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
10069                       DAG.getConstant(~Mask, DL, MVT::i32));
10070 
10071     // Do not add new nodes to DAG combiner worklist.
10072     DCI.CombineTo(N, Res, false);
10073   }
10074 
10075   return SDValue();
10076 }
10077 
10078 static SDValue PerformXORCombine(SDNode *N,
10079                                  TargetLowering::DAGCombinerInfo &DCI,
10080                                  const ARMSubtarget *Subtarget) {
10081   EVT VT = N->getValueType(0);
10082   SelectionDAG &DAG = DCI.DAG;
10083 
10084   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
10085     return SDValue();
10086 
10087   if (!Subtarget->isThumb1Only()) {
10088     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
10089     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
10090       return Result;
10091   }
10092 
10093   return SDValue();
10094 }
10095 
10096 // ParseBFI - Given a BFI instruction in N, extract the "from" value (Rn) and
10097 // return it, and fill in FromMask and ToMask with (consecutive) bits in "from"
10098 // to be extracted and their position in "to" (Rd).
10099 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
10100   assert(N->getOpcode() == ARMISD::BFI);
10101 
10102   SDValue From = N->getOperand(1);
10103   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
10104   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
10105 
10106   // If the Base came from a SHR #C, we can deduce that it is really testing bit
10107   // #C in the base of the SHR.
10108   if (From->getOpcode() == ISD::SRL &&
10109       isa<ConstantSDNode>(From->getOperand(1))) {
10110     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
10111     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
10112     FromMask <<= Shift.getLimitedValue(31);
10113     From = From->getOperand(0);
10114   }
10115 
10116   return From;
10117 }
10118 
10119 // If A and B each contain one contiguous run of set bits, does A | B equal the
10120 // concatenation A.B, i.e. does A's run sit immediately above B's with no
10121 // overlap and no gap?  Neither A nor B may be zero.
10122 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
10123   unsigned LastActiveBitInA =  A.countTrailingZeros();
10124   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
10125   return LastActiveBitInA - 1 == FirstActiveBitInB;
10126 }
10127 
10128 static SDValue FindBFIToCombineWith(SDNode *N) {
10129   // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
10130   // if one exists.
10131   APInt ToMask, FromMask;
10132   SDValue From = ParseBFI(N, ToMask, FromMask);
10133   SDValue To = N->getOperand(0);
10134 
10135   // Now check for a compatible BFI to merge with. We can pass through BFIs that
10136   // aren't compatible, but not if they set the same bit in their destination as
10137   // we do (or that of any BFI we're going to combine with).
10138   SDValue V = To;
10139   APInt CombinedToMask = ToMask;
10140   while (V.getOpcode() == ARMISD::BFI) {
10141     APInt NewToMask, NewFromMask;
10142     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
10143     if (NewFrom != From) {
10144       // This BFI has a different base. Keep going.
10145       CombinedToMask |= NewToMask;
10146       V = V.getOperand(0);
10147       continue;
10148     }
10149 
10150     // Do the written bits conflict with any we've seen so far?
10151     if ((NewToMask & CombinedToMask).getBoolValue())
10152       // Conflicting bits - bail out because going further is unsafe.
10153       return SDValue();
10154 
10155     // Are the new bits contiguous when combined with the old bits?
10156     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
10157         BitsProperlyConcatenate(FromMask, NewFromMask))
10158       return V;
10159     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
10160         BitsProperlyConcatenate(NewFromMask, FromMask))
10161       return V;
10162 
10163     // We've seen a write to some bits, so track it.
10164     CombinedToMask |= NewToMask;
10165     // Keep going...
10166     V = V.getOperand(0);
10167   }
10168 
10169   return SDValue();
10170 }
10171 
10172 static SDValue PerformBFICombine(SDNode *N,
10173                                  TargetLowering::DAGCombinerInfo &DCI) {
10174   SDValue N1 = N->getOperand(1);
10175   if (N1.getOpcode() == ISD::AND) {
10176     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
10177     // the bits being cleared by the AND are not demanded by the BFI.
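    // For example, (bfi A, (and B, 0xff), 0xffffff00) inserts only the low 8
    // bits of B, which the AND already preserves, so it can be rewritten as
    // (bfi A, B, 0xffffff00).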
10178     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
10179     if (!N11C)
10180       return SDValue();
10181     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
10182     unsigned LSB = countTrailingZeros(~InvMask);
10183     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
10184     assert(Width <
10185                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
10186            "undefined behavior");
10187     unsigned Mask = (1u << Width) - 1;
10188     unsigned Mask2 = N11C->getZExtValue();
10189     if ((Mask & (~Mask2)) == 0)
10190       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
10191                              N->getOperand(0), N1.getOperand(0),
10192                              N->getOperand(2));
10193   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
10194     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
10195     // Keep track of any consecutive bits set that all come from the same base
10196     // value. We can combine these together into a single BFI.
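    // For example (with illustrative masks):
    //   (bfi (bfi A, X, 0xffffff0f), (srl X, 4), 0xfffff0ff)
    // inserts X[3:0] into bits [7:4] of A and X[7:4] into bits [11:8], and
    // can be merged into a single BFI inserting X[7:0] into bits [11:4]:
    //   (bfi A, X, 0xfffff00f)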
10197     SDValue CombineBFI = FindBFIToCombineWith(N);
10198     if (CombineBFI == SDValue())
10199       return SDValue();
10200 
10201     // We've found a BFI.
10202     APInt ToMask1, FromMask1;
10203     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
10204 
10205     APInt ToMask2, FromMask2;
10206     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
10207     assert(From1 == From2);
10208     (void)From2;
10209 
10210     // First, unlink CombineBFI.
10211     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
10212     // Then create a new BFI, combining the two together.
10213     APInt NewFromMask = FromMask1 | FromMask2;
10214     APInt NewToMask = ToMask1 | ToMask2;
10215 
10216     EVT VT = N->getValueType(0);
10217     SDLoc dl(N);
10218 
10219     if (NewFromMask[0] == 0)
10220       From1 = DCI.DAG.getNode(
10221         ISD::SRL, dl, VT, From1,
10222         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
10223     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
10224                            DCI.DAG.getConstant(~NewToMask, dl, VT));
10225   }
10226   return SDValue();
10227 }
10228 
10229 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
10230 /// ARMISD::VMOVRRD.
10231 static SDValue PerformVMOVRRDCombine(SDNode *N,
10232                                      TargetLowering::DAGCombinerInfo &DCI,
10233                                      const ARMSubtarget *Subtarget) {
10234   // vmovrrd(vmovdrr x, y) -> x,y
10235   SDValue InDouble = N->getOperand(0);
10236   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
10237     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
10238 
10239   // vmovrrd(load f64) -> (load i32), (load i32)
10240   SDNode *InNode = InDouble.getNode();
10241   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
10242       InNode->getValueType(0) == MVT::f64 &&
10243       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
10244       !cast<LoadSDNode>(InNode)->isVolatile()) {
10245     // TODO: Should this be done for non-FrameIndex operands?
10246     LoadSDNode *LD = cast<LoadSDNode>(InNode);
10247 
10248     SelectionDAG &DAG = DCI.DAG;
10249     SDLoc DL(LD);
10250     SDValue BasePtr = LD->getBasePtr();
10251     SDValue NewLD1 =
10252         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
10253                     LD->getAlignment(), LD->getMemOperand()->getFlags());
10254 
10255     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10256                                     DAG.getConstant(4, DL, MVT::i32));
10257     SDValue NewLD2 = DAG.getLoad(
10258         MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
10259         std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
10260 
10261     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
10262     if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap(NewLD1, NewLD2);
10264     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
10265     return Result;
10266   }
10267 
10268   return SDValue();
10269 }
10270 
10271 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
10272 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
10273 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
10274   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
10275   SDValue Op0 = N->getOperand(0);
10276   SDValue Op1 = N->getOperand(1);
10277   if (Op0.getOpcode() == ISD::BITCAST)
10278     Op0 = Op0.getOperand(0);
10279   if (Op1.getOpcode() == ISD::BITCAST)
10280     Op1 = Op1.getOperand(0);
10281   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
10282       Op0.getNode() == Op1.getNode() &&
10283       Op0.getResNo() == 0 && Op1.getResNo() == 1)
10284     return DAG.getNode(ISD::BITCAST, SDLoc(N),
10285                        N->getValueType(0), Op0.getOperand(0));
10286   return SDValue();
10287 }
10288 
10289 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
10290 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
10291 /// i64 vector to have f64 elements, since the value can then be loaded
10292 /// directly into a VFP register.
10293 static bool hasNormalLoadOperand(SDNode *N) {
10294   unsigned NumElts = N->getValueType(0).getVectorNumElements();
10295   for (unsigned i = 0; i < NumElts; ++i) {
10296     SDNode *Elt = N->getOperand(i).getNode();
10297     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
10298       return true;
10299   }
10300   return false;
10301 }
10302 
10303 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
10304 /// ISD::BUILD_VECTOR.
10305 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
10306                                           TargetLowering::DAGCombinerInfo &DCI,
10307                                           const ARMSubtarget *Subtarget) {
10308   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
10309   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
10310   // into a pair of GPRs, which is fine when the value is used as a scalar,
10311   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
10312   SelectionDAG &DAG = DCI.DAG;
10313   if (N->getNumOperands() == 2)
10314     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
10315       return RV;
10316 
10317   // Load i64 elements as f64 values so that type legalization does not split
10318   // them up into i32 values.
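  // For example, a v2i64 (build_vector (load x), (load y)) is rewritten as a
  // bitcast of a v2f64 build_vector of f64 bitcasts of the loads, so each
  // element can be loaded directly into a VFP register.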
10319   EVT VT = N->getValueType(0);
10320   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
10321     return SDValue();
10322   SDLoc dl(N);
10323   SmallVector<SDValue, 8> Ops;
10324   unsigned NumElts = VT.getVectorNumElements();
10325   for (unsigned i = 0; i < NumElts; ++i) {
10326     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
10327     Ops.push_back(V);
10328     // Make the DAGCombiner fold the bitcast.
10329     DCI.AddToWorklist(V.getNode());
10330   }
10331   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
10332   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
10333   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
10334 }
10335 
10336 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
10337 static SDValue
10338 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10339   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
10340   // At that time, we may have inserted bitcasts from integer to float.
10341   // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., one that does not
  // force the use of floating point types.
10344 
10345   // Make sure we can change the type of the vector.
10346   // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
10348   //    1.1. Vector is used only once.
10349   //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64-bit operands are not legal).
10351   EVT VT = N->getValueType(0);
10352   EVT EltVT = VT.getVectorElementType();
10353 
10354   // Check 1.1. and 2.
10355   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
10356     return SDValue();
10357 
10358   // By construction, the input type must be float.
10359   assert(EltVT == MVT::f32 && "Unexpected type!");
10360 
10361   // Check 1.2.
10362   SDNode *Use = *N->use_begin();
10363   if (Use->getOpcode() != ISD::BITCAST ||
10364       Use->getValueType(0).isFloatingPoint())
10365     return SDValue();
10366 
10367   // Check profitability.
  // The model is: if more than half of the relevant operands are bitcast
  // from i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are those that are not statically (i.e., at compile
  // time) bitcast.
10372   unsigned NumOfBitCastedElts = 0;
10373   unsigned NumElts = VT.getVectorNumElements();
10374   unsigned NumOfRelevantElts = NumElts;
10375   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
10376     SDValue Elt = N->getOperand(Idx);
10377     if (Elt->getOpcode() == ISD::BITCAST) {
10378       // Assume only bit cast to i32 will go away.
10379       if (Elt->getOperand(0).getValueType() == MVT::i32)
10380         ++NumOfBitCastedElts;
10381     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically bitcast, so do not count them as
10383       // relevant operands.
10384       --NumOfRelevantElts;
10385   }
10386 
10387   // Check if more than half of the elements require a non-free bitcast.
10388   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
10389     return SDValue();
10390 
10391   SelectionDAG &DAG = DCI.DAG;
10392   // Create the new vector type.
10393   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
10394   // Check if the type is legal.
10395   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10396   if (!TLI.isTypeLegal(VecVT))
10397     return SDValue();
10398 
10399   // Combine:
10400   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
10401   // => BITCAST INSERT_VECTOR_ELT
10402   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
10403   //                      (BITCAST EN), N.
10404   SDValue Vec = DAG.getUNDEF(VecVT);
10405   SDLoc dl(N);
10406   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
10407     SDValue V = N->getOperand(Idx);
10408     if (V.isUndef())
10409       continue;
10410     if (V.getOpcode() == ISD::BITCAST &&
10411         V->getOperand(0).getValueType() == MVT::i32)
10412       // Fold obvious case.
10413       V = V.getOperand(0);
10414     else {
10415       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
10416       // Make the DAGCombiner fold the bitcasts.
10417       DCI.AddToWorklist(V.getNode());
10418     }
10419     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
10420     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
10421   }
10422   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
10423   // Make the DAGCombiner fold the bitcasts.
10424   DCI.AddToWorklist(Vec.getNode());
10425   return Vec;
10426 }
10427 
10428 /// PerformInsertEltCombine - Target-specific dag combine xforms for
10429 /// ISD::INSERT_VECTOR_ELT.
10430 static SDValue PerformInsertEltCombine(SDNode *N,
10431                                        TargetLowering::DAGCombinerInfo &DCI) {
10432   // Bitcast an i64 load inserted into a vector to f64.
10433   // Otherwise, the i64 value will be legalized to a pair of i32 values.
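  // For example, (v2i64 insert_vector_elt V, (i64 load), Idx) is rewritten to
  // insert the f64 bitcast of the load into (bitcast V to v2f64) and bitcast
  // the result back to v2i64.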
10434   EVT VT = N->getValueType(0);
10435   SDNode *Elt = N->getOperand(1).getNode();
10436   if (VT.getVectorElementType() != MVT::i64 ||
10437       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
10438     return SDValue();
10439 
10440   SelectionDAG &DAG = DCI.DAG;
10441   SDLoc dl(N);
10442   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10443                                  VT.getVectorNumElements());
10444   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
10445   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
10446   // Make the DAGCombiner fold the bitcasts.
10447   DCI.AddToWorklist(Vec.getNode());
10448   DCI.AddToWorklist(V.getNode());
10449   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
10450                                Vec, V, N->getOperand(2));
10451   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
10452 }
10453 
10454 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
10455 /// ISD::VECTOR_SHUFFLE.
10456 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
10457   // The LLVM shufflevector instruction does not require the shuffle mask
10458   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
10459   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
10460   // operands do not match the mask length, they are extended by concatenating
10461   // them with undef vectors.  That is probably the right thing for other
10462   // targets, but for NEON it is better to concatenate two double-register
10463   // size vector operands into a single quad-register size vector.  Do that
10464   // transformation here:
10465   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
10466   //   shuffle(concat(v1, v2), undef)
10467   SDValue Op0 = N->getOperand(0);
10468   SDValue Op1 = N->getOperand(1);
10469   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
10470       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
10471       Op0.getNumOperands() != 2 ||
10472       Op1.getNumOperands() != 2)
10473     return SDValue();
10474   SDValue Concat0Op1 = Op0.getOperand(1);
10475   SDValue Concat1Op1 = Op1.getOperand(1);
10476   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
10477     return SDValue();
10478   // Skip the transformation if any of the types are illegal.
10479   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10480   EVT VT = N->getValueType(0);
10481   if (!TLI.isTypeLegal(VT) ||
10482       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
10483       !TLI.isTypeLegal(Concat1Op1.getValueType()))
10484     return SDValue();
10485 
10486   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
10487                                   Op0.getOperand(0), Op1.getOperand(0));
10488   // Translate the shuffle mask.
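  // Elements taken from the low half of the first concat keep their index,
  // elements taken from the low half of the second concat are remapped into
  // the upper half of the new concat, and anything that referred to the undef
  // halves becomes -1 (undef).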
10489   SmallVector<int, 16> NewMask;
10490   unsigned NumElts = VT.getVectorNumElements();
10491   unsigned HalfElts = NumElts/2;
10492   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
10493   for (unsigned n = 0; n < NumElts; ++n) {
10494     int MaskElt = SVN->getMaskElt(n);
10495     int NewElt = -1;
10496     if (MaskElt < (int)HalfElts)
10497       NewElt = MaskElt;
10498     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
10499       NewElt = HalfElts + MaskElt - NumElts;
10500     NewMask.push_back(NewElt);
10501   }
10502   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
10503                               DAG.getUNDEF(VT), NewMask);
10504 }
10505 
10506 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
10507 /// NEON load/store intrinsics, and generic vector load/stores, to merge
10508 /// base address updates.
10509 /// For generic load/stores, the memory type is assumed to be a vector.
10510 /// The caller is assumed to have checked legality.
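/// For example, if the address of a vld1 intrinsic is also incremented by the
/// access size via a separate ISD::ADD, the load and the add are merged into
/// a single VLD1_UPD node that additionally produces the updated address.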
10511 static SDValue CombineBaseUpdate(SDNode *N,
10512                                  TargetLowering::DAGCombinerInfo &DCI) {
10513   SelectionDAG &DAG = DCI.DAG;
10514   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
10515                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
10516   const bool isStore = N->getOpcode() == ISD::STORE;
10517   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
10518   SDValue Addr = N->getOperand(AddrOpIdx);
10519   MemSDNode *MemN = cast<MemSDNode>(N);
10520   SDLoc dl(N);
10521 
10522   // Search for a use of the address operand that is an increment.
10523   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
10524          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
10525     SDNode *User = *UI;
10526     if (User->getOpcode() != ISD::ADD ||
10527         UI.getUse().getResNo() != Addr.getResNo())
10528       continue;
10529 
10530     // Check that the add is independent of the load/store.  Otherwise, folding
10531     // it would create a cycle.
10532     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
10533       continue;
10534 
10535     // Find the new opcode for the updating load/store.
10536     bool isLoadOp = true;
10537     bool isLaneOp = false;
10538     unsigned NewOpc = 0;
10539     unsigned NumVecs = 0;
10540     if (isIntrinsic) {
10541       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
10542       switch (IntNo) {
10543       default: llvm_unreachable("unexpected intrinsic for Neon base update");
10544       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
10545         NumVecs = 1; break;
10546       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
10547         NumVecs = 2; break;
10548       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
10549         NumVecs = 3; break;
10550       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
10551         NumVecs = 4; break;
10552       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
10553         NumVecs = 2; isLaneOp = true; break;
10554       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
10555         NumVecs = 3; isLaneOp = true; break;
10556       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
10557         NumVecs = 4; isLaneOp = true; break;
10558       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
10559         NumVecs = 1; isLoadOp = false; break;
10560       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
10561         NumVecs = 2; isLoadOp = false; break;
10562       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
10563         NumVecs = 3; isLoadOp = false; break;
10564       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
10565         NumVecs = 4; isLoadOp = false; break;
10566       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
10567         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
10568       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
10569         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
10570       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
10571         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
10572       }
10573     } else {
10574       isLaneOp = true;
10575       switch (N->getOpcode()) {
10576       default: llvm_unreachable("unexpected opcode for Neon base update");
10577       case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
10578       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
10579       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
10580       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
10581       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
10582         NumVecs = 1; isLaneOp = false; break;
10583       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
10584         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
10585       }
10586     }
10587 
10588     // Find the size of memory referenced by the load/store.
10589     EVT VecTy;
10590     if (isLoadOp) {
10591       VecTy = N->getValueType(0);
10592     } else if (isIntrinsic) {
10593       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
10594     } else {
10595       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
10596       VecTy = N->getOperand(1).getValueType();
10597     }
10598 
10599     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
10600     if (isLaneOp)
10601       NumBytes /= VecTy.getVectorNumElements();
10602 
10603     // If the increment is a constant, it must match the memory ref size.
10604     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
10605     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
10606       uint64_t IncVal = CInc->getZExtValue();
10607       if (IncVal != NumBytes)
10608         continue;
10609     } else if (NumBytes >= 3 * 16) {
10610       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
10611       // separate instructions that make it harder to use a non-constant update.
10612       continue;
10613     }
10614 
10615     // OK, we found an ADD we can fold into the base update.
10616     // Now, create a _UPD node, taking care of not breaking alignment.
10617 
10618     EVT AlignedVecTy = VecTy;
10619     unsigned Alignment = MemN->getAlignment();
10620 
10621     // If this is a less-than-standard-aligned load/store, change the type to
10622     // match the standard alignment.
    // The alignment is ignored when selecting the _UPD variants; it's easier
    // to introduce bitcasts here than to fix that.
10625     // There are 3 ways to get to this base-update combine:
10626     // - intrinsics: they are assumed to be properly aligned (to the standard
10627     //   alignment of the memory type), so we don't need to do anything.
10628     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
10629     //   intrinsics, so, likewise, there's nothing to do.
10630     // - generic load/store instructions: the alignment is specified as an
10631     //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics).  We need to change the
10633     //   memory type to match the explicit alignment.  That way, we don't
10634     //   generate non-standard-aligned ARMISD::VLDx nodes.
10635     if (isa<LSBaseSDNode>(N)) {
10636       if (Alignment == 0)
10637         Alignment = 1;
10638       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
10639         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
10640         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
10641         assert(!isLaneOp && "Unexpected generic load/store lane.");
10642         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
10643         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
10644       }
10645       // Don't set an explicit alignment on regular load/stores that we want
10646       // to transform to VLD/VST 1_UPD nodes.
10647       // This matches the behavior of regular load/stores, which only get an
10648       // explicit alignment if the MMO alignment is larger than the standard
10649       // alignment of the memory type.
10650       // Intrinsics, however, always get an explicit alignment, set to the
10651       // alignment of the MMO.
10652       Alignment = 1;
10653     }
10654 
10655     // Create the new updating load/store node.
10656     // First, create an SDVTList for the new updating node's results.
10657     EVT Tys[6];
10658     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
10659     unsigned n;
10660     for (n = 0; n < NumResultVecs; ++n)
10661       Tys[n] = AlignedVecTy;
10662     Tys[n++] = MVT::i32;
10663     Tys[n] = MVT::Other;
10664     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
10665 
10666     // Then, gather the new node's operands.
10667     SmallVector<SDValue, 8> Ops;
10668     Ops.push_back(N->getOperand(0)); // incoming chain
10669     Ops.push_back(N->getOperand(AddrOpIdx));
10670     Ops.push_back(Inc);
10671 
10672     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
10673       // Try to match the intrinsic's signature
10674       Ops.push_back(StN->getValue());
10675     } else {
10676       // Loads (and of course intrinsics) match the intrinsics' signature,
10677       // so just add all but the alignment operand.
10678       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
10679         Ops.push_back(N->getOperand(i));
10680     }
10681 
10682     // For all node types, the alignment operand is always the last one.
10683     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
10684 
10685     // If this is a non-standard-aligned STORE, the penultimate operand is the
10686     // stored value.  Bitcast it to the aligned type.
10687     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
10688       SDValue &StVal = Ops[Ops.size()-2];
10689       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
10690     }
10691 
10692     EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
10693     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
10694                                            MemN->getMemOperand());
10695 
10696     // Update the uses.
10697     SmallVector<SDValue, 5> NewResults;
10698     for (unsigned i = 0; i < NumResultVecs; ++i)
10699       NewResults.push_back(SDValue(UpdN.getNode(), i));
10700 
    // If this is a non-standard-aligned LOAD, the first result is the loaded
10702     // value.  Bitcast it to the expected result type.
10703     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
10704       SDValue &LdVal = NewResults[0];
10705       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
10706     }
10707 
10708     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
10709     DCI.CombineTo(N, NewResults);
10710     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
10711 
10712     break;
10713   }
10714   return SDValue();
10715 }
10716 
10717 static SDValue PerformVLDCombine(SDNode *N,
10718                                  TargetLowering::DAGCombinerInfo &DCI) {
10719   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
10720     return SDValue();
10721 
10722   return CombineBaseUpdate(N, DCI);
10723 }
10724 
10725 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
10726 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
10727 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
10728 /// return true.
10729 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10730   SelectionDAG &DAG = DCI.DAG;
10731   EVT VT = N->getValueType(0);
10732   // vldN-dup instructions only support 64-bit vectors for N > 1.
10733   if (!VT.is64BitVector())
10734     return false;
10735 
10736   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
10737   SDNode *VLD = N->getOperand(0).getNode();
10738   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
10739     return false;
10740   unsigned NumVecs = 0;
10741   unsigned NewOpc = 0;
10742   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
10743   if (IntNo == Intrinsic::arm_neon_vld2lane) {
10744     NumVecs = 2;
10745     NewOpc = ARMISD::VLD2DUP;
10746   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
10747     NumVecs = 3;
10748     NewOpc = ARMISD::VLD3DUP;
10749   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
10750     NumVecs = 4;
10751     NewOpc = ARMISD::VLD4DUP;
10752   } else {
10753     return false;
10754   }
10755 
10756   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
10757   // numbers match the load.
10758   unsigned VLDLaneNo =
10759     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
10760   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10761        UI != UE; ++UI) {
10762     // Ignore uses of the chain result.
10763     if (UI.getUse().getResNo() == NumVecs)
10764       continue;
10765     SDNode *User = *UI;
10766     if (User->getOpcode() != ARMISD::VDUPLANE ||
10767         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
10768       return false;
10769   }
10770 
10771   // Create the vldN-dup node.
10772   EVT Tys[5];
10773   unsigned n;
10774   for (n = 0; n < NumVecs; ++n)
10775     Tys[n] = VT;
10776   Tys[n] = MVT::Other;
10777   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
10778   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
10779   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
10780   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
10781                                            Ops, VLDMemInt->getMemoryVT(),
10782                                            VLDMemInt->getMemOperand());
10783 
10784   // Update the uses.
10785   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10786        UI != UE; ++UI) {
10787     unsigned ResNo = UI.getUse().getResNo();
10788     // Ignore uses of the chain result.
10789     if (ResNo == NumVecs)
10790       continue;
10791     SDNode *User = *UI;
10792     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
10793   }
10794 
10795   // Now the vldN-lane intrinsic is dead except for its chain result.
10796   // Update uses of the chain.
10797   std::vector<SDValue> VLDDupResults;
10798   for (unsigned n = 0; n < NumVecs; ++n)
10799     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
10800   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
10801   DCI.CombineTo(VLD, VLDDupResults);
10802 
10803   return true;
10804 }
10805 
10806 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
10807 /// ARMISD::VDUPLANE.
10808 static SDValue PerformVDUPLANECombine(SDNode *N,
10809                                       TargetLowering::DAGCombinerInfo &DCI) {
10810   SDValue Op = N->getOperand(0);
10811 
10812   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
10813   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
10814   if (CombineVLDDUP(N, DCI))
10815     return SDValue(N, 0);
10816 
10817   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
10818   // redundant.  Ignore bit_converts for now; element sizes are checked below.
10819   while (Op.getOpcode() == ISD::BITCAST)
10820     Op = Op.getOperand(0);
10821   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
10822     return SDValue();
10823 
10824   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
10825   unsigned EltSize = Op.getScalarValueSizeInBits();
10826   // The canonical VMOV for a zero vector uses a 32-bit element size.
10827   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10828   unsigned EltBits;
10829   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
10830     EltSize = 8;
10831   EVT VT = N->getValueType(0);
10832   if (EltSize > VT.getScalarSizeInBits())
10833     return SDValue();
10834 
10835   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
10836 }
10837 
10838 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
10839 static SDValue PerformVDUPCombine(SDNode *N,
10840                                   TargetLowering::DAGCombinerInfo &DCI) {
10841   SelectionDAG &DAG = DCI.DAG;
10842   SDValue Op = N->getOperand(0);
10843 
10844   // Match VDUP(LOAD) -> VLD1DUP.
10845   // We match this pattern here rather than waiting for isel because the
10846   // transform is only legal for unindexed loads.
10847   LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
10848   if (LD && Op.hasOneUse() && LD->isUnindexed() &&
10849       LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
10850     SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
10851                       DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
10852     SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
10853     SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
10854                                              Ops, LD->getMemoryVT(),
10855                                              LD->getMemOperand());
10856     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
10857     return VLDDup;
10858   }
10859 
10860   return SDValue();
10861 }
10862 
10863 static SDValue PerformLOADCombine(SDNode *N,
10864                                   TargetLowering::DAGCombinerInfo &DCI) {
10865   EVT VT = N->getValueType(0);
10866 
10867   // If this is a legal vector load, try to combine it into a VLD1_UPD.
10868   if (ISD::isNormalLoad(N) && VT.isVector() &&
10869       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10870     return CombineBaseUpdate(N, DCI);
10871 
10872   return SDValue();
10873 }
10874 
10875 /// PerformSTORECombine - Target-specific dag combine xforms for
10876 /// ISD::STORE.
10877 static SDValue PerformSTORECombine(SDNode *N,
10878                                    TargetLowering::DAGCombinerInfo &DCI) {
10879   StoreSDNode *St = cast<StoreSDNode>(N);
10880   if (St->isVolatile())
10881     return SDValue();
10882 
10883   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
10884   // pack all of the elements in one place.  Next, store to memory in fewer
10885   // chunks.
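  // For example, a truncating store of v4i32 to v4i8 becomes a byte shuffle
  // that packs the four truncated bytes into the low lanes, followed by a
  // single i32 store of those lanes.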
10886   SDValue StVal = St->getValue();
10887   EVT VT = StVal.getValueType();
10888   if (St->isTruncatingStore() && VT.isVector()) {
10889     SelectionDAG &DAG = DCI.DAG;
10890     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10891     EVT StVT = St->getMemoryVT();
10892     unsigned NumElems = VT.getVectorNumElements();
10893     assert(StVT != VT && "Cannot truncate to the same type");
10894     unsigned FromEltSz = VT.getScalarSizeInBits();
10895     unsigned ToEltSz = StVT.getScalarSizeInBits();
10896 
    // The source and destination element sizes and the element count must all
    // be powers of two.
10898     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
10899 
    // We are going to store with the original (wider) vector element size, so
    // the total bit width of the vector must be a multiple of the truncated
    // element size.
10902     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
10903 
10904     unsigned SizeRatio  = FromEltSz / ToEltSz;
10905     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
10906 
10907     // Create a type on which we perform the shuffle.
10908     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
10909                                      NumElems*SizeRatio);
10910     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
10911 
10912     SDLoc DL(St);
10913     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
10914     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
10915     for (unsigned i = 0; i < NumElems; ++i)
10916       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
10917                           ? (i + 1) * SizeRatio - 1
10918                           : i * SizeRatio;
10919 
10920     // Can't shuffle using an illegal type.
10921     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
10922 
10923     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
10924                                 DAG.getUNDEF(WideVec.getValueType()),
10925                                 ShuffleVec);
10926     // At this point all of the data is stored at the bottom of the
10927     // register. We now need to save it to mem.
10928 
10929     // Find the largest store unit
10930     MVT StoreType = MVT::i8;
10931     for (MVT Tp : MVT::integer_valuetypes()) {
10932       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
10933         StoreType = Tp;
10934     }
10935     // Didn't find a legal store type.
10936     if (!TLI.isTypeLegal(StoreType))
10937       return SDValue();
10938 
10939     // Bitcast the original vector into a vector of store-size units
10940     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
10941             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
10942     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
10943     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
10944     SmallVector<SDValue, 8> Chains;
10945     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
10946                                         TLI.getPointerTy(DAG.getDataLayout()));
10947     SDValue BasePtr = St->getBasePtr();
10948 
10949     // Perform one or more big stores into memory.
10950     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
10951     for (unsigned I = 0; I < E; I++) {
10952       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
10953                                    StoreType, ShuffWide,
10954                                    DAG.getIntPtrConstant(I, DL));
10955       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
10956                                 St->getPointerInfo(), St->getAlignment(),
10957                                 St->getMemOperand()->getFlags());
10958       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
10959                             Increment);
10960       Chains.push_back(Ch);
10961     }
10962     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
10963   }
10964 
10965   if (!ISD::isNormalStore(St))
10966     return SDValue();
10967 
10968   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
10969   // ARM stores of arguments in the same cache line.
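  // For example, (store (f64 vmovdrr r0, r1)) becomes an i32 store of r0 at
  // the base address and an i32 store of r1 at offset 4 (with the two halves
  // swapped on big-endian targets).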
10970   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
10971       StVal.getNode()->hasOneUse()) {
10972     SelectionDAG  &DAG = DCI.DAG;
10973     bool isBigEndian = DAG.getDataLayout().isBigEndian();
10974     SDLoc DL(St);
10975     SDValue BasePtr = St->getBasePtr();
10976     SDValue NewST1 = DAG.getStore(
10977         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
10978         BasePtr, St->getPointerInfo(), St->getAlignment(),
10979         St->getMemOperand()->getFlags());
10980 
10981     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10982                                     DAG.getConstant(4, DL, MVT::i32));
10983     return DAG.getStore(NewST1.getValue(0), DL,
10984                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
10985                         OffsetPtr, St->getPointerInfo(),
10986                         std::min(4U, St->getAlignment() / 2),
10987                         St->getMemOperand()->getFlags());
10988   }
10989 
10990   if (StVal.getValueType() == MVT::i64 &&
10991       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10992 
10993     // Bitcast an i64 store extracted from a vector to f64.
10994     // Otherwise, the i64 value will be legalized to a pair of i32 values.
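    // For example, (store (i64 extract_vector_elt v2i64 V, Idx)) is rewritten
    // to extract an f64 element from (bitcast V to v2f64) and store the i64
    // bitcast of that element; the bitcasts are expected to fold away.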
10995     SelectionDAG &DAG = DCI.DAG;
10996     SDLoc dl(StVal);
10997     SDValue IntVec = StVal.getOperand(0);
10998     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10999                                    IntVec.getValueType().getVectorNumElements());
11000     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
11001     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
11002                                  Vec, StVal.getOperand(1));
11003     dl = SDLoc(N);
11004     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
11005     // Make the DAGCombiner fold the bitcasts.
11006     DCI.AddToWorklist(Vec.getNode());
11007     DCI.AddToWorklist(ExtElt.getNode());
11008     DCI.AddToWorklist(V.getNode());
11009     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
11010                         St->getPointerInfo(), St->getAlignment(),
11011                         St->getMemOperand()->getFlags(), St->getAAInfo());
11012   }
11013 
11014   // If this is a legal vector store, try to combine it into a VST1_UPD.
11015   if (ISD::isNormalStore(N) && VT.isVector() &&
11016       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
11017     return CombineBaseUpdate(N, DCI);
11018 
11019   return SDValue();
11020 }
11021 
11022 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
11023 /// can replace combinations of VMUL and VCVT (floating-point to integer)
11024 /// when the VMUL has a constant operand that is a power of 2.
11025 ///
11026 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
11027 ///  vmul.f32        d16, d17, d16
11028 ///  vcvt.s32.f32    d16, d16
11029 /// becomes:
11030 ///  vcvt.s32.f32    d16, d16, #3
11031 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
11032                                   const ARMSubtarget *Subtarget) {
11033   if (!Subtarget->hasNEON())
11034     return SDValue();
11035 
11036   SDValue Op = N->getOperand(0);
11037   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
11038       Op.getOpcode() != ISD::FMUL)
11039     return SDValue();
11040 
11041   SDValue ConstVec = Op->getOperand(1);
11042   if (!isa<BuildVectorSDNode>(ConstVec))
11043     return SDValue();
11044 
11045   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
11046   uint32_t FloatBits = FloatTy.getSizeInBits();
11047   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
11048   uint32_t IntBits = IntTy.getSizeInBits();
11049   unsigned NumLanes = Op.getValueType().getVectorNumElements();
11050   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
11051     // These instructions only exist converting from f32 to i32. We can handle
11052     // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
11055     return SDValue();
11056   }
11057 
11058   BitVector UndefElements;
11059   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
11060   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
11061   if (C == -1 || C == 0 || C > 32)
11062     return SDValue();
11063 
11064   SDLoc dl(N);
11065   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
11066   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
11067     Intrinsic::arm_neon_vcvtfp2fxu;
11068   SDValue FixConv = DAG.getNode(
11069       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
11070       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
11071       DAG.getConstant(C, dl, MVT::i32));
11072 
11073   if (IntBits < FloatBits)
11074     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
11075 
11076   return FixConv;
11077 }
11078 
11079 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
11080 /// can replace combinations of VCVT (integer to floating-point) and VDIV
11081 /// when the VDIV has a constant operand that is a power of 2.
11082 ///
11083 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
11084 ///  vcvt.f32.s32    d16, d16
11085 ///  vdiv.f32        d16, d17, d16
11086 /// becomes:
11087 ///  vcvt.f32.s32    d16, d16, #3
11088 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
11089                                   const ARMSubtarget *Subtarget) {
11090   if (!Subtarget->hasNEON())
11091     return SDValue();
11092 
11093   SDValue Op = N->getOperand(0);
11094   unsigned OpOpcode = Op.getNode()->getOpcode();
11095   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
11096       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
11097     return SDValue();
11098 
11099   SDValue ConstVec = N->getOperand(1);
11100   if (!isa<BuildVectorSDNode>(ConstVec))
11101     return SDValue();
11102 
11103   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
11104   uint32_t FloatBits = FloatTy.getSizeInBits();
11105   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
11106   uint32_t IntBits = IntTy.getSizeInBits();
11107   unsigned NumLanes = Op.getValueType().getVectorNumElements();
11108   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
11109     // These instructions only exist converting from i32 to f32. We can handle
11110     // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
11113     return SDValue();
11114   }
11115 
11116   BitVector UndefElements;
11117   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
11118   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
11119   if (C == -1 || C == 0 || C > 32)
11120     return SDValue();
11121 
11122   SDLoc dl(N);
11123   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
11124   SDValue ConvInput = Op.getOperand(0);
11125   if (IntBits < FloatBits)
11126     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
11127                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
11128                             ConvInput);
11129 
11130   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
11131     Intrinsic::arm_neon_vcvtfxu2fp;
11132   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11133                      Op.getValueType(),
11134                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
11135                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
11136 }
11137 
/// getVShiftImm - Check if this is a valid build_vector for the immediate
11139 /// operand of a vector shift operation, where all the elements of the
11140 /// build_vector must have the same constant integer value.
11141 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
11142   // Ignore bit_converts.
11143   while (Op.getOpcode() == ISD::BITCAST)
11144     Op = Op.getOperand(0);
11145   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
11146   APInt SplatBits, SplatUndef;
11147   unsigned SplatBitSize;
11148   bool HasAnyUndefs;
11149   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
11150                                       HasAnyUndefs, ElementBits) ||
11151       SplatBitSize > ElementBits)
11152     return false;
11153   Cnt = SplatBits.getSExtValue();
11154   return true;
11155 }
11156 
11157 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
11158 /// operand of a vector shift left operation.  That value must be in the range:
11159 ///   0 <= Value < ElementBits for a left shift; or
11160 ///   0 <= Value <= ElementBits for a long left shift.
11161 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
11162   assert(VT.isVector() && "vector shift count is not a vector type");
11163   int64_t ElementBits = VT.getScalarSizeInBits();
11164   if (! getVShiftImm(Op, ElementBits, Cnt))
11165     return false;
11166   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
11167 }
11168 
11169 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
11170 /// operand of a vector shift right operation.  For a shift opcode, the value
/// is positive, but for an intrinsic the shift count must be negative. The
11172 /// absolute value must be in the range:
11173 ///   1 <= |Value| <= ElementBits for a right shift; or
11174 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
11175 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
11176                          int64_t &Cnt) {
11177   assert(VT.isVector() && "vector shift count is not a vector type");
11178   int64_t ElementBits = VT.getScalarSizeInBits();
11179   if (! getVShiftImm(Op, ElementBits, Cnt))
11180     return false;
11181   if (!isIntrinsic)
11182     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
11183   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
11184     Cnt = -Cnt;
11185     return true;
11186   }
11187   return false;
11188 }
11189 
11190 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
11191 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
11192   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
11193   switch (IntNo) {
11194   default:
11195     // Don't do anything for most intrinsics.
11196     break;
11197 
11198   // Vector shifts: check for immediate versions and lower them.
11199   // Note: This is done during DAG combining instead of DAG legalizing because
11200   // the build_vectors for 64-bit vector element shift counts are generally
11201   // not legal, and it is hard to see their values after they get legalized to
11202   // loads from a constant pool.
11203   case Intrinsic::arm_neon_vshifts:
11204   case Intrinsic::arm_neon_vshiftu:
11205   case Intrinsic::arm_neon_vrshifts:
11206   case Intrinsic::arm_neon_vrshiftu:
11207   case Intrinsic::arm_neon_vrshiftn:
11208   case Intrinsic::arm_neon_vqshifts:
11209   case Intrinsic::arm_neon_vqshiftu:
11210   case Intrinsic::arm_neon_vqshiftsu:
11211   case Intrinsic::arm_neon_vqshiftns:
11212   case Intrinsic::arm_neon_vqshiftnu:
11213   case Intrinsic::arm_neon_vqshiftnsu:
11214   case Intrinsic::arm_neon_vqrshiftns:
11215   case Intrinsic::arm_neon_vqrshiftnu:
11216   case Intrinsic::arm_neon_vqrshiftnsu: {
11217     EVT VT = N->getOperand(1).getValueType();
11218     int64_t Cnt;
11219     unsigned VShiftOpc = 0;
11220 
11221     switch (IntNo) {
11222     case Intrinsic::arm_neon_vshifts:
11223     case Intrinsic::arm_neon_vshiftu:
11224       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
11225         VShiftOpc = ARMISD::VSHL;
11226         break;
11227       }
11228       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
11229         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
11230                      ARMISD::VSHRs : ARMISD::VSHRu);
11231         break;
11232       }
11233       return SDValue();
11234 
11235     case Intrinsic::arm_neon_vrshifts:
11236     case Intrinsic::arm_neon_vrshiftu:
11237       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
11238         break;
11239       return SDValue();
11240 
11241     case Intrinsic::arm_neon_vqshifts:
11242     case Intrinsic::arm_neon_vqshiftu:
11243       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
11244         break;
11245       return SDValue();
11246 
11247     case Intrinsic::arm_neon_vqshiftsu:
11248       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
11249         break;
11250       llvm_unreachable("invalid shift count for vqshlu intrinsic");
11251 
11252     case Intrinsic::arm_neon_vrshiftn:
11253     case Intrinsic::arm_neon_vqshiftns:
11254     case Intrinsic::arm_neon_vqshiftnu:
11255     case Intrinsic::arm_neon_vqshiftnsu:
11256     case Intrinsic::arm_neon_vqrshiftns:
11257     case Intrinsic::arm_neon_vqrshiftnu:
11258     case Intrinsic::arm_neon_vqrshiftnsu:
11259       // Narrowing shifts require an immediate right shift.
11260       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
11261         break;
11262       llvm_unreachable("invalid shift count for narrowing vector shift "
11263                        "intrinsic");
11264 
11265     default:
11266       llvm_unreachable("unhandled vector shift");
11267     }
11268 
11269     switch (IntNo) {
11270     case Intrinsic::arm_neon_vshifts:
11271     case Intrinsic::arm_neon_vshiftu:
11272       // Opcode already set above.
11273       break;
11274     case Intrinsic::arm_neon_vrshifts:
11275       VShiftOpc = ARMISD::VRSHRs; break;
11276     case Intrinsic::arm_neon_vrshiftu:
11277       VShiftOpc = ARMISD::VRSHRu; break;
11278     case Intrinsic::arm_neon_vrshiftn:
11279       VShiftOpc = ARMISD::VRSHRN; break;
11280     case Intrinsic::arm_neon_vqshifts:
11281       VShiftOpc = ARMISD::VQSHLs; break;
11282     case Intrinsic::arm_neon_vqshiftu:
11283       VShiftOpc = ARMISD::VQSHLu; break;
11284     case Intrinsic::arm_neon_vqshiftsu:
11285       VShiftOpc = ARMISD::VQSHLsu; break;
11286     case Intrinsic::arm_neon_vqshiftns:
11287       VShiftOpc = ARMISD::VQSHRNs; break;
11288     case Intrinsic::arm_neon_vqshiftnu:
11289       VShiftOpc = ARMISD::VQSHRNu; break;
11290     case Intrinsic::arm_neon_vqshiftnsu:
11291       VShiftOpc = ARMISD::VQSHRNsu; break;
11292     case Intrinsic::arm_neon_vqrshiftns:
11293       VShiftOpc = ARMISD::VQRSHRNs; break;
11294     case Intrinsic::arm_neon_vqrshiftnu:
11295       VShiftOpc = ARMISD::VQRSHRNu; break;
11296     case Intrinsic::arm_neon_vqrshiftnsu:
11297       VShiftOpc = ARMISD::VQRSHRNsu; break;
11298     }
11299 
11300     SDLoc dl(N);
11301     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11302                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
11303   }
11304 
11305   case Intrinsic::arm_neon_vshiftins: {
11306     EVT VT = N->getOperand(1).getValueType();
11307     int64_t Cnt;
11308     unsigned VShiftOpc = 0;
11309 
11310     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
11311       VShiftOpc = ARMISD::VSLI;
11312     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
11313       VShiftOpc = ARMISD::VSRI;
11314     else {
11315       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
11316     }
11317 
11318     SDLoc dl(N);
11319     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11320                        N->getOperand(1), N->getOperand(2),
11321                        DAG.getConstant(Cnt, dl, MVT::i32));
11322   }
11323 
11324   case Intrinsic::arm_neon_vqrshifts:
11325   case Intrinsic::arm_neon_vqrshiftu:
11326     // No immediate versions of these to check for.
11327     break;
11328   }
11329 
11330   return SDValue();
11331 }
11332 
11333 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
11334 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
11335 /// combining instead of DAG legalizing because the build_vectors for 64-bit
11336 /// vector element shift counts are generally not legal, and it is hard to see
11337 /// their values after they get legalized to loads from a constant pool.
11338 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
11339                                    const ARMSubtarget *ST) {
11340   EVT VT = N->getValueType(0);
11341   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
11342     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16 bits of x are zero. This optimizes rev + lsr 16 to rev16.
11344     SDValue N1 = N->getOperand(1);
11345     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
11346       SDValue N0 = N->getOperand(0);
11347       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
11348           DAG.MaskedValueIsZero(N0.getOperand(0),
11349                                 APInt::getHighBitsSet(32, 16)))
11350         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
11351     }
11352   }
11353 
11354   // Nothing to be done for scalar shifts.
11355   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11356   if (!VT.isVector() || !TLI.isTypeLegal(VT))
11357     return SDValue();
11358 
11359   assert(ST->hasNEON() && "unexpected vector shift");
11360   int64_t Cnt;
11361 
11362   switch (N->getOpcode()) {
11363   default: llvm_unreachable("unexpected shift opcode");
11364 
11365   case ISD::SHL:
11366     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
11367       SDLoc dl(N);
11368       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
11369                          DAG.getConstant(Cnt, dl, MVT::i32));
11370     }
11371     break;
11372 
11373   case ISD::SRA:
11374   case ISD::SRL:
11375     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
11376       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
11377                             ARMISD::VSHRs : ARMISD::VSHRu);
11378       SDLoc dl(N);
11379       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
11380                          DAG.getConstant(Cnt, dl, MVT::i32));
11381     }
11382   }
11383   return SDValue();
11384 }
11385 
11386 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
11387 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
11388 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
11389                                     const ARMSubtarget *ST) {
11390   SDValue N0 = N->getOperand(0);
11391 
11392   // Check for sign- and zero-extensions of vector extract operations of 8-
11393   // and 16-bit vector elements.  NEON supports these directly.  They are
11394   // handled during DAG combining because type legalization will promote them
11395   // to 32-bit types and it is messy to recognize the operations after that.
11396   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
11397     SDValue Vec = N0.getOperand(0);
11398     SDValue Lane = N0.getOperand(1);
11399     EVT VT = N->getValueType(0);
11400     EVT EltVT = N0.getValueType();
11401     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11402 
11403     if (VT == MVT::i32 &&
11404         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
11405         TLI.isTypeLegal(Vec.getValueType()) &&
11406         isa<ConstantSDNode>(Lane)) {
11407 
11408       unsigned Opc = 0;
11409       switch (N->getOpcode()) {
11410       default: llvm_unreachable("unexpected opcode");
11411       case ISD::SIGN_EXTEND:
11412         Opc = ARMISD::VGETLANEs;
11413         break;
11414       case ISD::ZERO_EXTEND:
11415       case ISD::ANY_EXTEND:
11416         Opc = ARMISD::VGETLANEu;
11417         break;
11418       }
11419       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
11420     }
11421   }
11422 
11423   return SDValue();
11424 }
11425 
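/// computeKnownBits - Helper for PerformCMOVToBFICombine: like
/// SelectionDAG::computeKnownBits, but also looks through ARMISD::BFI and
/// ARMISD::CMOV so known-zero bits can be tracked through them.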
11426 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
11427                              APInt &KnownOne) {
11428   if (Op.getOpcode() == ARMISD::BFI) {
11429     // Conservatively, we can recurse down the first operand
11430     // and just mask out all affected bits.
11431     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
11432 
11433     // The operand to BFI is already a mask suitable for removing the bits it
11434     // sets.
11435     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
11436     const APInt &Mask = CI->getAPIntValue();
11437     KnownZero &= Mask;
11438     KnownOne &= Mask;
11439     return;
11440   }
11441   if (Op.getOpcode() == ARMISD::CMOV) {
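    // For a CMOV, only the bits known in both the true and false operands are
    // known in the result.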
11442     APInt KZ2(KnownZero.getBitWidth(), 0);
11443     APInt KO2(KnownOne.getBitWidth(), 0);
11444     computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
11445     computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
11446 
11447     KnownZero &= KZ2;
11448     KnownOne &= KO2;
11449     return;
11450   }
11451   return DAG.computeKnownBits(Op, KnownZero, KnownOne);
11452 }
11453 
11454 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
11455   // If we have a CMOV, OR and AND combination such as:
11456   //   if (x & CN)
11457   //     y |= CM;
11458   //
11459   // And:
11460   //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y.
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST & OR sequence if CM is two bits, and for Thumb will be no worse if CM
  // is three bits (due to the extra IT instruction).
11467 
11468   SDValue Op0 = CMOV->getOperand(0);
11469   SDValue Op1 = CMOV->getOperand(1);
11470   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
11471   auto CC = CCNode->getAPIntValue().getLimitedValue();
11472   SDValue CmpZ = CMOV->getOperand(4);
11473 
11474   // The compare must be against zero.
11475   if (!isNullConstant(CmpZ->getOperand(1)))
11476     return SDValue();
11477 
11478   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
11479   SDValue And = CmpZ->getOperand(0);
11480   if (And->getOpcode() != ISD::AND)
11481     return SDValue();
11482   ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
11483   if (!AndC || !AndC->getAPIntValue().isPowerOf2())
11484     return SDValue();
11485   SDValue X = And->getOperand(0);
11486 
11487   if (CC == ARMCC::EQ) {
11488     // We're performing an "equal to zero" compare. Swap the operands so we
11489     // canonicalize on a "not equal to zero" compare.
11490     std::swap(Op0, Op1);
11491   } else {
11492     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
11493   }
11494 
11495   if (Op1->getOpcode() != ISD::OR)
11496     return SDValue();
11497 
11498   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
11499   if (!OrC)
11500     return SDValue();
11501   SDValue Y = Op1->getOperand(0);
11502 
11503   if (Op0 != Y)
11504     return SDValue();
11505 
11506   // Now, is it profitable to continue?
11507   APInt OrCI = OrC->getAPIntValue();
11508   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
11509   if (OrCI.countPopulation() > Heuristic)
11510     return SDValue();
11511 
11512   // Lastly, can we determine that the bits defined by OrCI
11513   // are zero in Y?
11514   APInt KnownZero, KnownOne;
11515   computeKnownBits(DAG, Y, KnownZero, KnownOne);
11516   if ((OrCI & KnownZero) != OrCI)
11517     return SDValue();
11518 
11519   // OK, we can do the combine.
11520   SDValue V = Y;
11521   SDLoc dl(X);
11522   EVT VT = X.getValueType();
11523   unsigned BitInX = AndC->getAPIntValue().logBase2();
11524 
11525   if (BitInX != 0) {
11526     // We must shift X first.
11527     X = DAG.getNode(ISD::SRL, dl, VT, X,
11528                     DAG.getConstant(BitInX, dl, VT));
11529   }
11530 
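  // Emit one BFI per set bit of OrCI, inserting the (shifted) bit of X into
  // the corresponding bit position of Y.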
11531   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
11532        BitInY < NumActiveBits; ++BitInY) {
11533     if (OrCI[BitInY] == 0)
11534       continue;
11535     APInt Mask(VT.getSizeInBits(), 0);
11536     Mask.setBit(BitInY);
11537     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
11538                     // Confusingly, the operand is an *inverted* mask.
11539                     DAG.getConstant(~Mask, dl, VT));
11540   }
11541 
11542   return V;
11543 }
11544 
11545 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
11546 SDValue
11547 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
11548   SDValue Cmp = N->getOperand(4);
11549   if (Cmp.getOpcode() != ARMISD::CMPZ)
11550     // Only looking at NE cases.
11551     return SDValue();
11552 
11553   EVT VT = N->getValueType(0);
11554   SDLoc dl(N);
11555   SDValue LHS = Cmp.getOperand(0);
11556   SDValue RHS = Cmp.getOperand(1);
11557   SDValue Chain = N->getOperand(0);
11558   SDValue BB = N->getOperand(1);
11559   SDValue ARMcc = N->getOperand(2);
11560   ARMCC::CondCodes CC =
11561     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11562 
11563   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
11564   // -> (brcond Chain BB CC CPSR Cmp)
11565   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
11566       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
11567       LHS->getOperand(0)->hasOneUse()) {
11568     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
11569     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
11570     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11571     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11572     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
11573         (LHS01C && LHS01C->getZExtValue() == 1) &&
11574         (LHS1C && LHS1C->getZExtValue() == 1) &&
11575         (RHSC && RHSC->getZExtValue() == 0)) {
11576       return DAG.getNode(
11577           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
11578           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
11579     }
11580   }
11581 
11582   return SDValue();
11583 }
11584 
11585 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
11586 SDValue
11587 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
11588   SDValue Cmp = N->getOperand(4);
11589   if (Cmp.getOpcode() != ARMISD::CMPZ)
11590     // Only looking at EQ and NE cases.
11591     return SDValue();
11592 
11593   EVT VT = N->getValueType(0);
11594   SDLoc dl(N);
11595   SDValue LHS = Cmp.getOperand(0);
11596   SDValue RHS = Cmp.getOperand(1);
11597   SDValue FalseVal = N->getOperand(0);
11598   SDValue TrueVal = N->getOperand(1);
11599   SDValue ARMcc = N->getOperand(2);
11600   ARMCC::CondCodes CC =
11601     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11602 
11603   // BFI is only available on V6T2+.
11604   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
11605     SDValue R = PerformCMOVToBFICombine(N, DAG);
11606     if (R)
11607       return R;
11608   }
11609 
11610   // Simplify
11611   //   mov     r1, r0
11612   //   cmp     r1, x
11613   //   mov     r0, y
11614   //   moveq   r0, x
11615   // to
11616   //   cmp     r0, x
11617   //   movne   r0, y
11618   //
11619   //   mov     r1, r0
11620   //   cmp     r1, x
11621   //   mov     r0, x
11622   //   movne   r0, y
11623   // to
11624   //   cmp     r0, x
11625   //   movne   r0, y
  // FIXME: Turn this into a target-neutral optimization?
11627   SDValue Res;
11628   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
11629     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
11630                       N->getOperand(3), Cmp);
11631   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
11632     SDValue ARMcc;
11633     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
11634     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
11635                       N->getOperand(3), NewCmp);
11636   }
11637 
11638   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
11639   // -> (cmov F T CC CPSR Cmp)
11640   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
11641     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
11642     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11643     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11644     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
11645         (LHS1C && LHS1C->getZExtValue() == 1) &&
11646         (RHSC && RHSC->getZExtValue() == 0)) {
11647       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
11648                          LHS->getOperand(2), LHS->getOperand(3),
11649                          LHS->getOperand(4));
11650     }
11651   }
11652 
11653   if (Res.getNode()) {
11654     APInt KnownZero, KnownOne;
11655     DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
    // Capture demanded bits information that would otherwise be lost.
11657     if (KnownZero == 0xfffffffe)
11658       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11659                         DAG.getValueType(MVT::i1));
11660     else if (KnownZero == 0xffffff00)
11661       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11662                         DAG.getValueType(MVT::i8));
11663     else if (KnownZero == 0xffff0000)
11664       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11665                         DAG.getValueType(MVT::i16));
11666   }
11667 
11668   return Res;
11669 }
11670 
11671 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
11672                                              DAGCombinerInfo &DCI) const {
11673   switch (N->getOpcode()) {
11674   default: break;
11675   case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
11676   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
11677   case ISD::SUB:        return PerformSUBCombine(N, DCI);
11678   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
11679   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
11680   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
11681   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
11682   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
11683   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
11684   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
11685   case ISD::STORE:      return PerformSTORECombine(N, DCI);
11686   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
11687   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
11688   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
11689   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
11690   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
11691   case ISD::FP_TO_SINT:
11692   case ISD::FP_TO_UINT:
11693     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
11694   case ISD::FDIV:
11695     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
11696   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
11697   case ISD::SHL:
11698   case ISD::SRA:
11699   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
11700   case ISD::SIGN_EXTEND:
11701   case ISD::ZERO_EXTEND:
11702   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
11703   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
11704   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
11705   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
11706   case ARMISD::VLD1DUP:
11707   case ARMISD::VLD2DUP:
11708   case ARMISD::VLD3DUP:
11709   case ARMISD::VLD4DUP:
11710     return PerformVLDCombine(N, DCI);
11711   case ARMISD::BUILD_VECTOR:
11712     return PerformARMBUILD_VECTORCombine(N, DCI);
11713   case ISD::INTRINSIC_VOID:
11714   case ISD::INTRINSIC_W_CHAIN:
11715     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
11716     case Intrinsic::arm_neon_vld1:
11717     case Intrinsic::arm_neon_vld2:
11718     case Intrinsic::arm_neon_vld3:
11719     case Intrinsic::arm_neon_vld4:
11720     case Intrinsic::arm_neon_vld2lane:
11721     case Intrinsic::arm_neon_vld3lane:
11722     case Intrinsic::arm_neon_vld4lane:
11723     case Intrinsic::arm_neon_vst1:
11724     case Intrinsic::arm_neon_vst2:
11725     case Intrinsic::arm_neon_vst3:
11726     case Intrinsic::arm_neon_vst4:
11727     case Intrinsic::arm_neon_vst2lane:
11728     case Intrinsic::arm_neon_vst3lane:
11729     case Intrinsic::arm_neon_vst4lane:
11730       return PerformVLDCombine(N, DCI);
11731     default: break;
11732     }
11733     break;
11734   }
11735   return SDValue();
11736 }
11737 
11738 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
11739                                                           EVT VT) const {
11740   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
11741 }
11742 
11743 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
11744                                                        unsigned,
11745                                                        unsigned,
11746                                                        bool *Fast) const {
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
11748   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
11749 
11750   switch (VT.getSimpleVT().SimpleTy) {
11751   default:
11752     return false;
11753   case MVT::i8:
11754   case MVT::i16:
11755   case MVT::i32: {
    // Unaligned accesses can use (for example) LDRB, LDRH, LDR.
11757     if (AllowsUnaligned) {
11758       if (Fast)
11759         *Fast = Subtarget->hasV7Ops();
11760       return true;
11761     }
11762     return false;
11763   }
11764   case MVT::f64:
11765   case MVT::v2f64: {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
11769     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
11770       if (Fast)
11771         *Fast = true;
11772       return true;
11773     }
11774     return false;
11775   }
11776   }
11777 }
11778 
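/// memOpAlign - Return true when both the source and destination alignments
/// are either unknown (zero) or a multiple of AlignCheck.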
11779 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
11780                        unsigned AlignCheck) {
11781   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
11782           (DstAlign == 0 || DstAlign % AlignCheck == 0));
11783 }
11784 
11785 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
11786                                            unsigned DstAlign, unsigned SrcAlign,
11787                                            bool IsMemset, bool ZeroMemset,
11788                                            bool MemcpyStrSrc,
11789                                            MachineFunction &MF) const {
11790   const Function *F = MF.getFunction();
11791 
11792   // See if we can use NEON instructions for this...
11793   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
11794       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
11795     bool Fast;
11796     if (Size >= 16 &&
11797         (memOpAlign(SrcAlign, DstAlign, 16) ||
11798          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
11799       return MVT::v2f64;
11800     } else if (Size >= 8 &&
11801                (memOpAlign(SrcAlign, DstAlign, 8) ||
11802                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
11803                  Fast))) {
11804       return MVT::f64;
11805     }
11806   }
11807 
  // Lower to i32/i16 if the size permits.
11809   if (Size >= 4)
11810     return MVT::i32;
11811   else if (Size >= 2)
11812     return MVT::i16;
11813 
11814   // Let the target-independent logic figure it out.
11815   return MVT::Other;
11816 }
11817 
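/// isZExtFree - Zero-extending the result of an i1/i8/i16 load is free on ARM
/// because LDRB / LDRH already zero-extend the loaded value to 32 bits.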
11818 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11819   if (Val.getOpcode() != ISD::LOAD)
11820     return false;
11821 
11822   EVT VT1 = Val.getValueType();
11823   if (!VT1.isSimple() || !VT1.isInteger() ||
11824       !VT2.isSimple() || !VT2.isInteger())
11825     return false;
11826 
11827   switch (VT1.getSimpleVT().SimpleTy) {
11828   default: break;
11829   case MVT::i1:
11830   case MVT::i8:
11831   case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32 bits.
11833     return true;
11834   }
11835 
11836   return false;
11837 }
11838 
11839 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
11840   EVT VT = ExtVal.getValueType();
11841 
11842   if (!isTypeLegal(VT))
11843     return false;
11844 
11845   // Don't create a loadext if we can fold the extension into a wide/long
11846   // instruction.
11847   // If there's more than one user instruction, the loadext is desirable no
11848   // matter what.  There can be two uses by the same instruction.
11849   if (ExtVal->use_empty() ||
11850       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
11851     return true;
11852 
11853   SDNode *U = *ExtVal->use_begin();
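  // A single user with one of these opcodes can absorb the extension (a
  // wide/long form exists), so don't create the extending load.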
11854   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
11855        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
11856     return false;
11857 
11858   return true;
11859 }
11860 
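/// allowTruncateForTailCall - Truncating an integer return value is a no-op at
/// the register level on ARM, so it does not prevent tail-call optimization.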
11861 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
11862   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11863     return false;
11864 
11865   if (!isTypeLegal(EVT::getEVT(Ty1)))
11866     return false;
11867 
11868   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
11869 
11870   // Assuming the caller doesn't have a zeroext or signext return parameter,
11871   // truncation all the way down to i1 is valid.
11872   return true;
11873 }
11874 
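/// getScalingFactorCost - Return the cost of the scaling factor used in the
/// addressing mode AM for a load / store of type Ty: 0 if the scale is free,
/// 1 for a negative scale on cores where positive offsets execute faster, and
/// -1 if the addressing mode is not legal.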
11875 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
11876                                                 const AddrMode &AM, Type *Ty,
11877                                                 unsigned AS) const {
11878   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
11879     if (Subtarget->hasFPAO())
11880       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
11881     return 0;
11882   }
11883   return -1;
11884 }
11885 
11886 
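/// isLegalT1AddressImmediate - Thumb-1 load / store offsets are unsigned 5-bit
/// immediates scaled by the access size (1, 2 or 4 bytes).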
11887 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
11888   if (V < 0)
11889     return false;
11890 
11891   unsigned Scale = 1;
11892   switch (VT.getSimpleVT().SimpleTy) {
11893   default: return false;
11894   case MVT::i1:
11895   case MVT::i8:
11896     // Scale == 1;
11897     break;
11898   case MVT::i16:
11899     // Scale == 2;
11900     Scale = 2;
11901     break;
11902   case MVT::i32:
11903     // Scale == 4;
11904     Scale = 4;
11905     break;
11906   }
11907 
11908   if ((V & (Scale - 1)) != 0)
11909     return false;
11910   V /= Scale;
11911   return V == (V & ((1LL << 5) - 1));
11912 }
11913 
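/// isLegalT2AddressImmediate - Thumb-2 accepts +imm12 or -imm8 offsets for
/// integer accesses, and 8-bit word-scaled offsets for VFP accesses.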
11914 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
11915                                       const ARMSubtarget *Subtarget) {
11916   bool isNeg = false;
11917   if (V < 0) {
11918     isNeg = true;
11919     V = - V;
11920   }
11921 
11922   switch (VT.getSimpleVT().SimpleTy) {
11923   default: return false;
11924   case MVT::i1:
11925   case MVT::i8:
11926   case MVT::i16:
11927   case MVT::i32:
11928     // + imm12 or - imm8
11929     if (isNeg)
11930       return V == (V & ((1LL << 8) - 1));
11931     return V == (V & ((1LL << 12) - 1));
11932   case MVT::f32:
11933   case MVT::f64:
11934     // Same as ARM mode. FIXME: NEON?
11935     if (!Subtarget->hasVFP2())
11936       return false;
11937     if ((V & 3) != 0)
11938       return false;
11939     V >>= 2;
11940     return V == (V & ((1LL << 8) - 1));
11941   }
11942 }
11943 
11944 /// isLegalAddressImmediate - Return true if the integer value can be used
11945 /// as the offset of the target addressing mode for load / store of the
11946 /// given type.
11947 static bool isLegalAddressImmediate(int64_t V, EVT VT,
11948                                     const ARMSubtarget *Subtarget) {
11949   if (V == 0)
11950     return true;
11951 
11952   if (!VT.isSimple())
11953     return false;
11954 
11955   if (Subtarget->isThumb1Only())
11956     return isLegalT1AddressImmediate(V, VT);
11957   else if (Subtarget->isThumb2())
11958     return isLegalT2AddressImmediate(V, VT, Subtarget);
11959 
11960   // ARM mode.
11961   if (V < 0)
11962     V = - V;
11963   switch (VT.getSimpleVT().SimpleTy) {
11964   default: return false;
11965   case MVT::i1:
11966   case MVT::i8:
11967   case MVT::i32:
11968     // +- imm12
11969     return V == (V & ((1LL << 12) - 1));
11970   case MVT::i16:
11971     // +- imm8
11972     return V == (V & ((1LL << 8) - 1));
11973   case MVT::f32:
11974   case MVT::f64:
11975     if (!Subtarget->hasVFP2()) // FIXME: NEON?
11976       return false;
11977     if ((V & 3) != 0)
11978       return false;
11979     V >>= 2;
11980     return V == (V & ((1LL << 8) - 1));
11981   }
11982 }
11983 
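/// isLegalT2ScaledAddressingMode - Return true if the scaled addressing mode
/// (base register plus scaled index register) represented by AM is legal in
/// Thumb-2 for a load / store of the given type.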
11984 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
11985                                                       EVT VT) const {
11986   int Scale = AM.Scale;
11987   if (Scale < 0)
11988     return false;
11989 
11990   switch (VT.getSimpleVT().SimpleTy) {
11991   default: return false;
11992   case MVT::i1:
11993   case MVT::i8:
11994   case MVT::i16:
11995   case MVT::i32:
11996     if (Scale == 1)
11997       return true;
11998     // r + r << imm
11999     Scale = Scale & ~1;
12000     return Scale == 2 || Scale == 4 || Scale == 8;
12001   case MVT::i64:
12002     // r + r
12003     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
12004       return true;
12005     return false;
12006   case MVT::isVoid:
12007     // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because ARM allows folding a scale into many arithmetic
12009     // operations.  This should be made more precise and revisited later.
12010 
12011     // Allow r << imm, but the imm has to be a multiple of two.
12012     if (Scale & 1) return false;
12013     return isPowerOf2_32(Scale);
12014   }
12015 }
12016 
12017 /// isLegalAddressingMode - Return true if the addressing mode represented
12018 /// by AM is legal for this target, for a load/store of the specified type.
12019 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
12020                                               const AddrMode &AM, Type *Ty,
12021                                               unsigned AS) const {
12022   EVT VT = getValueType(DL, Ty, true);
12023   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
12024     return false;
12025 
12026   // Can never fold addr of global into load/store.
12027   if (AM.BaseGV)
12028     return false;
12029 
12030   switch (AM.Scale) {
12031   case 0:  // no scale reg, must be "r+i" or "r", or "i".
12032     break;
12033   case 1:
12034     if (Subtarget->isThumb1Only())
12035       return false;
12036     LLVM_FALLTHROUGH;
12037   default:
12038     // ARM doesn't support any R+R*scale+imm addr modes.
12039     if (AM.BaseOffs)
12040       return false;
12041 
12042     if (!VT.isSimple())
12043       return false;
12044 
12045     if (Subtarget->isThumb2())
12046       return isLegalT2ScaledAddressingMode(AM, VT);
12047 
12048     int Scale = AM.Scale;
12049     switch (VT.getSimpleVT().SimpleTy) {
12050     default: return false;
12051     case MVT::i1:
12052     case MVT::i8:
12053     case MVT::i32:
12054       if (Scale < 0) Scale = -Scale;
12055       if (Scale == 1)
12056         return true;
12057       // r + r << imm
12058       return isPowerOf2_32(Scale & ~1);
12059     case MVT::i16:
12060     case MVT::i64:
12061       // r + r
12062       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
12063         return true;
12064       return false;
12065 
12066     case MVT::isVoid:
12067       // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because ARM allows folding a scale into many arithmetic
12069       // operations.  This should be made more precise and revisited later.
12070 
12071       // Allow r << imm, but the imm has to be a multiple of two.
12072       if (Scale & 1) return false;
12073       return isPowerOf2_32(Scale);
12074     }
12075   }
12076   return true;
12077 }
12078 
/// isLegalICmpImmediate - Return true if the specified immediate is a legal
/// icmp immediate, that is, the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
12083 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
12084   // Thumb2 and ARM modes can use cmn for negative immediates.
12085   if (!Subtarget->isThumb())
12086     return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
12087   if (Subtarget->isThumb2())
12088     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
  // Thumb1 doesn't have cmn, and supports only 8-bit immediates.
12090   return Imm >= 0 && Imm <= 255;
12091 }
12092 
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is, the target has add or sub instructions which
/// can add the immediate to a register without having to materialize the
/// immediate into a register.
12097 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
12098   // Same encoding for add/sub, just flip the sign.
12099   int64_t AbsImm = std::abs(Imm);
12100   if (!Subtarget->isThumb())
12101     return ARM_AM::getSOImmVal(AbsImm) != -1;
12102   if (Subtarget->isThumb2())
12103     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediates.
12105   return AbsImm >= 0 && AbsImm <= 255;
12106 }
12107 
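/// getARMIndexedAddressParts - Try to split Ptr (an ADD or SUB) into the Base
/// and Offset of an ARM-mode pre/post-indexed access; isInc reports whether
/// the offset is added to or subtracted from the base.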
12108 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
12109                                       bool isSEXTLoad, SDValue &Base,
12110                                       SDValue &Offset, bool &isInc,
12111                                       SelectionDAG &DAG) {
12112   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
12113     return false;
12114 
12115   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
12116     // AddressingMode 3
12117     Base = Ptr->getOperand(0);
12118     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
12119       int RHSC = (int)RHS->getZExtValue();
12120       if (RHSC < 0 && RHSC > -256) {
12121         assert(Ptr->getOpcode() == ISD::ADD);
12122         isInc = false;
12123         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
12124         return true;
12125       }
12126     }
12127     isInc = (Ptr->getOpcode() == ISD::ADD);
12128     Offset = Ptr->getOperand(1);
12129     return true;
12130   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
12131     // AddressingMode 2
12132     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
12133       int RHSC = (int)RHS->getZExtValue();
12134       if (RHSC < 0 && RHSC > -0x1000) {
12135         assert(Ptr->getOpcode() == ISD::ADD);
12136         isInc = false;
12137         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
12138         Base = Ptr->getOperand(0);
12139         return true;
12140       }
12141     }
12142 
12143     if (Ptr->getOpcode() == ISD::ADD) {
12144       isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
12146         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
12147       if (ShOpcVal != ARM_AM::no_shift) {
12148         Base = Ptr->getOperand(1);
12149         Offset = Ptr->getOperand(0);
12150       } else {
12151         Base = Ptr->getOperand(0);
12152         Offset = Ptr->getOperand(1);
12153       }
12154       return true;
12155     }
12156 
12157     isInc = (Ptr->getOpcode() == ISD::ADD);
12158     Base = Ptr->getOperand(0);
12159     Offset = Ptr->getOperand(1);
12160     return true;
12161   }
12162 
12163   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
12164   return false;
12165 }
12166 
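/// getT2IndexedAddressParts - Thumb-2 counterpart of
/// getARMIndexedAddressParts; the offset must be a non-zero constant that fits
/// in 8 bits.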
12167 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
12168                                      bool isSEXTLoad, SDValue &Base,
12169                                      SDValue &Offset, bool &isInc,
12170                                      SelectionDAG &DAG) {
12171   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
12172     return false;
12173 
12174   Base = Ptr->getOperand(0);
12175   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
12176     int RHSC = (int)RHS->getZExtValue();
12177     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
12178       assert(Ptr->getOpcode() == ISD::ADD);
12179       isInc = false;
12180       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
12181       return true;
12182     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
12183       isInc = Ptr->getOpcode() == ISD::ADD;
12184       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
12185       return true;
12186     }
12187   }
12188 
12189   return false;
12190 }
12191 
/// getPreIndexedAddressParts - Returns true, and sets the base pointer, offset
/// pointer, and addressing mode by reference, if the node's address can be
/// legally represented as a pre-indexed load / store address.
12195 bool
12196 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
12197                                              SDValue &Offset,
12198                                              ISD::MemIndexedMode &AM,
12199                                              SelectionDAG &DAG) const {
12200   if (Subtarget->isThumb1Only())
12201     return false;
12202 
12203   EVT VT;
12204   SDValue Ptr;
12205   bool isSEXTLoad = false;
12206   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
12207     Ptr = LD->getBasePtr();
12208     VT  = LD->getMemoryVT();
12209     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
12210   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
12211     Ptr = ST->getBasePtr();
12212     VT  = ST->getMemoryVT();
12213   } else
12214     return false;
12215 
12216   bool isInc;
12217   bool isLegal = false;
12218   if (Subtarget->isThumb2())
12219     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
12220                                        Offset, isInc, DAG);
12221   else
12222     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
12223                                         Offset, isInc, DAG);
12224   if (!isLegal)
12225     return false;
12226 
12227   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
12228   return true;
12229 }
12230 
/// getPostIndexedAddressParts - Returns true, and sets the base pointer,
/// offset pointer, and addressing mode by reference, if this node can be
/// combined with a load / store to form a post-indexed load / store.
12234 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
12235                                                    SDValue &Base,
12236                                                    SDValue &Offset,
12237                                                    ISD::MemIndexedMode &AM,
12238                                                    SelectionDAG &DAG) const {
12239   EVT VT;
12240   SDValue Ptr;
12241   bool isSEXTLoad = false, isNonExt;
12242   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
12243     VT  = LD->getMemoryVT();
12244     Ptr = LD->getBasePtr();
12245     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
12246     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
12247   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
12248     VT  = ST->getMemoryVT();
12249     Ptr = ST->getBasePtr();
12250     isNonExt = !ST->isTruncatingStore();
12251   } else
12252     return false;
12253 
12254   if (Subtarget->isThumb1Only()) {
12255     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
12256     // must be non-extending/truncating, i32, with an offset of 4.
12257     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
12258     if (Op->getOpcode() != ISD::ADD || !isNonExt)
12259       return false;
12260     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12261     if (!RHS || RHS->getZExtValue() != 4)
12262       return false;
12263 
12264     Offset = Op->getOperand(1);
12265     Base = Op->getOperand(0);
12266     AM = ISD::POST_INC;
12267     return true;
12268   }
12269 
12270   bool isInc;
12271   bool isLegal = false;
12272   if (Subtarget->isThumb2())
12273     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12274                                        isInc, DAG);
12275   else
12276     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12277                                         isInc, DAG);
12278   if (!isLegal)
12279     return false;
12280 
12281   if (Ptr != Base) {
12282     // Swap base ptr and offset to catch more post-index load / store when
12283     // it's legal. In Thumb2 mode, offset must be an immediate.
12284     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
12285         !Subtarget->isThumb2())
12286       std::swap(Base, Offset);
12287 
12288     // Post-indexed load / store update the base pointer.
12289     if (Ptr != Base)
12290       return false;
12291   }
12292 
12293   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
12294   return true;
12295 }
12296 
12297 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
12298                                                       APInt &KnownZero,
12299                                                       APInt &KnownOne,
12300                                                       const SelectionDAG &DAG,
12301                                                       unsigned Depth) const {
12302   unsigned BitWidth = KnownOne.getBitWidth();
12303   KnownZero = KnownOne = APInt(BitWidth, 0);
12304   switch (Op.getOpcode()) {
12305   default: break;
12306   case ARMISD::ADDC:
12307   case ARMISD::ADDE:
12308   case ARMISD::SUBC:
12309   case ARMISD::SUBE:
12310     // These nodes' second result is a boolean
12311     if (Op.getResNo() == 0)
12312       break;
12313     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
12314     break;
12315   case ARMISD::CMOV: {
12316     // Bits are known zero/one if known on the LHS and RHS.
12317     DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
12318     if (KnownZero == 0 && KnownOne == 0) return;
12319 
12320     APInt KnownZeroRHS, KnownOneRHS;
12321     DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
12322     KnownZero &= KnownZeroRHS;
12323     KnownOne  &= KnownOneRHS;
12324     return;
12325   }
12326   case ISD::INTRINSIC_W_CHAIN: {
12327     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
12328     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
12329     switch (IntID) {
12330     default: return;
12331     case Intrinsic::arm_ldaex:
12332     case Intrinsic::arm_ldrex: {
12333       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
12334       unsigned MemBits = VT.getScalarSizeInBits();
12335       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
12336       return;
12337     }
12338     }
12339   }
12340   }
12341 }
12342 
12343 //===----------------------------------------------------------------------===//
12344 //                           ARM Inline Assembly Support
12345 //===----------------------------------------------------------------------===//
12346 
12347 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
12348   // Looking for "rev" which is V6+.
12349   if (!Subtarget->hasV6Ops())
12350     return false;
12351 
12352   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
12353   std::string AsmStr = IA->getAsmString();
12354   SmallVector<StringRef, 4> AsmPieces;
12355   SplitString(AsmStr, AsmPieces, ";\n");
12356 
12357   switch (AsmPieces.size()) {
12358   default: return false;
12359   case 1:
12360     AsmStr = AsmPieces[0];
12361     AsmPieces.clear();
12362     SplitString(AsmStr, AsmPieces, " \t,");
12363 
12364     // rev $0, $1
12365     if (AsmPieces.size() == 3 &&
12366         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
12367         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
12368       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
12369       if (Ty && Ty->getBitWidth() == 32)
12370         return IntrinsicLowering::LowerToByteSwap(CI);
12371     }
12372     break;
12373   }
12374 
12375   return false;
12376 }
12377 
12378 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12379   // At this point, we have to lower this constraint to something else, so we
12380   // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in a register, while the X constraint is much more permissive.
12382   //
12383   // Although we are correct (we are free to emit anything, without
12384   // constraints), we might break use cases that would expect us to be more
12385   // efficient and emit something else.
12386   if (!Subtarget->hasVFP2())
12387     return "r";
12388   if (ConstraintVT.isFloatingPoint())
12389     return "w";
12390   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
12391      (ConstraintVT.getSizeInBits() == 64 ||
12392       ConstraintVT.getSizeInBits() == 128))
12393     return "w";
12394 
12395   return "r";
12396 }
12397 
12398 /// getConstraintType - Given a constraint letter, return the type of
12399 /// constraint it is for this target.
12400 ARMTargetLowering::ConstraintType
12401 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
12402   if (Constraint.size() == 1) {
12403     switch (Constraint[0]) {
12404     default:  break;
12405     case 'l': return C_RegisterClass;
12406     case 'w': return C_RegisterClass;
12407     case 'h': return C_RegisterClass;
12408     case 'x': return C_RegisterClass;
12409     case 't': return C_RegisterClass;
12410     case 'j': return C_Other; // Constant for movw.
      // An address with a single base register. Due to the way we currently
      // handle addresses, it is the same as an 'r' memory constraint.
12413     case 'Q': return C_Memory;
12414     }
12415   } else if (Constraint.size() == 2) {
12416     switch (Constraint[0]) {
12417     default: break;
12418     // All 'U+' constraints are addresses.
12419     case 'U': return C_Memory;
12420     }
12421   }
12422   return TargetLowering::getConstraintType(Constraint);
12423 }
12424 
12425 /// Examine constraint type and operand type and determine a weight value.
12426 /// This object must already have been set up with the operand type
12427 /// and the current alternative constraint selected.
12428 TargetLowering::ConstraintWeight
12429 ARMTargetLowering::getSingleConstraintMatchWeight(
12430     AsmOperandInfo &info, const char *constraint) const {
12431   ConstraintWeight weight = CW_Invalid;
12432   Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
12435   if (!CallOperandVal)
12436     return CW_Default;
12437   Type *type = CallOperandVal->getType();
12438   // Look at the constraint type.
12439   switch (*constraint) {
12440   default:
12441     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12442     break;
12443   case 'l':
12444     if (type->isIntegerTy()) {
12445       if (Subtarget->isThumb())
12446         weight = CW_SpecificReg;
12447       else
12448         weight = CW_Register;
12449     }
12450     break;
12451   case 'w':
12452     if (type->isFloatingPointTy())
12453       weight = CW_Register;
12454     break;
12455   }
12456   return weight;
12457 }
12458 
12459 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
12460 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
12461     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12462   if (Constraint.size() == 1) {
12463     // GCC ARM Constraint Letters
12464     switch (Constraint[0]) {
12465     case 'l': // Low regs or general regs.
12466       if (Subtarget->isThumb())
12467         return RCPair(0U, &ARM::tGPRRegClass);
12468       return RCPair(0U, &ARM::GPRRegClass);
12469     case 'h': // High regs or no regs.
12470       if (Subtarget->isThumb())
12471         return RCPair(0U, &ARM::hGPRRegClass);
12472       break;
12473     case 'r':
12474       if (Subtarget->isThumb1Only())
12475         return RCPair(0U, &ARM::tGPRRegClass);
12476       return RCPair(0U, &ARM::GPRRegClass);
12477     case 'w':
12478       if (VT == MVT::Other)
12479         break;
12480       if (VT == MVT::f32)
12481         return RCPair(0U, &ARM::SPRRegClass);
12482       if (VT.getSizeInBits() == 64)
12483         return RCPair(0U, &ARM::DPRRegClass);
12484       if (VT.getSizeInBits() == 128)
12485         return RCPair(0U, &ARM::QPRRegClass);
12486       break;
12487     case 'x':
12488       if (VT == MVT::Other)
12489         break;
12490       if (VT == MVT::f32)
12491         return RCPair(0U, &ARM::SPR_8RegClass);
12492       if (VT.getSizeInBits() == 64)
12493         return RCPair(0U, &ARM::DPR_8RegClass);
12494       if (VT.getSizeInBits() == 128)
12495         return RCPair(0U, &ARM::QPR_8RegClass);
12496       break;
12497     case 't':
12498       if (VT == MVT::f32)
12499         return RCPair(0U, &ARM::SPRRegClass);
12500       break;
12501     }
12502   }
12503   if (StringRef("{cc}").equals_lower(Constraint))
12504     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
12505 
12506   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12507 }
12508 
12509 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12510 /// vector.  If it is invalid, don't add anything to Ops.
12511 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
12512                                                      std::string &Constraint,
12513                                                      std::vector<SDValue>&Ops,
12514                                                      SelectionDAG &DAG) const {
12515   SDValue Result;
12516 
12517   // Currently only support length 1 constraints.
12518   if (Constraint.length() != 1) return;
12519 
12520   char ConstraintLetter = Constraint[0];
12521   switch (ConstraintLetter) {
12522   default: break;
12523   case 'j':
12524   case 'I': case 'J': case 'K': case 'L':
12525   case 'M': case 'N': case 'O':
12526     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12527     if (!C)
12528       return;
12529 
12530     int64_t CVal64 = C->getSExtValue();
12531     int CVal = (int) CVal64;
12532     // None of these constraints allow values larger than 32 bits.  Check
12533     // that the value fits in an int.
12534     if (CVal != CVal64)
12535       return;
12536 
12537     switch (ConstraintLetter) {
12538       case 'j':
12539         // Constant suitable for movw, must be between 0 and
12540         // 65535.
12541         if (Subtarget->hasV6T2Ops())
12542           if (CVal >= 0 && CVal <= 65535)
12543             break;
12544         return;
12545       case 'I':
12546         if (Subtarget->isThumb1Only()) {
12547           // This must be a constant between 0 and 255, for ADD
12548           // immediates.
12549           if (CVal >= 0 && CVal <= 255)
12550             break;
12551         } else if (Subtarget->isThumb2()) {
12552           // A constant that can be used as an immediate value in a
12553           // data-processing instruction.
12554           if (ARM_AM::getT2SOImmVal(CVal) != -1)
12555             break;
12556         } else {
12557           // A constant that can be used as an immediate value in a
12558           // data-processing instruction.
12559           if (ARM_AM::getSOImmVal(CVal) != -1)
12560             break;
12561         }
12562         return;
12563 
12564       case 'J':
12565         if (Subtarget->isThumb1Only()) {
12566           // This must be a constant between -255 and -1, for negated ADD
12567           // immediates. This can be used in GCC with an "n" modifier that
12568           // prints the negated value, for use with SUB instructions. It is
12569           // not useful otherwise but is implemented for compatibility.
12570           if (CVal >= -255 && CVal <= -1)
12571             break;
12572         } else {
12573           // This must be a constant between -4095 and 4095. It is not clear
12574           // what this constraint is intended for. Implemented for
12575           // compatibility with GCC.
12576           if (CVal >= -4095 && CVal <= 4095)
12577             break;
12578         }
12579         return;
12580 
12581       case 'K':
12582         if (Subtarget->isThumb1Only()) {
12583           // A 32-bit value where only one byte has a nonzero value. Exclude
12584           // zero to match GCC. This constraint is used by GCC internally for
12585           // constants that can be loaded with a move/shift combination.
12586           // It is not useful otherwise but is implemented for compatibility.
12587           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
12588             break;
12589         } else if (Subtarget->isThumb2()) {
12590           // A constant whose bitwise inverse can be used as an immediate
12591           // value in a data-processing instruction. This can be used in GCC
12592           // with a "B" modifier that prints the inverted value, for use with
12593           // BIC and MVN instructions. It is not useful otherwise but is
12594           // implemented for compatibility.
12595           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
12596             break;
12597         } else {
12598           // A constant whose bitwise inverse can be used as an immediate
12599           // value in a data-processing instruction. This can be used in GCC
12600           // with a "B" modifier that prints the inverted value, for use with
12601           // BIC and MVN instructions. It is not useful otherwise but is
12602           // implemented for compatibility.
12603           if (ARM_AM::getSOImmVal(~CVal) != -1)
12604             break;
12605         }
12606         return;
12607 
12608       case 'L':
12609         if (Subtarget->isThumb1Only()) {
12610           // This must be a constant between -7 and 7,
12611           // for 3-operand ADD/SUB immediate instructions.
12612           if (CVal >= -7 && CVal < 7)
12613             break;
12614         } else if (Subtarget->isThumb2()) {
12615           // A constant whose negation can be used as an immediate value in a
12616           // data-processing instruction. This can be used in GCC with an "n"
12617           // modifier that prints the negated value, for use with SUB
12618           // instructions. It is not useful otherwise but is implemented for
12619           // compatibility.
12620           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
12621             break;
12622         } else {
12623           // A constant whose negation can be used as an immediate value in a
12624           // data-processing instruction. This can be used in GCC with an "n"
12625           // modifier that prints the negated value, for use with SUB
12626           // instructions. It is not useful otherwise but is implemented for
12627           // compatibility.
12628           if (ARM_AM::getSOImmVal(-CVal) != -1)
12629             break;
12630         }
12631         return;
12632 
12633       case 'M':
12634         if (Subtarget->isThumb1Only()) {
12635           // This must be a multiple of 4 between 0 and 1020, for
12636           // ADD sp + immediate.
12637           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
12638             break;
12639         } else {
12640           // A power of two or a constant between 0 and 32.  This is used in
12641           // GCC for the shift amount on shifted register operands, but it is
12642           // useful in general for any shift amounts.
12643           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
12644             break;
12645         }
12646         return;
12647 
12648       case 'N':
12649         if (Subtarget->isThumb()) {  // FIXME thumb2
12650           // This must be a constant between 0 and 31, for shift amounts.
12651           if (CVal >= 0 && CVal <= 31)
12652             break;
12653         }
12654         return;
12655 
12656       case 'O':
12657         if (Subtarget->isThumb()) {  // FIXME thumb2
12658           // This must be a multiple of 4 between -508 and 508, for
12659           // ADD/SUB sp = sp + immediate.
12660           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
12661             break;
12662         }
12663         return;
12664     }
12665     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
12666     break;
12667   }
12668 
12669   if (Result.getNode()) {
12670     Ops.push_back(Result);
12671     return;
12672   }
12673   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12674 }
12675 
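/// getDivRemLibcall - Select the RTLIB divmod entry point for the node's
/// operand width; the signed variant is used for SDIVREM / SREM.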
12676 static RTLIB::Libcall getDivRemLibcall(
12677     const SDNode *N, MVT::SimpleValueType SVT) {
12678   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12679           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12680          "Unhandled Opcode in getDivRemLibcall");
12681   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12682                   N->getOpcode() == ISD::SREM;
12683   RTLIB::Libcall LC;
12684   switch (SVT) {
12685   default: llvm_unreachable("Unexpected request for libcall!");
12686   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
12687   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
12688   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
12689   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
12690   }
12691   return LC;
12692 }
12693 
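/// getDivRemArgList - Build the libcall argument list from the node's
/// operands.  On Windows the two arguments are swapped to match the operand
/// order expected by the Windows division helpers.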
12694 static TargetLowering::ArgListTy getDivRemArgList(
12695     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
12696   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12697           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12698          "Unhandled Opcode in getDivRemArgList");
12699   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12700                   N->getOpcode() == ISD::SREM;
12701   TargetLowering::ArgListTy Args;
12702   TargetLowering::ArgListEntry Entry;
12703   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
12704     EVT ArgVT = N->getOperand(i).getValueType();
12705     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
12706     Entry.Node = N->getOperand(i);
12707     Entry.Ty = ArgTy;
12708     Entry.isSExt = isSigned;
12709     Entry.isZExt = !isSigned;
12710     Args.push_back(Entry);
12711   }
12712   if (Subtarget->isTargetWindows() && Args.size() >= 2)
12713     std::swap(Args[0], Args[1]);
12714   return Args;
12715 }
12716 
12717 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
12718   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
12719           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
12720           Subtarget->isTargetWindows()) &&
12721          "Register-based DivRem lowering only");
12722   unsigned Opcode = Op->getOpcode();
12723   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
12724          "Invalid opcode for Div/Rem lowering");
12725   bool isSigned = (Opcode == ISD::SDIVREM);
12726   EVT VT = Op->getValueType(0);
12727   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
12728   SDLoc dl(Op);
12729 
12730   // If the target has hardware divide, use divide + multiply + subtract:
12731   //     div = a / b
12732   //     rem = a - b * div
12733   //     return {div, rem}
12734   // This should be lowered into UDIV/SDIV + MLS later on.
12735   if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
12736       Op->getSimpleValueType(0) == MVT::i32) {
12737     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
12738     const SDValue Dividend = Op->getOperand(0);
12739     const SDValue Divisor = Op->getOperand(1);
12740     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
12741     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
12742     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
12743 
12744     SDValue Values[2] = {Div, Rem};
12745     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
12746   }
12747 
12748   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
12749                                        VT.getSimpleVT().SimpleTy);
12750   SDValue InChain = DAG.getEntryNode();
12751 
12752   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
12753                                                     DAG.getContext(),
12754                                                     Subtarget);
12755 
12756   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12757                                          getPointerTy(DAG.getDataLayout()));
12758 
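  // The divmod libcall returns {quotient, remainder} as a two-element struct.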
12759   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
12760 
12761   if (Subtarget->isTargetWindows())
12762     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
12763 
12764   TargetLowering::CallLoweringInfo CLI(DAG);
12765   CLI.setDebugLoc(dl).setChain(InChain)
12766     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
12767     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
12768 
12769   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
12770   return CallInfo.first;
12771 }
12772 
// Lowers REM using divmod helpers; see RTABI section 4.2/4.3.
12775 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
12776   // Build return types (div and rem)
12777   std::vector<Type*> RetTyParams;
12778   Type *RetTyElement;
12779 
12780   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
12781   default: llvm_unreachable("Unexpected request for libcall!");
12782   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
12783   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
12784   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
12785   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
12786   }
12787 
12788   RetTyParams.push_back(RetTyElement);
12789   RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret(RetTyParams);
12791   Type *RetTy = StructType::get(*DAG.getContext(), ret);
12792 
  RTLIB::Libcall LC =
      getDivRemLibcall(N, N->getValueType(0).getSimpleVT().SimpleTy);
12795   SDValue InChain = DAG.getEntryNode();
12796   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
12797                                                     Subtarget);
12798   bool isSigned = N->getOpcode() == ISD::SREM;
12799   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12800                                          getPointerTy(DAG.getDataLayout()));
12801 
12802   if (Subtarget->isTargetWindows())
12803     InChain = WinDBZCheckDenominator(DAG, N, InChain);
12804 
12805   // Lower call
12806   CallLoweringInfo CLI(DAG);
12807   CLI.setChain(InChain)
12808      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
12809      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
12810   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
12811 
12812   // Return second (rem) result operand (first contains div)
12813   SDNode *ResNode = CallResult.first.getNode();
12814   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
12815   return ResNode->getOperand(1);
12816 }
12817 
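// Lower dynamic stack allocation on Windows: the requested size is converted
// to a word count and passed in R4 to the ARMISD::WIN__CHKSTK node, and the
// resulting stack pointer is read back from SP.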
SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
12820   assert(Subtarget->isTargetWindows() && "unsupported target platform");
12821   SDLoc DL(Op);
12822 
12823   // Get the inputs.
12824   SDValue Chain = Op.getOperand(0);
12825   SDValue Size  = Op.getOperand(1);
12826 
12827   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
12828                               DAG.getConstant(2, DL, MVT::i32));
12829 
12830   SDValue Flag;
12831   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
12832   Flag = Chain.getValue(1);
12833 
12834   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12835   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
12836 
12837   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
12838   Chain = NewSP.getValue(1);
12839 
12840   SDValue Ops[2] = { NewSP, Chain };
12841   return DAG.getMergeValues(Ops, DL);
12842 }
12843 
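// On FP-only-SP subtargets f64 is not a legal type, so extending to f64 has
// to be done through the corresponding soft-float library call.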
12844 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12845   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
12846          "Unexpected type for custom-lowering FP_EXTEND");
12847 
12848   RTLIB::Libcall LC;
12849   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
12850 
12851   SDValue SrcVal = Op.getOperand(0);
12852   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12853                      SDLoc(Op)).first;
12854 }
12855 
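// Likewise, rounding from an f64 source on FP-only-SP subtargets is lowered
// to a library call.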
12856 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12857   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
12858          Subtarget->isFPOnlySP() &&
12859          "Unexpected type for custom-lowering FP_ROUND");
12860 
12861   RTLIB::Libcall LC;
12862   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
12863 
12864   SDValue SrcVal = Op.getOperand(0);
12865   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12866                      SDLoc(Op)).first;
12867 }
12868 
12869 bool
12870 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
12871   // The ARM target isn't yet aware of offsets.
12872   return false;
12873 }
12874 
12875 bool ARM::isBitFieldInvertedMask(unsigned v) {
12876   if (v == 0xffffffff)
12877     return false;
12878 
  // There can be 1s on either or both "outsides"; all the "inside" bits must
  // be 0s.
12881   return isShiftedMask_32(~v);
12882 }
12883 
12884 /// isFPImmLegal - Returns true if the target can instruction select the
12885 /// specified FP immediate natively. If false, the legalizer will
12886 /// materialize the FP immediate as a load from a constant pool.
12887 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
12888   if (!Subtarget->hasVFP3())
12889     return false;
12890   if (VT == MVT::f32)
12891     return ARM_AM::getFP32Imm(Imm) != -1;
12892   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
12893     return ARM_AM::getFP64Imm(Imm) != -1;
12894   return false;
12895 }
12896 
12897 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
12898 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
12899 /// specified in the intrinsic calls.
12900 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12901                                            const CallInst &I,
12902                                            unsigned Intrinsic) const {
12903   switch (Intrinsic) {
12904   case Intrinsic::arm_neon_vld1:
12905   case Intrinsic::arm_neon_vld2:
12906   case Intrinsic::arm_neon_vld3:
12907   case Intrinsic::arm_neon_vld4:
12908   case Intrinsic::arm_neon_vld2lane:
12909   case Intrinsic::arm_neon_vld3lane:
12910   case Intrinsic::arm_neon_vld4lane: {
12911     Info.opc = ISD::INTRINSIC_W_CHAIN;
12912     // Conservatively set memVT to the entire set of vectors loaded.
12913     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12914     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
12915     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12916     Info.ptrVal = I.getArgOperand(0);
12917     Info.offset = 0;
12918     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12919     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12920     Info.vol = false; // volatile loads with NEON intrinsics not supported
12921     Info.readMem = true;
12922     Info.writeMem = false;
12923     return true;
12924   }
12925   case Intrinsic::arm_neon_vst1:
12926   case Intrinsic::arm_neon_vst2:
12927   case Intrinsic::arm_neon_vst3:
12928   case Intrinsic::arm_neon_vst4:
12929   case Intrinsic::arm_neon_vst2lane:
12930   case Intrinsic::arm_neon_vst3lane:
12931   case Intrinsic::arm_neon_vst4lane: {
12932     Info.opc = ISD::INTRINSIC_VOID;
12933     // Conservatively set memVT to the entire set of vectors stored.
12934     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12935     unsigned NumElts = 0;
12936     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
12937       Type *ArgTy = I.getArgOperand(ArgI)->getType();
12938       if (!ArgTy->isVectorTy())
12939         break;
12940       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
12941     }
12942     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12943     Info.ptrVal = I.getArgOperand(0);
12944     Info.offset = 0;
12945     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12946     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12947     Info.vol = false; // volatile stores with NEON intrinsics not supported
12948     Info.readMem = false;
12949     Info.writeMem = true;
12950     return true;
12951   }
12952   case Intrinsic::arm_ldaex:
12953   case Intrinsic::arm_ldrex: {
12954     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12955     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
12956     Info.opc = ISD::INTRINSIC_W_CHAIN;
12957     Info.memVT = MVT::getVT(PtrTy->getElementType());
12958     Info.ptrVal = I.getArgOperand(0);
12959     Info.offset = 0;
12960     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12961     Info.vol = true;
12962     Info.readMem = true;
12963     Info.writeMem = false;
12964     return true;
12965   }
12966   case Intrinsic::arm_stlex:
12967   case Intrinsic::arm_strex: {
12968     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12969     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
12970     Info.opc = ISD::INTRINSIC_W_CHAIN;
12971     Info.memVT = MVT::getVT(PtrTy->getElementType());
12972     Info.ptrVal = I.getArgOperand(1);
12973     Info.offset = 0;
12974     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12975     Info.vol = true;
12976     Info.readMem = false;
12977     Info.writeMem = true;
12978     return true;
12979   }
12980   case Intrinsic::arm_stlexd:
12981   case Intrinsic::arm_strexd:
12982     Info.opc = ISD::INTRINSIC_W_CHAIN;
12983     Info.memVT = MVT::i64;
12984     Info.ptrVal = I.getArgOperand(2);
12985     Info.offset = 0;
12986     Info.align = 8;
12987     Info.vol = true;
12988     Info.readMem = false;
12989     Info.writeMem = true;
12990     return true;
12991 
12992   case Intrinsic::arm_ldaexd:
12993   case Intrinsic::arm_ldrexd:
12994     Info.opc = ISD::INTRINSIC_W_CHAIN;
12995     Info.memVT = MVT::i64;
12996     Info.ptrVal = I.getArgOperand(0);
12997     Info.offset = 0;
12998     Info.align = 8;
12999     Info.vol = true;
13000     Info.readMem = true;
13001     Info.writeMem = false;
13002     return true;
13003 
13004   default:
13005     break;
13006   }
13007 
13008   return false;
13009 }
13010 
13011 /// \brief Returns true if it is beneficial to convert a load of a constant
13012 /// to just the constant itself.
13013 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
13014                                                           Type *Ty) const {
13015   assert(Ty->isIntegerTy());
13016 
13017   unsigned Bits = Ty->getPrimitiveSizeInBits();
13018   if (Bits == 0 || Bits > 32)
13019     return false;
13020   return true;
13021 }
13022 
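// Extracting a subvector is considered cheap only when it starts at element 0
// or at the result's element count, i.e. when it takes the low or high half
// of a double-width vector.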
13023 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
13024                                                 unsigned Index) const {
13025   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
13026     return false;
13027 
13028   return (Index == 0 || Index == ResVT.getVectorNumElements());
13029 }
13030 
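// Emit a data memory barrier for the given domain, using the CP15 c7, c10, 5
// MCR encoding as a fallback on ARMv6 targets that lack DMB.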
13031 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
13032                                         ARM_MB::MemBOpt Domain) const {
13033   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13034 
13035   // First, if the target has no DMB, see what fallback we can use.
13036   if (!Subtarget->hasDataBarrier()) {
13037     // Some ARMv6 cpus can support data barriers with an mcr instruction.
13038     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
13039     // here.
13040     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
13041       Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
13042       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
13043                         Builder.getInt32(0), Builder.getInt32(7),
13044                         Builder.getInt32(10), Builder.getInt32(5)};
13045       return Builder.CreateCall(MCR, args);
13046     } else {
13047       // Instead of using barriers, atomic accesses on these subtargets use
13048       // libcalls.
13049       llvm_unreachable("makeDMB on a target so old that it has no barriers");
13050     }
13051   } else {
13052     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
13053     // Only a full system barrier exists in the M-class architectures.
13054     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
13055     Constant *CDomain = Builder.getInt32(Domain);
13056     return Builder.CreateCall(DMB, CDomain);
13057   }
13058 }
13059 
13060 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
13061 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
13062                                          AtomicOrdering Ord, bool IsStore,
13063                                          bool IsLoad) const {
13064   switch (Ord) {
13065   case AtomicOrdering::NotAtomic:
13066   case AtomicOrdering::Unordered:
13067     llvm_unreachable("Invalid fence: unordered/non-atomic");
13068   case AtomicOrdering::Monotonic:
13069   case AtomicOrdering::Acquire:
13070     return nullptr; // Nothing to do
13071   case AtomicOrdering::SequentiallyConsistent:
13072     if (!IsStore)
13073       return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
13075   case AtomicOrdering::Release:
13076   case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    return makeDMB(Builder, ARM_MB::ISH);
13082   }
13083   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
13084 }
13085 
13086 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
13087                                           AtomicOrdering Ord, bool IsStore,
13088                                           bool IsLoad) const {
13089   switch (Ord) {
13090   case AtomicOrdering::NotAtomic:
13091   case AtomicOrdering::Unordered:
13092     llvm_unreachable("Invalid fence: unordered/not-atomic");
13093   case AtomicOrdering::Monotonic:
13094   case AtomicOrdering::Release:
13095     return nullptr; // Nothing to do
13096   case AtomicOrdering::Acquire:
13097   case AtomicOrdering::AcquireRelease:
13098   case AtomicOrdering::SequentiallyConsistent:
13099     return makeDMB(Builder, ARM_MB::ISH);
13100   }
13101   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
13102 }
13103 
// Loads and stores less than 64 bits are already atomic; ones above that
13105 // are doomed anyway, so defer to the default libcall and blame the OS when
13106 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
13107 // anything for those.
13108 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
13109   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
13110   return (Size == 64) && !Subtarget->isMClass();
13111 }
13112 
// Loads and stores less than 64 bits are already atomic; ones above that
13114 // are doomed anyway, so defer to the default libcall and blame the OS when
13115 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
13116 // anything for those.
13117 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
13118 // guarantee, see DDI0406C ARM architecture reference manual,
13119 // sections A8.8.72-74 LDRD)
13120 TargetLowering::AtomicExpansionKind
13121 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
13122   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
13123   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
13124                                                   : AtomicExpansionKind::None;
13125 }
13126 
13127 // For the real atomic operations, we have ldrex/strex up to 32 bits,
13128 // and up to 64 bits on the non-M profiles
13129 TargetLowering::AtomicExpansionKind
13130 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
13131   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
13132   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
13133   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
13134              ? AtomicExpansionKind::LLSC
13135              : AtomicExpansionKind::None;
13136 }
13137 
13138 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
13139     AtomicCmpXchgInst *AI) const {
13140   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
13141   // implement cmpxchg without spilling. If the address being exchanged is also
13142   // on the stack and close enough to the spill slot, this can lead to a
13143   // situation where the monitor always gets cleared and the atomic operation
13144   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
13145   bool hasAtomicCmpXchg =
13146       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
13147   return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
13148 }
13149 
13150 bool ARMTargetLowering::shouldInsertFencesForAtomic(
13151     const Instruction *I) const {
13152   return InsertFencesForAtomic;
13153 }
13154 
13155 // This has so far only been implemented for MachO.
13156 bool ARMTargetLowering::useLoadStackGuardNode() const {
13157   return Subtarget->isTargetMachO();
13158 }
13159 
13160 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
13161                                                   unsigned &Cost) const {
13162   // If we do not have NEON, vector types are not natively supported.
13163   if (!Subtarget->hasNEON())
13164     return false;
13165 
  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, it is
  // better to leave those as floats, since we have more freedom in the
  // addressing modes for floating-point accesses.
13170   if (VectorTy->isFPOrFPVectorTy())
13171     return false;
13172 
13173   // If the index is unknown at compile time, this is very expensive to lower
13174   // and it is not possible to combine the store with the extract.
13175   if (!isa<ConstantInt>(Idx))
13176     return false;
13177 
13178   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
13179   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
13180   // We can do a store + vector extract on any vector that fits perfectly in a D
13181   // or Q register.
13182   if (BitWidth == 64 || BitWidth == 128) {
13183     Cost = 0;
13184     return true;
13185   }
13186   return false;
13187 }
13188 
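// CTLZ lowers to CLZ and CTTZ to RBIT+CLZ, so both are cheap to speculate on
// subtargets with the v6T2 instructions.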
13189 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
13190   return Subtarget->hasV6T2Ops();
13191 }
13192 
13193 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
13194   return Subtarget->hasV6T2Ops();
13195 }
13196 
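// Emit a load-linked of Addr: ldrex, or ldaex when an acquire ordering is
// required.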
13197 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
13198                                          AtomicOrdering Ord) const {
13199   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13200   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
13201   bool IsAcquire = isAcquireOrStronger(Ord);
13202 
13203   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
13204   // intrinsic must return {i32, i32} and we have to recombine them into a
13205   // single i64 here.
13206   if (ValTy->getPrimitiveSizeInBits() == 64) {
13207     Intrinsic::ID Int =
13208         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
13209     Function *Ldrex = Intrinsic::getDeclaration(M, Int);
13210 
13211     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
13212     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
13213 
13214     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
13215     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
13216     if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
13218     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
13219     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
13220     return Builder.CreateOr(
13221         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
13222   }
13223 
13224   Type *Tys[] = { Addr->getType() };
13225   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
13226   Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
13227 
13228   return Builder.CreateTruncOrBitCast(
13229       Builder.CreateCall(Ldrex, Addr),
13230       cast<PointerType>(Addr->getType())->getElementType());
13231 }
13232 
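// If a cmpxchg bails out without performing the store, release the exclusive
// monitor with CLREX. Pre-v7 subtargets do not emit anything here.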
13233 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
13234     IRBuilder<> &Builder) const {
13235   if (!Subtarget->hasV7Ops())
13236     return;
13237   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13238   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
13239 }
13240 
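// Emit a store-conditional of Val to Addr: strex, or stlex when a release
// ordering is required.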
13241 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
13242                                                Value *Addr,
13243                                                AtomicOrdering Ord) const {
13244   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13245   bool IsRelease = isReleaseOrStronger(Ord);
13246 
13247   // Since the intrinsics must have legal type, the i64 intrinsics take two
13248   // parameters: "i32, i32". We must marshal Val into the appropriate form
13249   // before the call.
13250   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
13251     Intrinsic::ID Int =
13252         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
13253     Function *Strex = Intrinsic::getDeclaration(M, Int);
13254     Type *Int32Ty = Type::getInt32Ty(M->getContext());
13255 
13256     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
13257     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
13258     if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
13260     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
13261     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
13262   }
13263 
13264   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
13265   Type *Tys[] = { Addr->getType() };
13266   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
13267 
13268   return Builder.CreateCall(
13269       Strex, {Builder.CreateZExtOrBitCast(
13270                   Val, Strex->getFunctionType()->getParamType(0)),
13271               Addr});
13272 }
13273 
13274 /// \brief Lower an interleaved load into a vldN intrinsic.
13275 ///
13276 /// E.g. Lower an interleaved load (Factor = 2):
13277 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
13278 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
13279 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
13280 ///
13281 ///      Into:
13282 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
13283 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
13284 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
13285 bool ARMTargetLowering::lowerInterleavedLoad(
13286     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13287     ArrayRef<unsigned> Indices, unsigned Factor) const {
13288   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13289          "Invalid interleave factor");
13290   assert(!Shuffles.empty() && "Empty shufflevector input");
13291   assert(Shuffles.size() == Indices.size() &&
13292          "Unmatched number of shufflevectors and indices");
13293 
13294   VectorType *VecTy = Shuffles[0]->getType();
13295   Type *EltTy = VecTy->getVectorElementType();
13296 
13297   const DataLayout &DL = LI->getModule()->getDataLayout();
13298   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13299   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13300 
13301   // Skip if we do not have NEON and skip illegal vector types and vector types
13302   // with i64/f64 elements (vldN doesn't support i64/f64 elements).
13303   if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
13304     return false;
13305 
  // A pointer vector cannot be the return type of the ldN intrinsics. Need to
13307   // load integer vectors first and then convert to pointer vectors.
13308   if (EltTy->isPointerTy())
13309     VecTy =
13310         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
13311 
13312   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
13313                                             Intrinsic::arm_neon_vld3,
13314                                             Intrinsic::arm_neon_vld4};
13315 
13316   IRBuilder<> Builder(LI);
13317   SmallVector<Value *, 2> Ops;
13318 
13319   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
13320   Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
13321   Ops.push_back(Builder.getInt32(LI->getAlignment()));
13322 
13323   Type *Tys[] = { VecTy, Int8Ptr };
13324   Function *VldnFunc =
13325       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
13326   CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
13327 
13328   // Replace uses of each shufflevector with the corresponding vector loaded
13329   // by ldN.
13330   for (unsigned i = 0; i < Shuffles.size(); i++) {
13331     ShuffleVectorInst *SV = Shuffles[i];
13332     unsigned Index = Indices[i];
13333 
13334     Value *SubVec = Builder.CreateExtractValue(VldN, Index);
13335 
13336     // Convert the integer vector to pointer vector if the element is pointer.
13337     if (EltTy->isPointerTy())
13338       SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
13339 
13340     SV->replaceAllUsesWith(SubVec);
13341   }
13342 
13343   return true;
13344 }
13345 
13346 /// \brief Lower an interleaved store into a vstN intrinsic.
13347 ///
13348 /// E.g. Lower an interleaved store (Factor = 3):
13349 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13350 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13351 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
13352 ///
13353 ///      Into:
13354 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
13355 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
13356 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
13357 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
13358 ///
13359 /// Note that the new shufflevectors will be removed and we'll only generate one
13360 /// vst3 instruction in CodeGen.
13361 ///
13362 /// Example for a more general valid mask (Factor 3). Lower:
13363 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
13364 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
13365 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
13366 ///
13367 ///      Into:
13368 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
13369 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
13370 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
13371 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
13372 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
13373                                               ShuffleVectorInst *SVI,
13374                                               unsigned Factor) const {
13375   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13376          "Invalid interleave factor");
13377 
13378   VectorType *VecTy = SVI->getType();
13379   assert(VecTy->getVectorNumElements() % Factor == 0 &&
13380          "Invalid interleaved store");
13381 
13382   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
13383   Type *EltTy = VecTy->getVectorElementType();
13384   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
13385 
13386   const DataLayout &DL = SI->getModule()->getDataLayout();
13387   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
13388   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13389 
13390   // Skip if we do not have NEON and skip illegal vector types and vector types
13391   // with i64/f64 elements (vstN doesn't support i64/f64 elements).
13392   if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
13393       EltIs64Bits)
13394     return false;
13395 
13396   Value *Op0 = SVI->getOperand(0);
13397   Value *Op1 = SVI->getOperand(1);
13398   IRBuilder<> Builder(SI);
13399 
13400   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
13401   // vectors to integer vectors.
13402   if (EltTy->isPointerTy()) {
13403     Type *IntTy = DL.getIntPtrType(EltTy);
13404 
13405     // Convert to the corresponding integer vector.
13406     Type *IntVecTy =
13407         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
13408     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
13409     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
13410 
13411     SubVecTy = VectorType::get(IntTy, LaneLen);
13412   }
13413 
13414   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
13415                                              Intrinsic::arm_neon_vst3,
13416                                              Intrinsic::arm_neon_vst4};
13417   SmallVector<Value *, 6> Ops;
13418 
13419   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
13420   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
13421 
13422   Type *Tys[] = { Int8Ptr, SubVecTy };
13423   Function *VstNFunc = Intrinsic::getDeclaration(
13424       SI->getModule(), StoreInts[Factor - 2], Tys);
13425 
13426   // Split the shufflevector operands into sub vectors for the new vstN call.
13427   auto Mask = SVI->getShuffleMask();
13428   for (unsigned i = 0; i < Factor; i++) {
13429     if (Mask[i] >= 0) {
13430       Ops.push_back(Builder.CreateShuffleVector(
13431           Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
13432     } else {
13433       unsigned StartMask = 0;
13434       for (unsigned j = 1; j < LaneLen; j++) {
13435         if (Mask[j*Factor + i] >= 0) {
13436           StartMask = Mask[j*Factor + i] - j;
13437           break;
13438         }
13439       }
      // Note: if all elements in a chunk are undef, StartMask stays 0, so we
      // default to using elements starting from 0. Filling undef gaps with
      // arbitrary elements is fine, since those lanes were being written with
      // undef anyway. StartMask cannot be negative; that is checked in
      // isReInterleaveMask.
13445       Ops.push_back(Builder.CreateShuffleVector(
13446           Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
13447     }
13448   }
13449 
13450   Ops.push_back(Builder.getInt32(SI->getAlignment()));
13451   Builder.CreateCall(VstNFunc, Ops);
13452   return true;
13453 }
13454 
13455 enum HABaseType {
13456   HA_UNKNOWN = 0,
13457   HA_FLOAT,
13458   HA_DOUBLE,
13459   HA_VECT64,
13460   HA_VECT128
13461 };
13462 
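// Recursively check whether Ty is an AAPCS-VFP homogeneous aggregate: one to
// four members that all share a single base type (float, double, or a 64- or
// 128-bit vector).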
13463 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
13464                                    uint64_t &Members) {
13465   if (auto *ST = dyn_cast<StructType>(Ty)) {
13466     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
13467       uint64_t SubMembers = 0;
13468       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
13469         return false;
13470       Members += SubMembers;
13471     }
13472   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
13473     uint64_t SubMembers = 0;
13474     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
13475       return false;
13476     Members += SubMembers * AT->getNumElements();
13477   } else if (Ty->isFloatTy()) {
13478     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
13479       return false;
13480     Members = 1;
13481     Base = HA_FLOAT;
13482   } else if (Ty->isDoubleTy()) {
13483     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
13484       return false;
13485     Members = 1;
13486     Base = HA_DOUBLE;
13487   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
13488     Members = 1;
13489     switch (Base) {
13490     case HA_FLOAT:
13491     case HA_DOUBLE:
13492       return false;
13493     case HA_VECT64:
13494       return VT->getBitWidth() == 64;
13495     case HA_VECT128:
13496       return VT->getBitWidth() == 128;
13497     case HA_UNKNOWN:
13498       switch (VT->getBitWidth()) {
13499       case 64:
13500         Base = HA_VECT64;
13501         return true;
13502       case 128:
13503         Base = HA_VECT128;
13504         return true;
13505       default:
13506         return false;
13507       }
13508     }
13509   }
13510 
13511   return (Members > 0 && Members <= 4);
13512 }
13513 
13514 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
13515 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
13516 /// passing according to AAPCS rules.
13517 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
13518     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
13519   if (getEffectiveCallingConv(CallConv, isVarArg) !=
13520       CallingConv::ARM_AAPCS_VFP)
13521     return false;
13522 
13523   HABaseType Base = HA_UNKNOWN;
13524   uint64_t Members = 0;
13525   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
13526   DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
13527 
13528   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
13529   return IsHA || IsIntArray;
13530 }
13531 
13532 unsigned ARMTargetLowering::getExceptionPointerRegister(
13533     const Constant *PersonalityFn) const {
13534   // Platforms which do not use SjLj EH may return values in these registers
13535   // via the personality function.
13536   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
13537 }
13538 
13539 unsigned ARMTargetLowering::getExceptionSelectorRegister(
13540     const Constant *PersonalityFn) const {
13541   // Platforms which do not use SjLj EH may return values in these registers
13542   // via the personality function.
13543   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
13544 }
13545 
13546 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
13547   // Update IsSplitCSR in ARMFunctionInfo.
13548   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
13549   AFI->setIsSplitCSR(true);
13550 }
13551 
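// For functions using split CSRs (e.g. CXX_FAST_TLS), copy each callee-saved
// register into a virtual register in the entry block and copy it back into
// the physical register before the terminator of every exit block.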
13552 void ARMTargetLowering::insertCopiesSplitCSR(
13553     MachineBasicBlock *Entry,
13554     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
13555   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
13556   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
13557   if (!IStart)
13558     return;
13559 
13560   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
13561   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
13562   MachineBasicBlock::iterator MBBI = Entry->begin();
13563   for (const MCPhysReg *I = IStart; *I; ++I) {
13564     const TargetRegisterClass *RC = nullptr;
13565     if (ARM::GPRRegClass.contains(*I))
13566       RC = &ARM::GPRRegClass;
13567     else if (ARM::DPRRegClass.contains(*I))
13568       RC = &ARM::DPRRegClass;
13569     else
13570       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
13571 
13572     unsigned NewVR = MRI->createVirtualRegister(RC);
13573     // Create copy from CSR to a virtual register.
13574     // FIXME: this currently does not emit CFI pseudo-instructions, it works
13575     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
13576     // nounwind. If we want to generalize this later, we may need to emit
13577     // CFI pseudo-instructions.
13578     assert(Entry->getParent()->getFunction()->hasFnAttribute(
13579                Attribute::NoUnwind) &&
13580            "Function should be nounwind in insertCopiesSplitCSR!");
13581     Entry->addLiveIn(*I);
13582     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
13583         .addReg(*I);
13584 
13585     // Insert the copy-back instructions right before the terminator.
13586     for (auto *Exit : Exits)
13587       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
13588               TII->get(TargetOpcode::COPY), *I)
13589           .addReg(NewVR);
13590   }
13591 }
13592