1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that ARM uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARMISelLowering.h"
16 #include "ARMCallingConv.h"
17 #include "ARMConstantPoolValue.h"
18 #include "ARMMachineFunctionInfo.h"
19 #include "ARMPerfectShuffle.h"
20 #include "ARMSubtarget.h"
21 #include "ARMTargetMachine.h"
22 #include "ARMTargetObjectFile.h"
23 #include "MCTargetDesc/ARMAddressingModes.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
26 #include "llvm/ADT/StringSwitch.h"
27 #include "llvm/CodeGen/Analysis.h"
28 #include "llvm/CodeGen/CallingConvLower.h"
29 #include "llvm/CodeGen/IntrinsicLowering.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFrameInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
35 #include "llvm/CodeGen/MachineModuleInfo.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/SelectionDAG.h"
38 #include "llvm/IR/CallingConv.h"
39 #include "llvm/IR/Constants.h"
40 #include "llvm/IR/Function.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/IR/IRBuilder.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/MC/MCSectionMachO.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <utility>
using namespace llvm;

#define DEBUG_TYPE "arm-isel"

// Counters reported via -stats for this backend's lowering decisions.
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");

// Hidden command-line knob to toggle ARM/Thumb interworking; defaults to
// enabled and exists only to aid debugging (see cl::desc below).
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
  cl::desc("Enable / disable ARM interworking (for debugging only)"),
  cl::init(true));
68 
69 namespace {
70   class ARMCCState : public CCState {
71   public:
72     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
73                SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
74                ParmContext PC)
75         : CCState(CC, isVarArg, MF, locs, C) {
76       assert(((PC == Call) || (PC == Prologue)) &&
77              "ARMCCState users must specify whether their context is call"
78              "or prologue generation.");
79       CallOrPrologue = PC;
80     }
81   };
82 }
83 
// The APCS parameter registers.
// r0-r3 are the general-purpose registers used for passing the first
// integer arguments; later code indexes into this array when assigning
// incoming/outgoing argument registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
88 
89 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
90                                        MVT PromotedBitwiseVT) {
91   if (VT != PromotedLdStVT) {
92     setOperationAction(ISD::LOAD, VT, Promote);
93     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
94 
95     setOperationAction(ISD::STORE, VT, Promote);
96     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
97   }
98 
99   MVT ElemTy = VT.getVectorElementType();
100   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
101     setOperationAction(ISD::SETCC, VT, Custom);
102   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
103   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
104   if (ElemTy == MVT::i32) {
105     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
106     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
107     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
108     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
109   } else {
110     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
111     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
112     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
113     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
114   }
115   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
116   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
117   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
118   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
119   setOperationAction(ISD::SELECT,            VT, Expand);
120   setOperationAction(ISD::SELECT_CC,         VT, Expand);
121   setOperationAction(ISD::VSELECT,           VT, Expand);
122   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
123   if (VT.isInteger()) {
124     setOperationAction(ISD::SHL, VT, Custom);
125     setOperationAction(ISD::SRA, VT, Custom);
126     setOperationAction(ISD::SRL, VT, Custom);
127   }
128 
129   // Promote all bit-wise operations.
130   if (VT.isInteger() && VT != PromotedBitwiseVT) {
131     setOperationAction(ISD::AND, VT, Promote);
132     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
133     setOperationAction(ISD::OR,  VT, Promote);
134     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
135     setOperationAction(ISD::XOR, VT, Promote);
136     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
137   }
138 
139   // Neon does not support vector divide/remainder operations.
140   setOperationAction(ISD::SDIV, VT, Expand);
141   setOperationAction(ISD::UDIV, VT, Expand);
142   setOperationAction(ISD::FDIV, VT, Expand);
143   setOperationAction(ISD::SREM, VT, Expand);
144   setOperationAction(ISD::UREM, VT, Expand);
145   setOperationAction(ISD::FREM, VT, Expand);
146 
147   if (!VT.isFloatingPoint() &&
148       VT != MVT::v2i64 && VT != MVT::v1i64)
149     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
150       setOperationAction(Opcode, VT, Legal);
151 }
152 
153 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
154   addRegisterClass(VT, &ARM::DPRRegClass);
155   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
156 }
157 
158 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
159   addRegisterClass(VT, &ARM::DPairRegClass);
160   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
161 }
162 
163 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
164                                      const ARMSubtarget &STI)
165     : TargetLowering(TM), Subtarget(&STI) {
166   RegInfo = Subtarget->getRegisterInfo();
167   Itins = Subtarget->getInstrItineraryData();
168 
169   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
170 
171   if (Subtarget->isTargetMachO()) {
172     // Uses VFP for Thumb libfuncs if available.
173     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
174         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
175       static const struct {
176         const RTLIB::Libcall Op;
177         const char * const Name;
178         const ISD::CondCode Cond;
179       } LibraryCalls[] = {
180         // Single-precision floating-point arithmetic.
181         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
182         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
183         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
184         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
185 
186         // Double-precision floating-point arithmetic.
187         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
188         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
189         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
190         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
191 
192         // Single-precision comparisons.
193         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
194         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
195         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
196         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
197         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
198         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
199         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
200         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
201 
202         // Double-precision comparisons.
203         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
204         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
205         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
206         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
207         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
208         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
209         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
210         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
211 
212         // Floating-point to integer conversions.
213         // i64 conversions are done via library routines even when generating VFP
214         // instructions, so use the same ones.
215         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
216         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
217         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
218         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
219 
220         // Conversions between floating types.
221         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
222         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
223 
224         // Integer to floating-point conversions.
225         // i64 conversions are done via library routines even when generating VFP
226         // instructions, so use the same ones.
227         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
228         // e.g., __floatunsidf vs. __floatunssidfvfp.
229         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
230         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
231         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
232         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
233       };
234 
235       for (const auto &LC : LibraryCalls) {
236         setLibcallName(LC.Op, LC.Name);
237         if (LC.Cond != ISD::SETCC_INVALID)
238           setCmpLibcallCC(LC.Op, LC.Cond);
239       }
240     }
241 
242     // Set the correct calling convention for ARMv7k WatchOS. It's just
243     // AAPCS_VFP for functions as simple as libcalls.
244     if (Subtarget->isTargetWatchABI()) {
245       for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
246         setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
247     }
248   }
249 
250   // These libcalls are not available in 32-bit.
251   setLibcallName(RTLIB::SHL_I128, nullptr);
252   setLibcallName(RTLIB::SRL_I128, nullptr);
253   setLibcallName(RTLIB::SRA_I128, nullptr);
254 
255   // RTLIB
256   if (Subtarget->isAAPCS_ABI() &&
257       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
258        Subtarget->isTargetAndroid())) {
259     static const struct {
260       const RTLIB::Libcall Op;
261       const char * const Name;
262       const CallingConv::ID CC;
263       const ISD::CondCode Cond;
264     } LibraryCalls[] = {
265       // Double-precision floating-point arithmetic helper functions
266       // RTABI chapter 4.1.2, Table 2
267       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
268       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
269       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
270       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
271 
272       // Double-precision floating-point comparison helper functions
273       // RTABI chapter 4.1.2, Table 3
274       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
275       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
276       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
277       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
278       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
279       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
280       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
281       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
282 
283       // Single-precision floating-point arithmetic helper functions
284       // RTABI chapter 4.1.2, Table 4
285       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
286       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
287       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
288       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
289 
290       // Single-precision floating-point comparison helper functions
291       // RTABI chapter 4.1.2, Table 5
292       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
293       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
294       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
295       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
296       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
297       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
298       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
299       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
300 
301       // Floating-point to integer conversions.
302       // RTABI chapter 4.1.2, Table 6
303       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
304       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
305       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
306       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
307       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
308       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
309       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
310       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
311 
312       // Conversions between floating types.
313       // RTABI chapter 4.1.2, Table 7
314       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
315       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
316       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
317 
318       // Integer to floating-point conversions.
319       // RTABI chapter 4.1.2, Table 8
320       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
321       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
322       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
323       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
324       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
325       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
326       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
327       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
328 
329       // Long long helper functions
330       // RTABI chapter 4.2, Table 9
331       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
332       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
333       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
334       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
335 
336       // Integer division functions
337       // RTABI chapter 4.3.1
338       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
339       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
340       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
341       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
342       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
343       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
344       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
345       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
346     };
347 
348     for (const auto &LC : LibraryCalls) {
349       setLibcallName(LC.Op, LC.Name);
350       setLibcallCallingConv(LC.Op, LC.CC);
351       if (LC.Cond != ISD::SETCC_INVALID)
352         setCmpLibcallCC(LC.Op, LC.Cond);
353     }
354 
355     // EABI dependent RTLIB
356     if (TM.Options.EABIVersion == EABI::EABI4 ||
357         TM.Options.EABIVersion == EABI::EABI5) {
358       static const struct {
359         const RTLIB::Libcall Op;
360         const char *const Name;
361         const CallingConv::ID CC;
362         const ISD::CondCode Cond;
363       } MemOpsLibraryCalls[] = {
364         // Memory operations
365         // RTABI chapter 4.3.4
366         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
367         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
368         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
369       };
370 
371       for (const auto &LC : MemOpsLibraryCalls) {
372         setLibcallName(LC.Op, LC.Name);
373         setLibcallCallingConv(LC.Op, LC.CC);
374         if (LC.Cond != ISD::SETCC_INVALID)
375           setCmpLibcallCC(LC.Op, LC.Cond);
376       }
377     }
378   }
379 
380   if (Subtarget->isTargetWindows()) {
381     static const struct {
382       const RTLIB::Libcall Op;
383       const char * const Name;
384       const CallingConv::ID CC;
385     } LibraryCalls[] = {
386       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
387       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
388       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
389       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
390       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
391       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
392       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
393       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
394     };
395 
396     for (const auto &LC : LibraryCalls) {
397       setLibcallName(LC.Op, LC.Name);
398       setLibcallCallingConv(LC.Op, LC.CC);
399     }
400   }
401 
402   // Use divmod compiler-rt calls for iOS 5.0 and later.
403   if (Subtarget->isTargetWatchOS() ||
404       (Subtarget->isTargetIOS() &&
405        !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
406     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
407     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
408   }
409 
410   // The half <-> float conversion functions are always soft-float, but are
411   // needed for some targets which use a hard-float calling convention by
412   // default.
413   if (Subtarget->isAAPCS_ABI()) {
414     setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
415     setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
416     setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
417   } else {
418     setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
419     setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
420     setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
421   }
422 
423   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
424   // a __gnu_ prefix (which is the default).
425   if (Subtarget->isTargetAEABI()) {
426     setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
427     setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
428     setLibcallName(RTLIB::FPEXT_F16_F32,   "__aeabi_h2f");
429   }
430 
431   if (Subtarget->isThumb1Only())
432     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
433   else
434     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
435   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
436       !Subtarget->isThumb1Only()) {
437     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
438     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
439   }
440 
441   for (MVT VT : MVT::vector_valuetypes()) {
442     for (MVT InnerVT : MVT::vector_valuetypes()) {
443       setTruncStoreAction(VT, InnerVT, Expand);
444       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
445       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
446       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
447     }
448 
449     setOperationAction(ISD::MULHS, VT, Expand);
450     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
451     setOperationAction(ISD::MULHU, VT, Expand);
452     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
453 
454     setOperationAction(ISD::BSWAP, VT, Expand);
455   }
456 
457   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
458   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
459 
460   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
461   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
462 
463   if (Subtarget->hasNEON()) {
464     addDRTypeForNEON(MVT::v2f32);
465     addDRTypeForNEON(MVT::v8i8);
466     addDRTypeForNEON(MVT::v4i16);
467     addDRTypeForNEON(MVT::v2i32);
468     addDRTypeForNEON(MVT::v1i64);
469 
470     addQRTypeForNEON(MVT::v4f32);
471     addQRTypeForNEON(MVT::v2f64);
472     addQRTypeForNEON(MVT::v16i8);
473     addQRTypeForNEON(MVT::v8i16);
474     addQRTypeForNEON(MVT::v4i32);
475     addQRTypeForNEON(MVT::v2i64);
476 
477     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
478     // neither Neon nor VFP support any arithmetic operations on it.
479     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
480     // supported for v4f32.
481     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
482     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
483     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
484     // FIXME: Code duplication: FDIV and FREM are expanded always, see
485     // ARMTargetLowering::addTypeForNEON method for details.
486     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
487     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
488     // FIXME: Create unittest.
489     // In another words, find a way when "copysign" appears in DAG with vector
490     // operands.
491     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
492     // FIXME: Code duplication: SETCC has custom operation action, see
493     // ARMTargetLowering::addTypeForNEON method for details.
494     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
495     // FIXME: Create unittest for FNEG and for FABS.
496     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
497     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
498     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
499     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
500     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
501     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
502     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
503     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
504     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
505     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
506     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
507     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
508     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
509     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
510     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
511     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
512     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
513     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
514     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
515 
516     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
517     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
518     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
519     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
520     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
521     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
522     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
523     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
524     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
525     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
526     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
527     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
528     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
529     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
530     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
531 
532     // Mark v2f32 intrinsics.
533     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
534     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
535     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
536     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
537     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
538     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
539     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
540     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
541     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
542     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
543     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
544     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
545     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
546     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
547     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
548 
549     // Neon does not support some operations on v1i64 and v2i64 types.
550     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
551     // Custom handling for some quad-vector types to detect VMULL.
552     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
553     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
554     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
555     // Custom handling for some vector types to avoid expensive expansions
556     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
557     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
558     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
559     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
560     setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
561     setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
562     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
563     // a destination type that is wider than the source, and nor does
564     // it have a FP_TO_[SU]INT instruction with a narrower destination than
565     // source.
566     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
567     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
568     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
569     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
570 
571     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
572     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
573 
574     // NEON does not have single instruction CTPOP for vectors with element
575     // types wider than 8-bits.  However, custom lowering can leverage the
576     // v8i8/v16i8 vcnt instruction.
577     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
578     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
579     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
580     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
581     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
582     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
583 
584     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
585     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
586 
587     // NEON does not have single instruction CTTZ for vectors.
588     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
589     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
590     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
591     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
592 
593     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
594     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
595     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
596     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
597 
598     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
599     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
600     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
601     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
602 
603     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
604     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
605     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
606     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
607 
608     // NEON only has FMA instructions as of VFP4.
609     if (!Subtarget->hasVFP4()) {
610       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
611       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
612     }
613 
614     setTargetDAGCombine(ISD::INTRINSIC_VOID);
615     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
616     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
617     setTargetDAGCombine(ISD::SHL);
618     setTargetDAGCombine(ISD::SRL);
619     setTargetDAGCombine(ISD::SRA);
620     setTargetDAGCombine(ISD::SIGN_EXTEND);
621     setTargetDAGCombine(ISD::ZERO_EXTEND);
622     setTargetDAGCombine(ISD::ANY_EXTEND);
623     setTargetDAGCombine(ISD::BUILD_VECTOR);
624     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
625     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
626     setTargetDAGCombine(ISD::STORE);
627     setTargetDAGCombine(ISD::FP_TO_SINT);
628     setTargetDAGCombine(ISD::FP_TO_UINT);
629     setTargetDAGCombine(ISD::FDIV);
630     setTargetDAGCombine(ISD::LOAD);
631 
632     // It is legal to extload from v4i8 to v4i16 or v4i32.
633     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
634                    MVT::v2i32}) {
635       for (MVT VT : MVT::integer_vector_valuetypes()) {
636         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
637         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
638         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
639       }
640     }
641   }
642 
643   // ARM and Thumb2 support UMLAL/SMLAL.
644   if (!Subtarget->isThumb1Only())
645     setTargetDAGCombine(ISD::ADDC);
646 
647   if (Subtarget->isFPOnlySP()) {
648     // When targeting a floating-point unit with only single-precision
649     // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
651     // loads and stores are provided by the hardware.
652     setOperationAction(ISD::FADD,       MVT::f64, Expand);
653     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
654     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
655     setOperationAction(ISD::FMA,        MVT::f64, Expand);
656     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
657     setOperationAction(ISD::FREM,       MVT::f64, Expand);
658     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
659     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
660     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
661     setOperationAction(ISD::FABS,       MVT::f64, Expand);
662     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
663     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
664     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
665     setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
666     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
667     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
668     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
669     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
670     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
671     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
672     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
673     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
674     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
675     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
676     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
677     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
678     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
679     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
680     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
681     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
682     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
683     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
684     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
685   }
686 
687   computeRegisterProperties(Subtarget->getRegisterInfo());
688 
689   // ARM does not have floating-point extending loads.
690   for (MVT VT : MVT::fp_valuetypes()) {
691     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
692     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
693   }
694 
695   // ... or truncating stores
696   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
697   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
698   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
699 
700   // ARM does not have i1 sign extending load.
701   for (MVT VT : MVT::integer_valuetypes())
702     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
703 
704   // ARM supports all 4 flavors of integer indexed load / store.
705   if (!Subtarget->isThumb1Only()) {
706     for (unsigned im = (unsigned)ISD::PRE_INC;
707          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
708       setIndexedLoadAction(im,  MVT::i1,  Legal);
709       setIndexedLoadAction(im,  MVT::i8,  Legal);
710       setIndexedLoadAction(im,  MVT::i16, Legal);
711       setIndexedLoadAction(im,  MVT::i32, Legal);
712       setIndexedStoreAction(im, MVT::i1,  Legal);
713       setIndexedStoreAction(im, MVT::i8,  Legal);
714       setIndexedStoreAction(im, MVT::i16, Legal);
715       setIndexedStoreAction(im, MVT::i32, Legal);
716     }
717   }
718 
719   setOperationAction(ISD::SADDO, MVT::i32, Custom);
720   setOperationAction(ISD::UADDO, MVT::i32, Custom);
721   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
722   setOperationAction(ISD::USUBO, MVT::i32, Custom);
723 
724   // i64 operation support.
725   setOperationAction(ISD::MUL,     MVT::i64, Expand);
726   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
727   if (Subtarget->isThumb1Only()) {
728     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
729     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
730   }
731   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
732       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
733     setOperationAction(ISD::MULHS, MVT::i32, Expand);
734 
735   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
736   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
737   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
738   setOperationAction(ISD::SRL,       MVT::i64, Custom);
739   setOperationAction(ISD::SRA,       MVT::i64, Custom);
740 
741   if (!Subtarget->isThumb1Only()) {
742     // FIXME: We should do this for Thumb1 as well.
743     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
744     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
745     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
746     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
747   }
748 
749   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
750     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
751 
752   // ARM does not have ROTL.
753   setOperationAction(ISD::ROTL, MVT::i32, Expand);
754   for (MVT VT : MVT::vector_valuetypes()) {
755     setOperationAction(ISD::ROTL, VT, Expand);
756     setOperationAction(ISD::ROTR, VT, Expand);
757   }
758   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
759   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
760   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
761     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
762 
763   // @llvm.readcyclecounter requires the Performance Monitors extension.
764   // Default to the 0 expansion on unsupported platforms.
765   // FIXME: Technically there are older ARM CPUs that have
766   // implementation-specific ways of obtaining this information.
767   if (Subtarget->hasPerfMon())
768     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
769 
770   // Only ARMv6 has BSWAP.
771   if (!Subtarget->hasV6Ops())
772     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
773 
774   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
775                                         : Subtarget->hasDivideInARMMode();
776   if (!hasDivide) {
777     // These are expanded into libcalls if the cpu doesn't have HW divider.
778     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
779     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
780   }
781 
782   if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
783     setOperationAction(ISD::SDIV, MVT::i32, Custom);
784     setOperationAction(ISD::UDIV, MVT::i32, Custom);
785 
786     setOperationAction(ISD::SDIV, MVT::i64, Custom);
787     setOperationAction(ISD::UDIV, MVT::i64, Custom);
788   }
789 
790   setOperationAction(ISD::SREM,  MVT::i32, Expand);
791   setOperationAction(ISD::UREM,  MVT::i32, Expand);
792   // Register based DivRem for AEABI (RTABI 4.2)
793   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
794       Subtarget->isTargetGNUAEABI()) {
795     setOperationAction(ISD::SREM, MVT::i64, Custom);
796     setOperationAction(ISD::UREM, MVT::i64, Custom);
797 
798     setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
799     setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
800     setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
801     setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
802     setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
803     setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
804     setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
805     setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
806 
807     setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
808     setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
809     setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
810     setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
811     setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
812     setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
813     setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
814     setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
815 
816     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
817     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
818     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
819     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
820   } else {
821     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
822     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
823   }
824 
825   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
826   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
827   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
828   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
829 
830   setOperationAction(ISD::TRAP, MVT::Other, Legal);
831 
832   // Use the default implementation.
833   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
834   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
835   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
836   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
837   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
838   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
839 
840   if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
841     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
842   else
843     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
844 
845   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
846   // the default expansion.
847   InsertFencesForAtomic = false;
848   if (Subtarget->hasAnyDataBarrier() &&
849       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
850     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
851     // to ldrex/strex loops already.
852     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
853     if (!Subtarget->isThumb() || !Subtarget->isMClass())
854       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
855 
856     // On v8, we have particularly efficient implementations of atomic fences
857     // if they can be combined with nearby atomic loads and stores.
858     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
859       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
860       InsertFencesForAtomic = true;
861     }
862   } else {
863     // If there's anything we can use as a barrier, go through custom lowering
864     // for ATOMIC_FENCE.
865     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
866                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
867 
868     // Set them all for expansion, which will force libcalls.
869     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
870     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
871     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
872     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
873     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
874     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
875     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
876     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
877     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
878     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
879     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
880     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
881     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
882     // Unordered/Monotonic case.
883     setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
884     setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
885   }
886 
887   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
888 
889   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
890   if (!Subtarget->hasV6Ops()) {
891     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
892     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
893   }
894   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
895 
896   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
897       !Subtarget->isThumb1Only()) {
898     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
899     // iff target supports vfp2.
900     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
901     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
902   }
903 
904   // We want to custom lower some of our intrinsics.
905   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
906   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
907   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
908   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
909   if (Subtarget->useSjLjEH())
910     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
911 
912   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
913   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
914   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
915   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
916   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
917   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
918   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
919   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
920   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
921 
922   // Thumb-1 cannot currently select ARMISD::SUBE.
923   if (!Subtarget->isThumb1Only())
924     setOperationAction(ISD::SETCCE, MVT::i32, Custom);
925 
926   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
927   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
928   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
929   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
930   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
931 
932   // We don't support sin/cos/fmod/copysign/pow
933   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
934   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
935   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
936   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
937   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
938   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
939   setOperationAction(ISD::FREM,      MVT::f64, Expand);
940   setOperationAction(ISD::FREM,      MVT::f32, Expand);
941   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
942       !Subtarget->isThumb1Only()) {
943     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
944     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
945   }
946   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
947   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
948 
949   if (!Subtarget->hasVFP4()) {
950     setOperationAction(ISD::FMA, MVT::f64, Expand);
951     setOperationAction(ISD::FMA, MVT::f32, Expand);
952   }
953 
954   // Various VFP goodness
955   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
956     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
957     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
958       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
959       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
960     }
961 
962     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
963     if (!Subtarget->hasFP16()) {
964       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
965       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
966     }
967   }
968 
969   // Combine sin / cos into one node or libcall if possible.
970   if (Subtarget->hasSinCos()) {
971     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
972     setLibcallName(RTLIB::SINCOS_F64, "sincos");
973     if (Subtarget->isTargetWatchABI()) {
974       setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
975       setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
976     }
977     if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
      // For iOS, we don't want the normal expansion of a libcall to
979       // sincos. We want to issue a libcall to __sincos_stret.
980       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
981       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
982     }
983   }
984 
985   // FP-ARMv8 implements a lot of rounding-like FP operations.
986   if (Subtarget->hasFPARMv8()) {
987     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
988     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
989     setOperationAction(ISD::FROUND, MVT::f32, Legal);
990     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
991     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
992     setOperationAction(ISD::FRINT, MVT::f32, Legal);
993     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
994     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
995     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
996     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
997     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
998     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
999 
1000     if (!Subtarget->isFPOnlySP()) {
1001       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1002       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1003       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1004       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1005       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1006       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1007       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1008       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1009     }
1010   }
1011 
1012   if (Subtarget->hasNEON()) {
1013     // vmin and vmax aren't available in a scalar form, so we use
1014     // a NEON instruction with an undef lane instead.
1015     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
1016     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
1017     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
1018     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
1019     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
1020     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
1021   }
1022 
1023   // We have target-specific dag combine patterns for the following nodes:
1024   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1025   setTargetDAGCombine(ISD::ADD);
1026   setTargetDAGCombine(ISD::SUB);
1027   setTargetDAGCombine(ISD::MUL);
1028   setTargetDAGCombine(ISD::AND);
1029   setTargetDAGCombine(ISD::OR);
1030   setTargetDAGCombine(ISD::XOR);
1031 
1032   if (Subtarget->hasV6Ops())
1033     setTargetDAGCombine(ISD::SRL);
1034 
1035   setStackPointerRegisterToSaveRestore(ARM::SP);
1036 
1037   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1038       !Subtarget->hasVFP2())
1039     setSchedulingPreference(Sched::RegPressure);
1040   else
1041     setSchedulingPreference(Sched::Hybrid);
1042 
1043   //// temporary - rewrite interface to use type
1044   MaxStoresPerMemset = 8;
1045   MaxStoresPerMemsetOptSize = 4;
1046   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1047   MaxStoresPerMemcpyOptSize = 2;
1048   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1049   MaxStoresPerMemmoveOptSize = 2;
1050 
1051   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1052   // are at least 4 bytes aligned.
1053   setMinStackArgumentAlignment(4);
1054 
1055   // Prefer likely predicted branches to selects on out-of-order cores.
1056   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1057 
1058   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1059 }
1060 
/// Returns true when the subtarget requests soft-float; simply forwards the
/// ARMSubtarget policy to the generic TargetLowering query.
bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}
1064 
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
1075 std::pair<const TargetRegisterClass *, uint8_t>
1076 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1077                                            MVT VT) const {
1078   const TargetRegisterClass *RRC = nullptr;
1079   uint8_t Cost = 1;
1080   switch (VT.SimpleTy) {
1081   default:
1082     return TargetLowering::findRepresentativeClass(TRI, VT);
1083   // Use DPR as representative register class for all floating point
1084   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1085   // the cost is 1 for both f32 and f64.
1086   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1087   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1088     RRC = &ARM::DPRRegClass;
1089     // When NEON is used for SP, only half of the register file is available
1090     // because operations that define both SP and DP results will be constrained
1091     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1092     // coalescing by double-counting the SP regs. See the FIXME above.
1093     if (Subtarget->useNEONForSinglePrecisionFP())
1094       Cost = 2;
1095     break;
1096   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1097   case MVT::v4f32: case MVT::v2f64:
1098     RRC = &ARM::DPRRegClass;
1099     Cost = 2;
1100     break;
1101   case MVT::v4i64:
1102     RRC = &ARM::DPRRegClass;
1103     Cost = 4;
1104     break;
1105   case MVT::v8i64:
1106     RRC = &ARM::DPRRegClass;
1107     Cost = 8;
1108     break;
1109   }
1110   return std::make_pair(RRC, Cost);
1111 }
1112 
1113 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1114   switch ((ARMISD::NodeType)Opcode) {
1115   case ARMISD::FIRST_NUMBER:  break;
1116   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1117   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1118   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1119   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1120   case ARMISD::CALL:          return "ARMISD::CALL";
1121   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1122   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1123   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1124   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1125   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1126   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1127   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1128   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1129   case ARMISD::CMP:           return "ARMISD::CMP";
1130   case ARMISD::CMN:           return "ARMISD::CMN";
1131   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1132   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1133   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1134   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1135   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1136 
1137   case ARMISD::CMOV:          return "ARMISD::CMOV";
1138 
1139   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1140   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1141   case ARMISD::RRX:           return "ARMISD::RRX";
1142 
1143   case ARMISD::ADDC:          return "ARMISD::ADDC";
1144   case ARMISD::ADDE:          return "ARMISD::ADDE";
1145   case ARMISD::SUBC:          return "ARMISD::SUBC";
1146   case ARMISD::SUBE:          return "ARMISD::SUBE";
1147 
1148   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1149   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1150 
1151   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1152   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1153   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1154 
1155   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1156 
1157   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1158 
1159   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1160 
1161   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1162 
1163   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1164 
1165   case ARMISD::WIN__CHKSTK:   return "ARMISD:::WIN__CHKSTK";
1166   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1167 
1168   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1169   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1170   case ARMISD::VCGE:          return "ARMISD::VCGE";
1171   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1172   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1173   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1174   case ARMISD::VCGT:          return "ARMISD::VCGT";
1175   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1176   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1177   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1178   case ARMISD::VTST:          return "ARMISD::VTST";
1179 
1180   case ARMISD::VSHL:          return "ARMISD::VSHL";
1181   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1182   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1183   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1184   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1185   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1186   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1187   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1188   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1189   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1190   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1191   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1192   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1193   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1194   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1195   case ARMISD::VSLI:          return "ARMISD::VSLI";
1196   case ARMISD::VSRI:          return "ARMISD::VSRI";
1197   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1198   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1199   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1200   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1201   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1202   case ARMISD::VDUP:          return "ARMISD::VDUP";
1203   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1204   case ARMISD::VEXT:          return "ARMISD::VEXT";
1205   case ARMISD::VREV64:        return "ARMISD::VREV64";
1206   case ARMISD::VREV32:        return "ARMISD::VREV32";
1207   case ARMISD::VREV16:        return "ARMISD::VREV16";
1208   case ARMISD::VZIP:          return "ARMISD::VZIP";
1209   case ARMISD::VUZP:          return "ARMISD::VUZP";
1210   case ARMISD::VTRN:          return "ARMISD::VTRN";
1211   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1212   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1213   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1214   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1215   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1216   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1217   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1218   case ARMISD::BFI:           return "ARMISD::BFI";
1219   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1220   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1221   case ARMISD::VBSL:          return "ARMISD::VBSL";
1222   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1223   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1224   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1225   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1226   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1227   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1228   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1229   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1230   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1231   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1232   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1233   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1234   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1235   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1236   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1237   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1238   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1239   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1240   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1241   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1242   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1243   }
1244   return nullptr;
1245 }
1246 
1247 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1248                                           EVT VT) const {
1249   if (!VT.isVector())
1250     return getPointerTy(DL);
1251   return VT.changeVectorElementTypeToInteger();
1252 }
1253 
1254 /// getRegClassFor - Return the register class that should be used for the
1255 /// specified value type.
1256 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1257   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1258   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1259   // load / store 4 to 8 consecutive D registers.
1260   if (Subtarget->hasNEON()) {
1261     if (VT == MVT::v4i64)
1262       return &ARM::QQPRRegClass;
1263     if (VT == MVT::v8i64)
1264       return &ARM::QQQQPRRegClass;
1265   }
1266   return TargetLowering::getRegClassFor(VT);
1267 }
1268 
1269 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1270 // source/dest is aligned and the copy size is large enough. We therefore want
1271 // to align such objects passed to memory intrinsics.
1272 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1273                                                unsigned &PrefAlign) const {
1274   if (!isa<MemIntrinsic>(CI))
1275     return false;
1276   MinSize = 8;
1277   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1278   // cycle faster than 4-byte aligned LDM.
1279   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1280   return true;
1281 }
1282 
// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  // Delegate to the ARM-specific FastISel factory.
  return ARM::createFastISel(funcInfo, libInfo);
}
1289 
1290 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1291   unsigned NumVals = N->getNumValues();
1292   if (!NumVals)
1293     return Sched::RegPressure;
1294 
1295   for (unsigned i = 0; i != NumVals; ++i) {
1296     EVT VT = N->getValueType(i);
1297     if (VT == MVT::Glue || VT == MVT::Other)
1298       continue;
1299     if (VT.isFloatingPoint() || VT.isVector())
1300       return Sched::ILP;
1301   }
1302 
1303   if (!N->isMachineOpcode())
1304     return Sched::RegPressure;
1305 
1306   // Load are scheduled for latency even if there instruction itinerary
1307   // is not available.
1308   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1309   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1310 
1311   if (MCID.getNumDefs() == 0)
1312     return Sched::RegPressure;
1313   if (!Itins->isEmpty() &&
1314       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1315     return Sched::ILP;
1316 
1317   return Sched::RegPressure;
1318 }
1319 
1320 //===----------------------------------------------------------------------===//
1321 // Lowering Code
1322 //===----------------------------------------------------------------------===//
1323 
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
/// Signed comparisons map to the signed ARM conditions (GT/GE/LT/LE) and
/// unsigned comparisons to the unsigned ones (HI/HS/LO/LS).
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  // Signed comparisons.
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  // Unsigned comparisons.
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}
1340 
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
/// Some FP conditions cannot be expressed with a single ARM condition code;
/// for those, CondCode2 is set to a second condition that must also be
/// checked (the combined result is CondCode || CondCode2). CondCode2 stays
/// ARMCC::AL when a single condition suffices.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  // Default: only one condition is needed.
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  // Ordered-not-equal needs two checks: less-than OR greater-than.
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  // Unordered-or-equal needs two checks: equal OR unordered.
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}
1369 
1370 //===----------------------------------------------------------------------===//
1371 //                      Calling Convention Implementation
1372 //===----------------------------------------------------------------------===//
1373 
1374 #include "ARMGenCallingConv.inc"
1375 
1376 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1377 /// account presence of floating point hardware and calling convention
1378 /// limitations, such as support for variadic functions.
1379 CallingConv::ID
1380 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1381                                            bool isVarArg) const {
1382   switch (CC) {
1383   default:
1384     llvm_unreachable("Unsupported calling convention");
1385   case CallingConv::ARM_AAPCS:
1386   case CallingConv::ARM_APCS:
1387   case CallingConv::GHC:
1388     return CC;
1389   case CallingConv::PreserveMost:
1390     return CallingConv::PreserveMost;
1391   case CallingConv::ARM_AAPCS_VFP:
1392   case CallingConv::Swift:
1393     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1394   case CallingConv::C:
1395     if (!Subtarget->isAAPCS_ABI())
1396       return CallingConv::ARM_APCS;
1397     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1398              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1399              !isVarArg)
1400       return CallingConv::ARM_AAPCS_VFP;
1401     else
1402       return CallingConv::ARM_AAPCS;
1403   case CallingConv::Fast:
1404   case CallingConv::CXX_FAST_TLS:
1405     if (!Subtarget->isAAPCS_ABI()) {
1406       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1407         return CallingConv::Fast;
1408       return CallingConv::ARM_APCS;
1409     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1410       return CallingConv::ARM_AAPCS_VFP;
1411     else
1412       return CallingConv::ARM_AAPCS;
1413   }
1414 }
1415 
1416 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1417 /// CallingConvention.
1418 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1419                                                  bool Return,
1420                                                  bool isVarArg) const {
1421   switch (getEffectiveCallingConv(CC, isVarArg)) {
1422   default:
1423     llvm_unreachable("Unsupported calling convention");
1424   case CallingConv::ARM_APCS:
1425     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1426   case CallingConv::ARM_AAPCS:
1427     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1428   case CallingConv::ARM_AAPCS_VFP:
1429     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1430   case CallingConv::Fast:
1431     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1432   case CallingConv::GHC:
1433     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1434   case CallingConv::PreserveMost:
1435     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1436   }
1437 }
1438 
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// Results assigned a "custom" location are f64 values (or halves of a
/// v2f64) returned split across pairs of i32 GPRs; those are reassembled
/// with VMOVDRR. When isThisReturn is set, the first result is the 'this'
/// pointer passed straight through from ThisVal instead of being copied out
/// of a register.
SDValue
ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   SDLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals,
                                   bool isThisReturn, SDValue ThisVal) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext(), Call);
  CCInfo.AnalyzeCallResult(Ins,
                           CCAssignFnForNode(CallConv, /* Return*/ true,
                                             isVarArg));

  // Copy all of the result registers out of their specified physreg.
  // Note: i may be advanced inside the loop body when a value spans
  // multiple register locations.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64: copy the two i32 halves out of
      // consecutive register locations and recombine them.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      // Big-endian targets return the halves in the opposite order.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // A v2f64 occupies four i32 locations; the second f64 half is
        // rebuilt the same way and inserted into lane 1.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Simple case: the whole value lives in one register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // The value was passed bit-cast; convert back to the original type.
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
1525 
1526 /// LowerMemOpCallTo - Store the argument to the stack.
1527 SDValue
1528 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1529                                     SDValue StackPtr, SDValue Arg,
1530                                     SDLoc dl, SelectionDAG &DAG,
1531                                     const CCValAssign &VA,
1532                                     ISD::ArgFlagsTy Flags) const {
1533   unsigned LocMemOffset = VA.getLocMemOffset();
1534   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1535   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1536                        StackPtr, PtrOff);
1537   return DAG.getStore(
1538       Chain, dl, Arg, PtrOff,
1539       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
1540       false, false, 0);
1541 }
1542 
1543 void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
1544                                          SDValue Chain, SDValue &Arg,
1545                                          RegsToPassVector &RegsToPass,
1546                                          CCValAssign &VA, CCValAssign &NextVA,
1547                                          SDValue &StackPtr,
1548                                          SmallVectorImpl<SDValue> &MemOpChains,
1549                                          ISD::ArgFlagsTy Flags) const {
1550 
1551   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1552                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1553   unsigned id = Subtarget->isLittle() ? 0 : 1;
1554   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1555 
1556   if (NextVA.isRegLoc())
1557     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1558   else {
1559     assert(NextVA.isMemLoc());
1560     if (!StackPtr.getNode())
1561       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1562                                     getPointerTy(DAG.getDataLayout()));
1563 
1564     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1565                                            dl, DAG, NextVA,
1566                                            Flags));
1567   }
1568 }
1569 
/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
///
/// Handles tail-call detection, argument marshalling (including split f64
/// pairs and byval aggregates), callee-address materialization for the
/// various code models, and finally result lowering via LowerCallResult.
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  SDLoc &dl                             = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  bool &isTailCall                      = CLI.IsTailCall;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool doesNotRet                       = CLI.DoesNotReturn;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool isThisReturn   = false;
  bool isSibCall      = false;
  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");

  // Disable tail calls if they're not supported.
  if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
    isTailCall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
                                                   Outs, OutVals, Ins, DAG);
    if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");
    // We don't support GuaranteedTailCallOpt for ARM, only automatically
    // detected sibcalls.
    if (isTailCall) {
      ++NumTailCalls;
      isSibCall = true;
    }
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext(), Call);
  CCInfo.AnalyzeCallOperands(Outs,
                             CCAssignFnForNode(CallConv, /* Return*/ false,
                                               isVarArg));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // For tail calls, memory operands are available in our caller's stack.
  if (isSibCall)
    NumBytes = 0;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!isSibCall)
    Chain = DAG.getCALLSEQ_START(Chain,
                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);

  SDValue StackPtr =
      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));

  RegsToPassVector RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization, arguments are handled later.
  // Note: i may advance by more than one per iteration when a value occupies
  // multiple locations (custom f64/v2f64 splits).
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
       i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Split the vector into its two f64 lanes and pass each as an
        // i32 pair.
        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(0, dl, MVT::i32));
        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(1, dl, MVT::i32));

        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

        VA = ArgLocs[++i]; // skip ahead to next loc
        if (VA.isRegLoc()) {
          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
        } else {
          assert(VA.isMemLoc());

          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                 dl, DAG, VA, Flags));
        }
      } else {
        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, Flags);
      }
    } else if (VA.isRegLoc()) {
      // A first i32 argument marked 'returned' lets us forward 'this'
      // through the call (see isThisReturn handling below and in
      // LowerCallResult).
      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
        assert(VA.getLocVT() == MVT::i32 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
               "unexpected use of 'returned'");
        isThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (isByVal) {
      assert(VA.isMemLoc());
      unsigned offset = 0;

      // True if this byval aggregate will be split between registers
      // and memory.
      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

      if (CurByValIdx < ByValArgsCount) {

        unsigned RegBegin, RegEnd;
        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

        // Load the register-resident part of the aggregate word by word.
        EVT PtrVT =
            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
        unsigned int i, j;
        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     false, false, false,
                                     DAG.InferPtrAlignment(AddArg));
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(j, Load));
        }

        // If parameter size outsides register area, "offset" value
        // helps us to calculate stack slot for remained part properly.
        offset = RegEnd - RegBegin;

        CCInfo.nextInRegsParam();
      }

      if (Flags.getByValSize() > 4*offset) {
        // Copy the part that did not fit in registers to the stack with a
        // COPY_STRUCT_BYVAL pseudo (expanded to a memcpy-like sequence).
        auto PtrVT = getPointerTy(DAG.getDataLayout());
        unsigned LocMemOffset = VA.getLocMemOffset();
        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                           MVT::i32);
        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
                                            MVT::i32);

        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                          Ops));
      }
    } else if (!isSibCall) {
      assert(VA.isMemLoc());

      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.

    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;
  bool isARMFunc = false;
  bool isLocalARMFunc = false;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  auto PtrVt = getPointerTy(DAG.getDataLayout());

  if (Subtarget->genLongCalls()) {
    assert((Subtarget->isTargetWindows() ||
            getTargetMachine().getRelocationModel() == Reloc::Static) &&
           "long-calls with non-static relocation model!");
    // Handle a global address or an external symbol. If it's not one of
    // those, the target's already in a register, so we don't need to do
    // anything extra.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);

      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
          false, false, 0);
    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();

      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                      ARMPCLabelIndex, 0);
      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
          false, false, 0);
    }
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    isDirect = true;
    bool isDef = GV->isStrongDefinitionForLinker();
    bool isStub = (!isDef && Subtarget->isTargetMachO()) &&
                   getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
    // ARM call to a local ARM function is predicable.
    isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
    // tBX takes a register source operand.
    if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
      Callee = DAG.getNode(
          ARMISD::WrapperPIC, dl, PtrVt,
          DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
      Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee,
                           MachinePointerInfo::getGOT(DAG.getMachineFunction()),
                           false, false, true, 0);
    } else if (Subtarget->isTargetCOFF()) {
      assert(Subtarget->isTargetWindows() &&
             "Windows is the only supported COFF target");
      unsigned TargetFlags = GV->hasDLLImportStorageClass()
                                 ? ARMII::MO_DLLIMPORT
                                 : ARMII::MO_NO_FLAG;
      Callee =
          DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags);
      // dllimport'd symbols require an extra load through the import table.
      if (GV->hasDLLImportStorageClass())
        Callee =
            DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
                        DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
                        MachinePointerInfo::getGOT(DAG.getMachineFunction()),
                        false, false, false, 0);
    } else {
      // On ELF targets for PIC code, direct calls should go through the PLT
      unsigned OpFlags = 0;
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_)
        OpFlags = ARMII::MO_PLT;
      Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    bool isStub = Subtarget->isTargetMachO() &&
                  getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV =
        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                      ARMPCLabelIndex, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(
          PtrVt, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
          false, false, 0);
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
    } else {
      unsigned OpFlags = 0;
      // On ELF targets for PIC code, direct calls should go through the PLT
      if (Subtarget->isTargetELF() &&
                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
        OpFlags = ARMII::MO_PLT;
      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags);
    }
  }

  // FIXME: handle tail calls differently.
  // Select the call opcode based on interworking and link-register needs.
  unsigned CallOpc;
  if (Subtarget->isThumb()) {
    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = ARMISD::CALL;
  } else {
    if (!isDirect && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
             // Emit regular call when code size is the priority
             !MF.getFunction()->optForMinSize())
      // "mov lr, pc; b _foo" to avoid confusing the RSP
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  if (!isTailCall) {
    const uint32_t *Mask;
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    if (isThisReturn) {
      // For 'this' returns, use the R0-preserving mask if applicable
      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
      if (!Mask) {
        // Set isThisReturn to false if the calling convention is not one that
        // allows 'returned' to be modeled in this way, so LowerCallResult does
        // not try to pass 'this' straight through
        isThisReturn = false;
        Mask = ARI->getCallPreservedMask(MF, CallConv);
      }
    } else
      Mask = ARI->getCallPreservedMask(MF, CallConv);

    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  }

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall) {
    // Tail calls terminate in a TC_RETURN; no CALLSEQ_END or result copies.
    MF.getFrameInfo()->setHasTailCall();
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, isThisReturn,
                         isThisReturn ? OutVals[0] : SDValue());
}
1987 
/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack.  Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to insure
/// this.
///
/// \param State     calling-convention state; registers are allocated from it
///                  as a side effect of this call.
/// \param Size      in: byval size in bytes; out: the portion (in bytes) that
///                  must still be passed in memory (0 if it fits in registers).
/// \param Align     requested alignment in bytes (clamped to >= 4 below).
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  assert((State->getCallOrPrologue() == Prologue ||
          State->getCallOrPrologue() == Call) &&
         "unhandled ParmContext");

  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Align = std::max(Align, 4U);

  // Grab the next free argument GPR; if none are left the whole byval goes
  // on the stack and there is nothing to record here.
  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  // Burn registers until Reg satisfies the byval's alignment, measured in
  // 4-byte register units (register-distance to r4, the first register past
  // the argument registers).
  unsigned AlignInRegs = Align / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  // Alignment padding may have consumed the last register.
  if (!Reg)
    return;

  // Number of bytes the remaining argument registers [Reg, r4) can hold.
  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and parameter size greater than size of
  // all remained GPR regs. In that case we can't split parameter, we must
  // send it to stack. We also must set NCRN to R4, so waste all
  // remained registers.
  const unsigned NSAAOffset = State->getNextStackOffset();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // First register for byval parameter is the first register that wasn't
  // allocated before this method call, so it would be "reg".
  // If parameter is small enough to be saved in range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs,
  // else parameter would be splitted between registers and stack,
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, first register is allocated in the beginning of function already,
  // allocate remained amount of registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}
2045 
2046 /// MatchingStackOffset - Return true if the given stack call argument is
2047 /// already available in the same position (relatively) of the caller's
2048 /// incoming argument stack.
2049 static
2050 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2051                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2052                          const TargetInstrInfo *TII) {
2053   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2054   int FI = INT_MAX;
2055   if (Arg.getOpcode() == ISD::CopyFromReg) {
2056     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2057     if (!TargetRegisterInfo::isVirtualRegister(VR))
2058       return false;
2059     MachineInstr *Def = MRI->getVRegDef(VR);
2060     if (!Def)
2061       return false;
2062     if (!Flags.isByVal()) {
2063       if (!TII->isLoadFromStackSlot(Def, FI))
2064         return false;
2065     } else {
2066       return false;
2067     }
2068   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2069     if (Flags.isByVal())
2070       // ByVal argument is passed in as a pointer but it's now being
2071       // dereferenced. e.g.
2072       // define @foo(%struct.X* %A) {
2073       //   tail call @bar(%struct.X* byval %A)
2074       // }
2075       return false;
2076     SDValue Ptr = Ld->getBasePtr();
2077     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2078     if (!FINode)
2079       return false;
2080     FI = FINode->getIndex();
2081   } else
2082     return false;
2083 
2084   assert(FI != INT_MAX);
2085   if (!MFI->isFixedObjectIndex(FI))
2086     return false;
2087   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2088 }
2089 
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF->hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way by both
  // conventions; otherwise the caller could not forward the callee's return
  // value unchanged.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForNode(CalleeCC, true, isVarArg),
                                  CCAssignFnForNode(CallerCC, true, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    // With differing conventions, the callee's preserved-register set must
    // cover the caller's, otherwise values live across the call could be
    // clobbered.
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
    CCInfo.AnalyzeCallOperands(Outs,
                               CCAssignFnForNode(CalleeCC, false, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      // Note: 'i' walks ArgLocs (which may contain several locations per
      // argument) while 'realArgIdx' walks Outs/OutVals; the custom-lowered
      // cases below advance 'i' past the extra locations.
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          // Plain stack argument: it must already sit at the matching offset
          // and size in the caller's incoming argument area.
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}
2219 
2220 bool
2221 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2222                                   MachineFunction &MF, bool isVarArg,
2223                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2224                                   LLVMContext &Context) const {
2225   SmallVector<CCValAssign, 16> RVLocs;
2226   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2227   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2228                                                     isVarArg));
2229 }
2230 
2231 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2232                                     SDLoc DL, SelectionDAG &DAG) {
2233   const MachineFunction &MF = DAG.getMachineFunction();
2234   const Function *F = MF.getFunction();
2235 
2236   StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
2237 
2238   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2239   // version of the "preferred return address". These offsets affect the return
2240   // instruction if this is a return from PL1 without hypervisor extensions.
2241   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2242   //    SWI:     0      "subs pc, lr, #0"
2243   //    ABORT:   +4     "subs pc, lr, #4"
2244   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2245   // UNDEF varies depending on where the exception came from ARM or Thumb
2246   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2247 
2248   int64_t LROffset;
2249   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2250       IntKind == "ABORT")
2251     LROffset = 4;
2252   else if (IntKind == "SWI" || IntKind == "UNDEF")
2253     LROffset = 0;
2254   else
2255     report_fatal_error("Unsupported interrupt attribute. If present, value "
2256                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2257 
2258   RetOps.insert(RetOps.begin() + 1,
2259                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2260 
2261   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2262 }
2263 
/// Lower an IR 'ret' into copies of the return values to the registers chosen
/// by the calling convention, followed by an ARMISD::RET_FLAG (or an
/// interrupt-return node for "interrupt" functions on non-M-class cores).
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               SDLoc dl, SelectionDAG &DAG) const {

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext(), Call);

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
                                               isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  // Note: 'i' walks RVLocs (several locations may describe one value) while
  // 'realRVLocIdx' walks OutVals; the custom-lowered cases below advance 'i'
  // past the additional locations.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // Value lives in a register of a different type; bitcast it over.
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }
  // Registers the target wants returned via copies (see
  // getCalleeSavedRegsViaCopy) are added as extra return operands so they are
  // treated as live-out of the return.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}
2388 
/// Return true if the value produced by node N is consumed only by return
/// nodes (ARMISD::RET_FLAG / ARMISD::INTRET_FLAG), looking through the
/// CopyToReg / VMOVRRD / BITCAST patterns used to move return values into
/// registers. On success, \p Chain is updated to the chain a tail call
/// should use.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  // Only single-result nodes with exactly one user can be folded this way.
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    // All users of the VMOVRRD must be CopyToReg nodes; collect them.
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    // The two copies are chained one after the other: the one whose chain
    // operand is NOT the other copy is the first, and supplies the incoming
    // chain; the other is the final copy whose users we check below.
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // Every user of the final copy must be a return node of some kind, and at
  // least one must exist.
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
2463 
2464 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2465   if (!Subtarget->supportsTailCall())
2466     return false;
2467 
2468   auto Attr =
2469       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2470   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2471     return false;
2472 
2473   return true;
2474 }
2475 
2476 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2477 // and pass the lower and high parts through.
2478 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2479   SDLoc DL(Op);
2480   SDValue WriteValue = Op->getOperand(2);
2481 
2482   // This function is only supposed to be called for i64 type argument.
2483   assert(WriteValue.getValueType() == MVT::i64
2484           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2485 
2486   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2487                            DAG.getConstant(0, DL, MVT::i32));
2488   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2489                            DAG.getConstant(1, DL, MVT::i32));
2490   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2491   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2492 }
2493 
2494 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2495 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2496 // one of the above mentioned nodes. It has to be wrapped because otherwise
2497 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2498 // be used to form addressing mode. These wrapped nodes will be selected
2499 // into MOVi.
2500 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2501   EVT PtrVT = Op.getValueType();
2502   // FIXME there is no actual debug info here
2503   SDLoc dl(Op);
2504   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2505   SDValue Res;
2506   if (CP->isMachineConstantPoolEntry())
2507     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2508                                     CP->getAlignment());
2509   else
2510     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2511                                     CP->getAlignment());
2512   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2513 }
2514 
/// This target always selects the inline jump-table encoding
/// (MachineJumpTableInfo::EK_Inline).
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}
2518 
2519 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2520                                              SelectionDAG &DAG) const {
2521   MachineFunction &MF = DAG.getMachineFunction();
2522   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2523   unsigned ARMPCLabelIndex = 0;
2524   SDLoc DL(Op);
2525   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2526   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2527   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2528   SDValue CPAddr;
2529   if (RelocM == Reloc::Static) {
2530     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2531   } else {
2532     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2533     ARMPCLabelIndex = AFI->createPICLabelUId();
2534     ARMConstantPoolValue *CPV =
2535       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2536                                       ARMCP::CPBlockAddress, PCAdj);
2537     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2538   }
2539   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2540   SDValue Result =
2541       DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
2542                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
2543                   false, false, false, 0);
2544   if (RelocM == Reloc::Static)
2545     return Result;
2546   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2547   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2548 }
2549 
/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet =
      DAG.getLoad(MVT::i32, DL, Chain, DescAddr,
                  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
                  false, true, true, 4);
  Chain = FuncTLVGet.getValue(1);

  // Record that this function adjusts the stack (the call below), so frame
  // lowering takes it into account.
  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo *MFI = F.getFrameInfo();
  MFI->setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
2615 
/// Lower a TLS address for Windows on ARM: read the TEB, index the TLS array
/// with the module's _tls_index, and add the variable's offset within the
/// module's TLS block.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  // NOTE(review): the constants appear to be the arm_mrc operands
  // (coprocessor 15, opc1 0, CRn 13, CRm 0, opc2 2) — presumably the CP15
  // thread-ID register holding the TEB pointer; confirm against the
  // llvm.arm.mrc intrinsic's operand order.
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getConstant(15, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(13, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo(),
                         false, false, false, 0);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo(),
                         false, false, false, 0);

  // Slot = TLSIndex * 4, i.e. the byte offset of this module's entry in the
  // TLS array.
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                              DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo(), false, false, false, 0);

  // Final address = per-module TLS block base + the variable's offset
  // (computed by LowerGlobalAddressWindows).
  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS,
                     LowerGlobalAddressWindows(Op, DAG));
}
2664 
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, i.e. by
// calling __tls_get_addr with a constant-pool entry (TLSGD modifier)
// describing the variable.
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // pc-relative fixups are biased by 8 in ARM mode and 4 in Thumb mode.
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  // Constant-pool entry for the global, tagged TLSGD and made pc-relative
  // (AddCurrentAddress = true).
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  false, false, false, 0);
  SDValue Chain = Argument.getValue(1);

  // Fix the loaded value up to be pc-relative by adding the PIC label address.
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain)
    .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
               DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
               0);

  // The call's first result is the address of the TLS variable.
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
2706 
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.  Both compute the address as thread-pointer + offset;
// they differ only in how the offset is materialized.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    // pc-relative fixups are biased by 8 in ARM mode and 4 in Thumb mode.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    // Constant-pool entry tagged GOTTPOFF: it holds the GOT slot that in
    // turn contains the variable's offset from the thread pointer.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
        false, false, 0);
    Chain = Offset.getValue(1);

    // Form the pc-relative address of the GOT slot.
    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: fetch the thread-pointer offset out of the GOT slot.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
        false, false, 0);
  } else {
    // local exec model: the offset (TPOFF) is a link-time constant loaded
    // directly from the constant pool — no GOT indirection needed.
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
        false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
2763 
2764 SDValue
2765 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2766   if (Subtarget->isTargetDarwin())
2767     return LowerGlobalTLSAddressDarwin(Op, DAG);
2768 
2769   if (Subtarget->isTargetWindows())
2770     return LowerGlobalTLSAddressWindows(Op, DAG);
2771 
2772   // TODO: implement the "local dynamic" model
2773   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
2774   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2775   if (DAG.getTarget().Options.EmulatedTLS)
2776     return LowerToTLSEmulatedModel(GA, DAG);
2777 
2778   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2779 
2780   switch (model) {
2781     case TLSModel::GeneralDynamic:
2782     case TLSModel::LocalDynamic:
2783       return LowerToTLSGeneralDynamicModel(GA, DAG);
2784     case TLSModel::InitialExec:
2785     case TLSModel::LocalExec:
2786       return LowerToTLSExecModels(GA, DAG, model);
2787   }
2788   llvm_unreachable("bogus TLS model");
2789 }
2790 
2791 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
2792                                                  SelectionDAG &DAG) const {
2793   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2794   SDLoc dl(Op);
2795   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2796   const TargetMachine &TM = getTargetMachine();
2797   Reloc::Model RM = TM.getRelocationModel();
2798   const Triple &TargetTriple = TM.getTargetTriple();
2799   if (RM == Reloc::PIC_) {
2800     bool UseGOT_PREL =
2801         !shouldAssumeDSOLocal(RM, TargetTriple, *GV->getParent(), GV);
2802 
2803     MachineFunction &MF = DAG.getMachineFunction();
2804     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2805     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2806     EVT PtrVT = getPointerTy(DAG.getDataLayout());
2807     SDLoc dl(Op);
2808     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2809     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2810         GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
2811         UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
2812         /*AddCurrentAddress=*/UseGOT_PREL);
2813     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2814     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2815     SDValue Result = DAG.getLoad(
2816         PtrVT, dl, DAG.getEntryNode(), CPAddr,
2817         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2818         false, false, 0);
2819     SDValue Chain = Result.getValue(1);
2820     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2821     Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2822     if (UseGOT_PREL)
2823       Result = DAG.getLoad(PtrVT, dl, Chain, Result,
2824                            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2825                            false, false, false, 0);
2826     return Result;
2827   }
2828 
2829   // If we have T2 ops, we can materialize the address directly via movt/movw
2830   // pair. This is always cheaper.
2831   if (Subtarget->useMovt(DAG.getMachineFunction())) {
2832     ++NumMovwMovt;
2833     // FIXME: Once remat is capable of dealing with instructions with register
2834     // operands, expand this into two nodes.
2835     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2836                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2837   } else {
2838     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2839     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2840     return DAG.getLoad(
2841         PtrVT, dl, DAG.getEntryNode(), CPAddr,
2842         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
2843         false, false, 0);
2844   }
2845 }
2846 
2847 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
2848                                                     SelectionDAG &DAG) const {
2849   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2850   SDLoc dl(Op);
2851   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2852   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2853 
2854   if (Subtarget->useMovt(DAG.getMachineFunction()))
2855     ++NumMovwMovt;
2856 
2857   // FIXME: Once remat is capable of dealing with instructions with register
2858   // operands, expand this into multiple nodes
2859   unsigned Wrapper =
2860       RelocM == Reloc::PIC_ ? ARMISD::WrapperPIC : ARMISD::Wrapper;
2861 
2862   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
2863   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
2864 
2865   if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2866     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
2867                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2868                          false, false, false, 0);
2869   return Result;
2870 }
2871 
2872 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
2873                                                      SelectionDAG &DAG) const {
2874   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
2875   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
2876          "Windows on ARM expects to use movw/movt");
2877 
2878   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2879   const ARMII::TOF TargetFlags =
2880     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
2881   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2882   SDValue Result;
2883   SDLoc DL(Op);
2884 
2885   ++NumMovwMovt;
2886 
2887   // FIXME: Once remat is capable of dealing with instructions with register
2888   // operands, expand this into two nodes.
2889   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
2890                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
2891                                                   TargetFlags));
2892   if (GV->hasDLLImportStorageClass())
2893     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
2894                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2895                          false, false, false, 0);
2896   return Result;
2897 }
2898 
2899 SDValue
2900 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
2901   SDLoc dl(Op);
2902   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
2903   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
2904                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
2905                      Op.getOperand(1), Val);
2906 }
2907 
2908 SDValue
2909 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
2910   SDLoc dl(Op);
2911   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
2912                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
2913 }
2914 
2915 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
2916                                                       SelectionDAG &DAG) const {
2917   SDLoc dl(Op);
2918   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
2919                      Op.getOperand(0));
2920 }
2921 
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
  // Operand 0 is the intrinsic ID; subsequent operands are its arguments.
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::arm_rbit: {
    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
           "RBIT intrinsic must have i32 type!");
    return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
  }
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::eh_sjlj_lsda: {
    // Materialize the address of this function's LSDA via a constant-pool
    // entry, made pc-relative when compiling PIC.
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
    SDValue CPAddr;
    // pc-relative fixups are biased by 8 in ARM mode and 4 in Thumb mode.
    unsigned PCAdj = (RelocM != Reloc::PIC_)
      ? 0 : (Subtarget->isThumb() ? 4 : 8);
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
        false, false, 0);

    if (RelocM == Reloc::PIC_) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
      ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    // Unsigned min/max only applies to integer vectors.
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
      ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
      ? ISD::FMINNAN : ISD::FMAXNAN;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  }
}
3002 
3003 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3004                                  const ARMSubtarget *Subtarget) {
3005   // FIXME: handle "fence singlethread" more efficiently.
3006   SDLoc dl(Op);
3007   if (!Subtarget->hasDataBarrier()) {
3008     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3009     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3010     // here.
3011     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3012            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3013     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3014                        DAG.getConstant(0, dl, MVT::i32));
3015   }
3016 
3017   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3018   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3019   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3020   if (Subtarget->isMClass()) {
3021     // Only a full system barrier exists in the M-class architectures.
3022     Domain = ARM_MB::SY;
3023   } else if (Subtarget->isSwift() && Ord == AtomicOrdering::Release) {
3024     // Swift happens to implement ISHST barriers in a way that's compatible with
3025     // Release semantics but weaker than ISH so we'd be fools not to use
3026     // it. Beware: other processors probably don't!
3027     Domain = ARM_MB::ISHST;
3028   }
3029 
3030   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3031                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3032                      DAG.getConstant(Domain, dl, MVT::i32));
3033 }
3034 
// Lower ISD::PREFETCH to ARMISD::PRELOAD, or drop it (keeping the chain)
// when the subtarget has no suitable preload instruction.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  // Operand 2 is the rw flag; invert its low bit to get "is read".
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  // Operand 4 distinguishes data (1) from instruction (0) cache prefetch.
  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}
3061 
3062 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3063   MachineFunction &MF = DAG.getMachineFunction();
3064   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3065 
3066   // vastart just stores the address of the VarArgsFrameIndex slot into the
3067   // memory location argument.
3068   SDLoc dl(Op);
3069   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3070   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3071   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3072   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3073                       MachinePointerInfo(SV), false, false, 0);
3074 }
3075 
// Reassemble an f64 formal argument that the calling convention split into
// two i32 locations: VA is the first half (always in a GPR); NextVA holds
// the second half, either in another GPR or spilled to the stack.
SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                        SDValue &Root, SelectionDAG &DAG,
                                        SDLoc dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Thumb1-only functions are restricted to the low GPRs.
  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo *MFI = MF.getFrameInfo();
    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
        false, false, 0);
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // Swap the halves on big-endian so VMOVDRR receives them in the expected
  // order.
  if (!Subtarget->isLittle())
    std::swap (ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}
3112 
3113 // The remaining GPRs hold either the beginning of variable-argument
3114 // data, or the beginning of an aggregate passed by value (usually
3115 // byval).  Either way, we allocate stack slots adjacent to the data
3116 // provided by our caller, and store the unallocated registers there.
3117 // If this is a variadic function, the va_list pointer will begin with
3118 // these values; otherwise, this reassembles a (byval) structure that
3119 // was split between registers and memory.
3120 // Return: The frame index registers were stored into.
int
ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                  SDLoc dl, SDValue &Chain,
                                  const Value *OrigArg,
                                  unsigned InRegsParamRecordIdx,
                                  int ArgOffset,
                                  unsigned ArgSize) const {
  // Currently, two use-cases possible:
  // Case #1. Non-var-args function, and we meet first byval parameter.
  //          Setup first unallocated register as first byval register;
  //          eat all remained registers
  //          (these two actions are performed by HandleByVal method).
  //          Then, here, we initialize stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function, that doesn't contain byval parameters.
  //          The same: eat all remained unallocated registers,
  //          initialize stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  // Half-open physical-register range [RBegin, REnd) that will be spilled.
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    // Case #1: use the byval register interval recorded during HandleByVal.
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    // Case #2: spill everything from the first unallocated GPR up to (but
    // not including) r4; an index of 4 means no argument regs remain.
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  // When registers are being spilled, place the fixed object at a negative
  // offset: 4 bytes per register below the incoming-argument area.
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  // Copy each argument register into a vreg and store it to consecutive
  // 4-byte slots of the frame object.
  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store =
        DAG.getStore(Val.getValue(1), dl, Val, FIN,
                     MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}
3176 
3177 // Setup stack frame, the va_list pointer will start from.
3178 void
3179 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3180                                         SDLoc dl, SDValue &Chain,
3181                                         unsigned ArgOffset,
3182                                         unsigned TotalArgRegsSaveSize,
3183                                         bool ForceMutable) const {
3184   MachineFunction &MF = DAG.getMachineFunction();
3185   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3186 
3187   // Try to store any remaining integer argument regs
3188   // to their spots on the stack so that they may be loaded by deferencing
3189   // the result of va_next.
3190   // If there is no regs to be stored, just point address after last
3191   // argument passed via stack.
3192   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3193                                   CCInfo.getInRegsParamsCount(),
3194                                   CCInfo.getNextStackOffset(), 4);
3195   AFI->setVarArgsFrameIndex(FrameIndex);
3196 }
3197 
3198 SDValue
3199 ARMTargetLowering::LowerFormalArguments(SDValue Chain,
3200                                         CallingConv::ID CallConv, bool isVarArg,
3201                                         const SmallVectorImpl<ISD::InputArg>
3202                                           &Ins,
3203                                         SDLoc dl, SelectionDAG &DAG,
3204                                         SmallVectorImpl<SDValue> &InVals)
3205                                           const {
3206   MachineFunction &MF = DAG.getMachineFunction();
3207   MachineFrameInfo *MFI = MF.getFrameInfo();
3208 
3209   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3210 
3211   // Assign locations to all of the incoming arguments.
3212   SmallVector<CCValAssign, 16> ArgLocs;
3213   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3214                     *DAG.getContext(), Prologue);
3215   CCInfo.AnalyzeFormalArguments(Ins,
3216                                 CCAssignFnForNode(CallConv, /* Return*/ false,
3217                                                   isVarArg));
3218 
3219   SmallVector<SDValue, 16> ArgValues;
3220   SDValue ArgValue;
3221   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
3222   unsigned CurArgIdx = 0;
3223 
3224   // Initially ArgRegsSaveSize is zero.
3225   // Then we increase this value each time we meet byval parameter.
3226   // We also increase this value in case of varargs function.
3227   AFI->setArgRegsSaveSize(0);
3228 
3229   // Calculate the amount of stack space that we need to allocate to store
3230   // byval and variadic arguments that are passed in registers.
3231   // We need to know this before we allocate the first byval or variadic
3232   // argument, as they will be allocated a stack slot below the CFA (Canonical
3233   // Frame Address, the stack pointer at entry to the function).
3234   unsigned ArgRegBegin = ARM::R4;
3235   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3236     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3237       break;
3238 
3239     CCValAssign &VA = ArgLocs[i];
3240     unsigned Index = VA.getValNo();
3241     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3242     if (!Flags.isByVal())
3243       continue;
3244 
3245     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3246     unsigned RBegin, REnd;
3247     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3248     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3249 
3250     CCInfo.nextInRegsParam();
3251   }
3252   CCInfo.rewindByValRegsInfo();
3253 
3254   int lastInsIndex = -1;
3255   if (isVarArg && MFI->hasVAStart()) {
3256     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3257     if (RegIdx != array_lengthof(GPRArgRegs))
3258       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3259   }
3260 
3261   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3262   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3263   auto PtrVT = getPointerTy(DAG.getDataLayout());
3264 
3265   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3266     CCValAssign &VA = ArgLocs[i];
3267     if (Ins[VA.getValNo()].isOrigArg()) {
3268       std::advance(CurOrigArg,
3269                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3270       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3271     }
3272     // Arguments stored in registers.
3273     if (VA.isRegLoc()) {
3274       EVT RegVT = VA.getLocVT();
3275 
3276       if (VA.needsCustom()) {
3277         // f64 and vector types are split up into multiple registers or
3278         // combinations of registers and stack slots.
3279         if (VA.getLocVT() == MVT::v2f64) {
3280           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3281                                                    Chain, DAG, dl);
3282           VA = ArgLocs[++i]; // skip ahead to next loc
3283           SDValue ArgValue2;
3284           if (VA.isMemLoc()) {
3285             int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
3286             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3287             ArgValue2 = DAG.getLoad(
3288                 MVT::f64, dl, Chain, FIN,
3289                 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3290                 false, false, false, 0);
3291           } else {
3292             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3293                                              Chain, DAG, dl);
3294           }
3295           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3296           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3297                                  ArgValue, ArgValue1,
3298                                  DAG.getIntPtrConstant(0, dl));
3299           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3300                                  ArgValue, ArgValue2,
3301                                  DAG.getIntPtrConstant(1, dl));
3302         } else
3303           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3304 
3305       } else {
3306         const TargetRegisterClass *RC;
3307 
3308         if (RegVT == MVT::f32)
3309           RC = &ARM::SPRRegClass;
3310         else if (RegVT == MVT::f64)
3311           RC = &ARM::DPRRegClass;
3312         else if (RegVT == MVT::v2f64)
3313           RC = &ARM::QPRRegClass;
3314         else if (RegVT == MVT::i32)
3315           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3316                                            : &ARM::GPRRegClass;
3317         else
3318           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3319 
3320         // Transform the arguments in physical registers into virtual ones.
3321         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3322         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3323       }
3324 
3325       // If this is an 8 or 16-bit value, it is really passed promoted
3326       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3327       // truncate to the right size.
3328       switch (VA.getLocInfo()) {
3329       default: llvm_unreachable("Unknown loc info!");
3330       case CCValAssign::Full: break;
3331       case CCValAssign::BCvt:
3332         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3333         break;
3334       case CCValAssign::SExt:
3335         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3336                                DAG.getValueType(VA.getValVT()));
3337         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3338         break;
3339       case CCValAssign::ZExt:
3340         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3341                                DAG.getValueType(VA.getValVT()));
3342         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3343         break;
3344       }
3345 
3346       InVals.push_back(ArgValue);
3347 
3348     } else { // VA.isRegLoc()
3349 
3350       // sanity check
3351       assert(VA.isMemLoc());
3352       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3353 
3354       int index = VA.getValNo();
3355 
3356       // Some Ins[] entries become multiple ArgLoc[] entries.
3357       // Process them only once.
3358       if (index != lastInsIndex)
3359         {
3360           ISD::ArgFlagsTy Flags = Ins[index].Flags;
3361           // FIXME: For now, all byval parameter objects are marked mutable.
3362           // This can be changed with more analysis.
3363           // In case of tail call optimization mark all arguments mutable.
3364           // Since they could be overwritten by lowering of arguments in case of
3365           // a tail call.
3366           if (Flags.isByVal()) {
3367             assert(Ins[index].isOrigArg() &&
3368                    "Byval arguments cannot be implicit");
3369             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3370 
3371             int FrameIndex = StoreByValRegs(
3372                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
3373                 VA.getLocMemOffset(), Flags.getByValSize());
3374             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3375             CCInfo.nextInRegsParam();
3376           } else {
3377             unsigned FIOffset = VA.getLocMemOffset();
3378             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3379                                             FIOffset, true);
3380 
3381             // Create load nodes to retrieve arguments from the stack.
3382             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3383             InVals.push_back(DAG.getLoad(
3384                 VA.getValVT(), dl, Chain, FIN,
3385                 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3386                 false, false, false, 0));
3387           }
3388           lastInsIndex = index;
3389         }
3390     }
3391   }
3392 
3393   // varargs
3394   if (isVarArg && MFI->hasVAStart())
3395     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3396                          CCInfo.getNextStackOffset(),
3397                          TotalArgRegsSaveSize);
3398 
3399   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
3400 
3401   return Chain;
3402 }
3403 
3404 /// isFloatingPointZero - Return true if this is +0.0.
3405 static bool isFloatingPointZero(SDValue Op) {
3406   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3407     return CFP->getValueAPF().isPosZero();
3408   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3409     // Maybe this has already been legalized into the constant pool?
3410     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3411       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3412       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3413         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3414           return CFP->getValueAPF().isPosZero();
3415     }
3416   } else if (Op->getOpcode() == ISD::BITCAST &&
3417              Op->getValueType(0) == MVT::f64) {
3418     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
3419     // created by LowerConstantFP().
3420     SDValue BitcastOp = Op->getOperand(0);
3421     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
3422         isNullConstant(BitcastOp->getOperand(0)))
3423       return true;
3424   }
3425   return false;
3426 }
3427 
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
SDValue
ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &ARMcc, SelectionDAG &DAG,
                             SDLoc dl) const {
  // If the RHS is a constant that cannot be encoded as a compare immediate,
  // try rewriting the comparison as an adjacent one (e.g. x < C becomes
  // x <= C-1) whose constant might be encodable.
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate(C)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <=>  x <= C-1  (and x >= C  <=>  x > C-1); invalid only when
        // C-1 would wrap below the signed 32-bit minimum (C == 0x80000000).
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        // Same C-1 adjustment for the unsigned orderings; invalid only when
        // C-1 would wrap below zero.
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        // x <= C  <=>  x < C+1  (and x > C  <=>  x >= C+1); invalid only when
        // C+1 would wrap past the signed 32-bit maximum (C == 0x7fffffff).
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        // Same C+1 adjustment for the unsigned orderings; invalid only when
        // C+1 would wrap past the unsigned 32-bit maximum.
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  }

  // Equality tests only need the Z flag, so they can use the cheaper CMPZ
  // node; everything else gets a full CMP.
  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}
3487 
3488 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3489 SDValue
3490 ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
3491                              SDLoc dl) const {
3492   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3493   SDValue Cmp;
3494   if (!isFloatingPointZero(RHS))
3495     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3496   else
3497     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3498   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3499 }
3500 
3501 /// duplicateCmp - Glue values can have only one use, so this function
3502 /// duplicates a comparison node.
3503 SDValue
3504 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3505   unsigned Opc = Cmp.getOpcode();
3506   SDLoc DL(Cmp);
3507   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3508     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3509 
3510   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3511   Cmp = Cmp.getOperand(0);
3512   Opc = Cmp.getOpcode();
3513   if (Opc == ARMISD::CMPFP)
3514     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3515   else {
3516     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3517     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3518   }
3519   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3520 }
3521 
3522 std::pair<SDValue, SDValue>
3523 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
3524                                  SDValue &ARMcc) const {
3525   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
3526 
3527   SDValue Value, OverflowCmp;
3528   SDValue LHS = Op.getOperand(0);
3529   SDValue RHS = Op.getOperand(1);
3530   SDLoc dl(Op);
3531 
3532   // FIXME: We are currently always generating CMPs because we don't support
3533   // generating CMN through the backend. This is not as good as the natural
3534   // CMP case because it causes a register dependency and cannot be folded
3535   // later.
3536 
3537   switch (Op.getOpcode()) {
3538   default:
3539     llvm_unreachable("Unknown overflow instruction!");
3540   case ISD::SADDO:
3541     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3542     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3543     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3544     break;
3545   case ISD::UADDO:
3546     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3547     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3548     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3549     break;
3550   case ISD::SSUBO:
3551     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3552     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3553     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3554     break;
3555   case ISD::USUBO:
3556     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3557     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3558     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3559     break;
3560   } // switch (...)
3561 
3562   return std::make_pair(Value, OverflowCmp);
3563 }
3564 
3565 
3566 SDValue
3567 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
3568   // Let legalize expand this if it isn't a legal type yet.
3569   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3570     return SDValue();
3571 
3572   SDValue Value, OverflowCmp;
3573   SDValue ARMcc;
3574   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
3575   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3576   SDLoc dl(Op);
3577   // We use 0 and 1 as false and true values.
3578   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3579   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3580   EVT VT = Op.getValueType();
3581 
3582   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
3583                                  ARMcc, CCR, OverflowCmp);
3584 
3585   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3586   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3587 }
3588 
3589 
/// LowerSELECT - Lower ISD::SELECT into ARMISD::CMOV when the condition can
/// be tied to a flag-producing node, otherwise fall back to a select_cc
/// against zero.
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  // If the condition is the overflow result (value #1) of an
  // {s,u}{add,sub}o node, select directly on the flags set by the
  // arithmetic instead of materializing the overflow bit first.
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    // Let legalize handle illegal types.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
      dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        // Inverted 0/1 pair: swap the select arms instead of inverting the
        // condition.
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        // A glue result has a single use, so the compare feeding the inner
        // CMOV must be duplicated to feed the new one.
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, dl, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, dl, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}
3658 
/// checkVSELConstraints - Map the floating-point setcc condition \p CC onto
/// one of the four condition codes the VSEL instruction can encode (GE, GT,
/// VS, EQ). \p swpCmpOps is set when the operands of the preceding compare
/// must be swapped, and \p swpVselOps when the VSEL's own operands must be
/// swapped, to keep the constrained condition equivalent to the original.
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands.
  if (CC == ISD::SETUNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}
3707 
3708 SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
3709                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
3710                                    SDValue Cmp, SelectionDAG &DAG) const {
3711   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
3712     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
3713                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
3714     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
3715                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
3716 
3717     SDValue TrueLow = TrueVal.getValue(0);
3718     SDValue TrueHigh = TrueVal.getValue(1);
3719     SDValue FalseLow = FalseVal.getValue(0);
3720     SDValue FalseHigh = FalseVal.getValue(1);
3721 
3722     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
3723                               ARMcc, CCR, Cmp);
3724     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
3725                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
3726 
3727     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
3728   } else {
3729     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
3730                        Cmp);
3731   }
3732 }
3733 
/// LowerSELECT_CC - Lower ISD::SELECT_CC into a compare plus conditional
/// move, preferring VSEL/VMAXNM/VMINNM-compatible condition codes on ARMv8.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  SDLoc dl(Op);

  // FP-only-SP subtargets cannot compare f64 directly; soften the compare
  // operands (typically into a libcall producing an i32).
  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
                                    TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        // Invert the condition and swap the select arms so the resulting
        // condition code lands in VSEL's encodable set.
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  // Floating-point compare: may require two condition codes (CondCode2) to
  // cover unordered cases.
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Try to generate VMAXNM/VMINNM on ARMv8.
  if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
                                  TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  // If a second condition code is needed, chain a second CMOV that replaces
  // the first result with TrueVal when CondCode2 holds.
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}
3813 
3814 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
3815 /// to morph to an integer compare sequence.
3816 static bool canChangeToInt(SDValue Op, bool &SeenZero,
3817                            const ARMSubtarget *Subtarget) {
3818   SDNode *N = Op.getNode();
3819   if (!N->hasOneUse())
3820     // Otherwise it requires moving the value from fp to integer registers.
3821     return false;
3822   if (!N->getNumValues())
3823     return false;
3824   EVT VT = Op.getValueType();
3825   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
3826     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
3827     // vmrs are very slow, e.g. cortex-a8.
3828     return false;
3829 
3830   if (isFloatingPointZero(Op)) {
3831     SeenZero = true;
3832     return true;
3833   }
3834   return ISD::isNormalLoad(N);
3835 }
3836 
3837 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
3838   if (isFloatingPointZero(Op))
3839     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
3840 
3841   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
3842     return DAG.getLoad(MVT::i32, SDLoc(Op),
3843                        Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
3844                        Ld->isVolatile(), Ld->isNonTemporal(),
3845                        Ld->isInvariant(), Ld->getAlignment());
3846 
3847   llvm_unreachable("Unknown VFP cmp argument!");
3848 }
3849 
3850 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
3851                            SDValue &RetVal1, SDValue &RetVal2) {
3852   SDLoc dl(Op);
3853 
3854   if (isFloatingPointZero(Op)) {
3855     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
3856     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
3857     return;
3858   }
3859 
3860   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
3861     SDValue Ptr = Ld->getBasePtr();
3862     RetVal1 = DAG.getLoad(MVT::i32, dl,
3863                           Ld->getChain(), Ptr,
3864                           Ld->getPointerInfo(),
3865                           Ld->isVolatile(), Ld->isNonTemporal(),
3866                           Ld->isInvariant(), Ld->getAlignment());
3867 
3868     EVT PtrType = Ptr.getValueType();
3869     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
3870     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
3871                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
3872     RetVal2 = DAG.getLoad(MVT::i32, dl,
3873                           Ld->getChain(), NewPtr,
3874                           Ld->getPointerInfo().getWithOffset(4),
3875                           Ld->isVolatile(), Ld->isNonTemporal(),
3876                           Ld->isInvariant(), NewAlign);
3877     return;
3878   }
3879 
3880   llvm_unreachable("Unknown VFP cmp argument!");
3881 }
3882 
/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  // BR_CC operands: chain, condition code, LHS, RHS, destination block.
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Both operands must be single-use loads or +0.0, and at least one side
  // must be zero for the rewrite to apply.
  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // 0x7fffffff masks off bit 31 of each word; for IEEE bit patterns this
    // drops the sign, so +0.0 and -0.0 compare equal.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      // f32: one masked i32 compare feeding a conditional branch.
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // f64: split each operand into two i32 words, mask the high words'
    // sign bits, and emit the dedicated 64-bit compare-and-branch node.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  // Not a candidate; let the caller emit a normal VFP branch.
  return SDValue();
}
3935 
/// LowerBR_CC - Lower ISD::BR_CC into compare + conditional branch, with a
/// second branch when the FP condition needs two ARM condition codes.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // FP-only-SP subtargets cannot compare f64 directly; soften the compare
  // operands first.
  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Integer compare: a single CMP + BRCOND suffices.
  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // With unsafe FP math, equality-style FP branches may be morphed into
  // cheaper integer compares.
  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  // Some FP conditions need a second branch on CondCode2, glued to the
  // first branch's flag result.
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}
3989 
3990 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3991   SDValue Chain = Op.getOperand(0);
3992   SDValue Table = Op.getOperand(1);
3993   SDValue Index = Op.getOperand(2);
3994   SDLoc dl(Op);
3995 
3996   EVT PTy = getPointerTy(DAG.getDataLayout());
3997   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
3998   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
3999   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
4000   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
4001   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
4002   if (Subtarget->isThumb2()) {
4003     // Thumb2 uses a two-level jump. That is, it jumps into the jump table
4004     // which does another jump to the destination. This also makes it easier
4005     // to translate it to TBB / TBH later.
4006     // FIXME: This might not work if the function is extremely large.
4007     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
4008                        Addr, Op.getOperand(2), JTI);
4009   }
4010   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
4011     Addr =
4012         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
4013                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
4014                     false, false, false, 0);
4015     Chain = Addr.getValue(1);
4016     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
4017     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4018   } else {
4019     Addr =
4020         DAG.getLoad(PTy, dl, Chain, Addr,
4021                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()),
4022                     false, false, false, 0);
4023     Chain = Addr.getValue(1);
4024     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4025   }
4026 }
4027 
4028 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
4029   EVT VT = Op.getValueType();
4030   SDLoc dl(Op);
4031 
4032   if (Op.getValueType().getVectorElementType() == MVT::i32) {
4033     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
4034       return Op;
4035     return DAG.UnrollVectorOp(Op.getNode());
4036   }
4037 
4038   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
4039          "Invalid type for custom lowering!");
4040   if (VT != MVT::v4i16)
4041     return DAG.UnrollVectorOp(Op.getNode());
4042 
4043   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
4044   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
4045 }
4046 
4047 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
4048   EVT VT = Op.getValueType();
4049   if (VT.isVector())
4050     return LowerVectorFP_TO_INT(Op, DAG);
4051   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4052     RTLIB::Libcall LC;
4053     if (Op.getOpcode() == ISD::FP_TO_SINT)
4054       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4055                               Op.getValueType());
4056     else
4057       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4058                               Op.getValueType());
4059     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4060                        /*isSigned*/ false, SDLoc(Op)).first;
4061   }
4062 
4063   return Op;
4064 }
4065 
4066 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4067   EVT VT = Op.getValueType();
4068   SDLoc dl(Op);
4069 
4070   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
4071     if (VT.getVectorElementType() == MVT::f32)
4072       return Op;
4073     return DAG.UnrollVectorOp(Op.getNode());
4074   }
4075 
4076   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
4077          "Invalid type for custom lowering!");
4078   if (VT != MVT::v4f32)
4079     return DAG.UnrollVectorOp(Op.getNode());
4080 
4081   unsigned CastOpc;
4082   unsigned Opc;
4083   switch (Op.getOpcode()) {
4084   default: llvm_unreachable("Invalid opcode!");
4085   case ISD::SINT_TO_FP:
4086     CastOpc = ISD::SIGN_EXTEND;
4087     Opc = ISD::SINT_TO_FP;
4088     break;
4089   case ISD::UINT_TO_FP:
4090     CastOpc = ISD::ZERO_EXTEND;
4091     Opc = ISD::UINT_TO_FP;
4092     break;
4093   }
4094 
4095   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
4096   return DAG.getNode(Opc, dl, VT, Op);
4097 }
4098 
4099 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4100   EVT VT = Op.getValueType();
4101   if (VT.isVector())
4102     return LowerVectorINT_TO_FP(Op, DAG);
4103   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4104     RTLIB::Libcall LC;
4105     if (Op.getOpcode() == ISD::SINT_TO_FP)
4106       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4107                               Op.getValueType());
4108     else
4109       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4110                               Op.getValueType());
4111     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4112                        /*isSigned*/ false, SDLoc(Op)).first;
4113   }
4114 
4115   return Op;
4116 }
4117 
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  // Tmp0 supplies the magnitude, Tmp1 supplies the sign bit.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // If the magnitude already came from the GPR bank (a bitcast or VMOVDRR),
  // prefer the integer sequence below over bouncing through NEON registers.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // Modified-immediate splat used here as a per-i32-lane sign-bit mask
    // (shifted into the high half below when the result is f64).
    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    // For f64, move the 32-bit mask into the top half of the 64-bit lane so
    // it lines up with the f64 sign bit.
    if (VT == MVT::f64)
      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    // Place the sign operand in a vector lane, shifting its sign bit to
    // match the mask when source and result widths differ.
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      // f64 sign into an f32 result: shift the f64 sign bit down into the
      // low 32-bit lane.
      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // Build an all-ones splat to form ~Mask, then select bitwise:
    // result = (Tmp1 & Mask) | (Tmp0 & ~Mask).
    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    // Move the combined value back to a scalar FP type.
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the high word of an f64 carries the sign bit.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
4199 
4200 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
4201   MachineFunction &MF = DAG.getMachineFunction();
4202   MachineFrameInfo *MFI = MF.getFrameInfo();
4203   MFI->setReturnAddressIsTaken(true);
4204 
4205   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
4206     return SDValue();
4207 
4208   EVT VT = Op.getValueType();
4209   SDLoc dl(Op);
4210   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4211   if (Depth) {
4212     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4213     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
4214     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
4215                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
4216                        MachinePointerInfo(), false, false, false, 0);
4217   }
4218 
4219   // Return LR, which contains the return address. Mark it an implicit live-in.
4220   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4221   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
4222 }
4223 
4224 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
4225   const ARMBaseRegisterInfo &ARI =
4226     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
4227   MachineFunction &MF = DAG.getMachineFunction();
4228   MachineFrameInfo *MFI = MF.getFrameInfo();
4229   MFI->setFrameAddressIsTaken(true);
4230 
4231   EVT VT = Op.getValueType();
4232   SDLoc dl(Op);  // FIXME probably not meaningful
4233   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4234   unsigned FrameReg = ARI.getFrameRegister(MF);
4235   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
4236   while (Depth--)
4237     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
4238                             MachinePointerInfo(),
4239                             false, false, false, 0);
4240   return FrameAddr;
4241 }
4242 
4243 // FIXME? Maybe this could be a TableGen attribute on some registers and
4244 // this table could be generated automatically from RegInfo.
4245 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
4246                                               SelectionDAG &DAG) const {
4247   unsigned Reg = StringSwitch<unsigned>(RegName)
4248                        .Case("sp", ARM::SP)
4249                        .Default(0);
4250   if (Reg)
4251     return Reg;
4252   report_fatal_error(Twine("Invalid register name \""
4253                               + StringRef(RegName)  + "\"."));
4254 }
4255 
4256 // Result is 64 bit value so split into two 32 bit values and return as a
4257 // pair of values.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
          && "ExpandREAD_REGISTER called for non-i64 type result.");

  // Re-emit the read with two legal i32 results (plus a chain) in place of
  // the illegal i64 result.
  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                             N->getOperand(0),
                             N->getOperand(1));

  // Pair the two i32 halves back into the i64 value the caller expects.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                    Read.getValue(1)));
  // NOTE(review): this pushes the *incoming* chain (Read.getOperand(0)) as
  // the replacement chain result rather than the new node's chain output
  // (Read.getValue(2)) — confirm this is intentional.
  Results.push_back(Read.getOperand(0));
}
4275 
4276 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
4277 /// When \p DstVT, the destination type of \p BC, is on the vector
4278 /// register bank and the source of bitcast, \p Op, operates on the same bank,
4279 /// it might be possible to combine them, such that everything stays on the
4280 /// vector register bank.
4281 /// \p return The node that would replace \p BT, if the combine
4282 /// is possible.
4283 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
4284                                                 SelectionDAG &DAG) {
4285   SDValue Op = BC->getOperand(0);
4286   EVT DstVT = BC->getValueType(0);
4287 
4288   // The only vector instruction that can produce a scalar (remember,
4289   // since the bitcast was about to be turned into VMOVDRR, the source
4290   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
4291   // Moreover, we can do this combine only if there is one use.
4292   // Finally, if the destination type is not a vector, there is not
4293   // much point on forcing everything on the vector bank.
4294   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4295       !Op.hasOneUse())
4296     return SDValue();
4297 
4298   // If the index is not constant, we will introduce an additional
4299   // multiply that will stick.
4300   // Give up in that case.
4301   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
4302   if (!Index)
4303     return SDValue();
4304   unsigned DstNumElt = DstVT.getVectorNumElements();
4305 
4306   // Compute the new index.
4307   const APInt &APIntIndex = Index->getAPIntValue();
4308   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
4309   NewIndex *= APIntIndex;
4310   // Check if the new constant index fits into i32.
4311   if (NewIndex.getBitWidth() > 32)
4312     return SDValue();
4313 
4314   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
4315   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
4316   SDLoc dl(Op);
4317   SDValue ExtractSrc = Op.getOperand(0);
4318   EVT VecVT = EVT::getVectorVT(
4319       *DAG.getContext(), DstVT.getScalarType(),
4320       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
4321   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
4322   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
4323                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
4324 }
4325 
4326 /// ExpandBITCAST - If the target supports VFP, this function is called to
4327 /// expand a bit convert where either the source or destination type is i64 to
4328 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
4329 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4330 /// vectors), since the legalizer won't know what to do with that.
4331 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4332   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4333   SDLoc dl(N);
4334   SDValue Op = N->getOperand(0);
4335 
4336   // This function is only supposed to be called for i64 types, either as the
4337   // source or destination of the bit convert.
4338   EVT SrcVT = Op.getValueType();
4339   EVT DstVT = N->getValueType(0);
4340   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4341          "ExpandBITCAST called for non-i64 type");
4342 
4343   // Turn i64->f64 into VMOVDRR.
4344   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4345     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
4346     // if we can combine the bitcast with its source.
4347     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
4348       return Val;
4349 
4350     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4351                              DAG.getConstant(0, dl, MVT::i32));
4352     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4353                              DAG.getConstant(1, dl, MVT::i32));
4354     return DAG.getNode(ISD::BITCAST, dl, DstVT,
4355                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
4356   }
4357 
4358   // Turn f64->i64 into VMOVRRD.
4359   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
4360     SDValue Cvt;
4361     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
4362         SrcVT.getVectorNumElements() > 1)
4363       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4364                         DAG.getVTList(MVT::i32, MVT::i32),
4365                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
4366     else
4367       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4368                         DAG.getVTList(MVT::i32, MVT::i32), Op);
4369     // Merge the pieces into a single i64 value.
4370     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
4371   }
4372 
4373   return SDValue();
4374 }
4375 
4376 /// getZeroVector - Returns a vector of specified type with all zero elements.
4377 /// Zero vectors are used to represent vector negation and in those cases
4378 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
4379 /// not support i64 elements, so sometimes the zero vectors will need to be
4380 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
4381 /// zero vector.
4382 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
4383   assert(VT.isVector() && "Expected a vector type");
4384   // The canonical modified immediate encoding of a zero vector is....0!
4385   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
4386   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
4387   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
4388   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4389 }
4390 
4391 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
4392 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  // Arithmetic variant keeps the sign when shifting the high word.
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // Low result: for ShAmt < VTBits it is (Lo >> ShAmt) | (Hi << (VTBits -
  // ShAmt)); for ShAmt >= VTBits it is Hi shifted by (ShAmt - VTBits).
  // Compute both candidates and select with a CMOV on ExtraShAmt >= 0.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

  // High result is simply the high word shifted by the full amount.
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                          ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
                           CCR, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
4426 
4427 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4428 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // High result: for ShAmt < VTBits it is (Hi << ShAmt) | (Lo >> (VTBits -
  // ShAmt)); for ShAmt >= VTBits it is Lo << (ShAmt - VTBits). Compute both
  // candidates and select with a CMOV on ExtraShAmt >= 0.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

  // Low result is simply the low word shifted by the full amount.
  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                          ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
                           CCR, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
4460 
4461 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4462                                             SelectionDAG &DAG) const {
4463   // The rounding mode is in bits 23:22 of the FPSCR.
4464   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4465   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
4466   // so that the shift + and get folded into a bitfield extract.
4467   SDLoc dl(Op);
4468   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
4469                               DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
4470                                               MVT::i32));
4471   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
4472                                   DAG.getConstant(1U << 22, dl, MVT::i32));
4473   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4474                               DAG.getConstant(22, dl, MVT::i32));
4475   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4476                      DAG.getConstant(3, dl, MVT::i32));
4477 }
4478 
/// LowerCTTZ - Lower CTTZ / CTTZ_ZERO_UNDEF. Vector types use NEON bit
/// tricks built around LSB = X & -X; scalars use RBIT + CLZ (v6T2+).
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector()) {
    assert(ST->hasNEON());

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      // (valid only under ZERO_UNDEF, where x == 0 is undefined input).
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Since we can only compute the number of bits in a byte with vcnt.8, we
    // have to gather the result with pairwise addition (vpaddl) for i16, i32,
    // and i64.

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // 0x1eff appears to be the pre-built NEON modified-immediate encoding
      // of an all-ones splat (TODO confirm against createNEONModImm); adding
      // ~0 computes LSB - 1 without needing an i64 splat of 1.
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }

    // Count #bits with vcnt.8.
    EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
    SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
    SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);

    // Gather the #bits with vpaddl (pairwise add.)
    EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
    SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt8);
    if (ElemTy == MVT::i16)
      return Cnt16;

    // Widen once more for i32 elements.
    EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
    SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt16);
    if (ElemTy == MVT::i32)
      return Cnt32;

    // Final pairwise add for i64 elements.
    assert(ElemTy == MVT::i64);
    SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt32);
    return Cnt64;
  }

  // Scalar path: cttz(x) = ctlz(bitreverse(x)); requires the v6T2 RBIT
  // instruction.
  if (!ST->hasV6T2Ops())
    return SDValue();

  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
4564 
4565 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
4566 /// for each 16-bit element from operand, repeated.  The basic idea is to
4567 /// leverage vcnt to get the 8-bit counts, gather and add the results.
4568 ///
4569 /// Trace for v4i16:
4570 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
4571 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
4572 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
4573 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
4574 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
4575 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
4576 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
4577 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
4578 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
4579   EVT VT = N->getValueType(0);
4580   SDLoc DL(N);
4581 
4582   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
4583   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
4584   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
4585   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
4586   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
4587   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
4588 }
4589 
4590 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
4591 /// bit-count for each 16-bit element from the operand.  We need slightly
4592 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
4593 /// 64/128-bit registers.
4594 ///
4595 /// Trace for v4i16:
4596 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
4597 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
4598 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
4599 /// v4i16:Extracted = [k0    k1    k2    k3    ]
4600 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
4601   EVT VT = N->getValueType(0);
4602   SDLoc DL(N);
4603 
4604   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
4605   if (VT.is64BitVector()) {
4606     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
4607     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
4608                        DAG.getIntPtrConstant(0, DL));
4609   } else {
4610     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
4611                                     BitCounts, DAG.getIntPtrConstant(0, DL));
4612     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
4613   }
4614 }
4615 
4616 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
4617 /// bit-count for each 32-bit element from the operand.  The idea here is
4618 /// to split the vector into 16-bit elements, leverage the 16-bit count
4619 /// routine, and then combine the results.
4620 ///
4621 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
4622 /// input    = [v0    v1    ] (vi: 32-bit elements)
4623 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
4624 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
4625 /// vrev: N0 = [k1 k0 k3 k2 ]
4626 ///            [k0 k1 k2 k3 ]
4627 ///       N1 =+[k1 k0 k3 k2 ]
4628 ///            [k0 k2 k1 k3 ]
4629 ///       N2 =+[k1 k3 k0 k2 ]
4630 ///            [k0    k2    k1    k3    ]
4631 /// Extended =+[k1    k3    k0    k2    ]
4632 ///            [k0    k2    ]
4633 /// Extracted=+[k1    k3    ]
4634 ///
4635 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
4636   EVT VT = N->getValueType(0);
4637   SDLoc DL(N);
4638 
4639   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
4640 
4641   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
4642   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
4643   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
4644   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
4645   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
4646 
4647   if (VT.is64BitVector()) {
4648     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
4649     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
4650                        DAG.getIntPtrConstant(0, DL));
4651   } else {
4652     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
4653                                     DAG.getIntPtrConstant(0, DL));
4654     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
4655   }
4656 }
4657 
4658 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
4659                           const ARMSubtarget *ST) {
4660   EVT VT = N->getValueType(0);
4661 
4662   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
4663   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
4664           VT == MVT::v4i16 || VT == MVT::v8i16) &&
4665          "Unexpected type for custom ctpop lowering");
4666 
4667   if (VT.getVectorElementType() == MVT::i32)
4668     return lowerCTPOP32BitElements(N, DAG);
4669   else
4670     return lowerCTPOP16BitElements(N, DAG);
4671 }
4672 
4673 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
4674                           const ARMSubtarget *ST) {
4675   EVT VT = N->getValueType(0);
4676   SDLoc dl(N);
4677 
4678   if (!VT.isVector())
4679     return SDValue();
4680 
4681   // Lower vector shifts on NEON to use VSHL.
4682   assert(ST->hasNEON() && "unexpected vector shift");
4683 
4684   // Left shifts translate directly to the vshiftu intrinsic.
4685   if (N->getOpcode() == ISD::SHL)
4686     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4687                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
4688                                        MVT::i32),
4689                        N->getOperand(0), N->getOperand(1));
4690 
4691   assert((N->getOpcode() == ISD::SRA ||
4692           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
4693 
4694   // NEON uses the same intrinsics for both left and right shifts.  For
4695   // right shifts, the shift amounts are negative, so negate the vector of
4696   // shift amounts.
4697   EVT ShiftVT = N->getOperand(1).getValueType();
4698   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
4699                                      getZeroVector(ShiftVT, DAG, dl),
4700                                      N->getOperand(1));
4701   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
4702                              Intrinsic::arm_neon_vshifts :
4703                              Intrinsic::arm_neon_vshiftu);
4704   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
4705                      DAG.getConstant(vshiftInt, dl, MVT::i32),
4706                      N->getOperand(0), NegatedCount);
4707 }
4708 
4709 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
4710                                 const ARMSubtarget *ST) {
4711   EVT VT = N->getValueType(0);
4712   SDLoc dl(N);
4713 
4714   // We can get here for a node like i32 = ISD::SHL i32, i64
4715   if (VT != MVT::i64)
4716     return SDValue();
4717 
4718   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
4719          "Unknown shift to lower!");
4720 
4721   // We only lower SRA, SRL of 1 here, all others use generic lowering.
4722   if (!isOneConstant(N->getOperand(1)))
4723     return SDValue();
4724 
4725   // If we are in thumb mode, we don't have RRX.
4726   if (ST->isThumb1Only()) return SDValue();
4727 
4728   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
4729   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4730                            DAG.getConstant(0, dl, MVT::i32));
4731   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
4732                            DAG.getConstant(1, dl, MVT::i32));
4733 
4734   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
4735   // captures the result into a carry flag.
4736   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
4737   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
4738 
4739   // The low part is an ARMISD::RRX operand, which shifts the carry in.
4740   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
4741 
4742   // Merge the pieces into a single i64 value.
4743  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
4744 }
4745 
/// LowerVSETCC - Lower a vector SETCC to the NEON vector-compare nodes
/// (VCEQ/VCGE/VCGT and their unsigned and compare-against-zero variants).
/// NEON only implements a subset of the IR condition codes directly; the
/// remainder are synthesized by swapping the operands (Swap) and/or
/// bitwise-inverting the all-ones/all-zeros result (Invert).
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  SDValue TmpOp0, TmpOp1;
  bool Invert = false;
  bool Swap = false;
  unsigned Opc = 0;

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  // Perform the comparison in an integer vector type of the same shape as
  // the operands; the result is adjusted to the requested type at the end.
  EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDLoc dl(Op);

  if (CmpVT.getVectorElementType() == MVT::i64)
    // 64-bit comparisons are not legal. We've marked SETCC as non-Custom,
    // but it's possible that our operands are 64-bit but our result is 32-bit.
    // Bail in this case.
    return SDValue();

  if (Op1.getValueType().isFloatingPoint()) {
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal FP comparison");
    case ISD::SETUNE:
    case ISD::SETNE:  Invert = true; // Fallthrough
    case ISD::SETOEQ:
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETOLT:
    case ISD::SETLT: Swap = true; // Fallthrough
    case ISD::SETOGT:
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETOLE:
    case ISD::SETLE:  Swap = true; // Fallthrough
    case ISD::SETOGE:
    case ISD::SETGE: Opc = ARMISD::VCGE; break;
    // Unordered comparisons are built as the inverse of the complementary
    // ordered comparison, e.g. ULE == !OGT.
    case ISD::SETUGE: Swap = true; // Fallthrough
    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
    case ISD::SETUGT: Swap = true; // Fallthrough
    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
    case ISD::SETUEQ: Invert = true; // Fallthrough
    case ISD::SETONE:
      // Expand this to (OLT | OGT).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
      break;
    case ISD::SETUO: Invert = true; // Fallthrough
    case ISD::SETO:
      // Expand this to (OLT | OGE).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
      break;
    }
  } else {
    // Integer comparisons.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal integer comparison");
    case ISD::SETNE:  Invert = true; // Fallthrough
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETLT:  Swap = true; // Fallthrough
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETLE:  Swap = true; // Fallthrough
    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
    case ISD::SETULT: Swap = true; // Fallthrough
    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
    case ISD::SETULE: Swap = true; // Fallthrough
    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
    }

    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
    if (Opc == ARMISD::VCEQ) {

      // AndOp stays null (getNode() == nullptr) unless one side is a
      // constant-zero vector.
      SDValue AndOp;
      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
        AndOp = Op0;
      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
        AndOp = Op1;

      // Ignore bitconvert.
      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
        AndOp = AndOp.getOperand(0);

      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
        Opc = ARMISD::VTST;
        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
        // VTST sets lanes where (a & b) != 0, i.e. the NE sense, so the
        // original EQ-vs-NE inversion flips.
        Invert = !Invert;
      }
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    // Zero is on the left: x is on the right, so "0 >= x" becomes "x <= 0"
    // and "0 > x" becomes "x < 0".
    if (Opc == ARMISD::VCGE)
      Opc = ARMISD::VCLEZ;
    else if (Opc == ARMISD::VCGT)
      Opc = ARMISD::VCLTZ;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    // VCLEZ/VCLTZ were produced by the fold just above; the other cases map
    // the two-operand opcode to its compare-against-zero form.  Opcodes with
    // no such form (e.g. the unsigned compares) fall through to the generic
    // two-operand node.
    switch (Opc) {
    case ARMISD::VCEQ:
      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCGE:
      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCLEZ:
      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCGT:
      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCLTZ:
      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
    default:
      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
    }
  } else {
     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
  }

  // Adjust the lane-wise all-ones/all-zeros result to the requested type.
  Result = DAG.getSExtOrTrunc(Result, dl, VT);

  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}
4885 
/// LowerSETCCE - Lower a SETCCE node (an integer comparison that also
/// consumes a carry produced by an earlier operation) to an ARM
/// subtract-with-carry whose flags feed a conditional move of 0/1.
static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2); // Incoming carry value.
  SDValue Cond = Op.getOperand(3);  // Condition code to test.
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");

  // Legalization is expected to have folded away the trivial-carry case.
  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
  // SUBE yields the subtraction result in value #0 and the flags in value #1.
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  // Materialize the boolean result as CMOV(0, 1) predicated on the ARM
  // condition code, with the flags explicitly copied into CPSR.
  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  // NOTE(review): the CPSR copy is chained to the entry node rather than to
  // an incoming chain operand — presumably intentional since SETCCE carries
  // no chain; confirm against other flag-copy sites in this file.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}
4909 
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON instruction with a "modified immediate"
/// operand (e.g., VMOV).  If so, return the encoded value.
///
/// \param SplatBits    the splatted value, right-justified in a uint64_t.
/// \param SplatUndef   bits of the splat that came from undef lanes; these
///                     may be treated as either 0 or 1, whichever allows an
///                     encoding.
/// \param SplatBitSize the width of the splatted element in bits.
/// \param VT           [out] on success, the vector type the encoding is for.
/// \param is128Bits    selects a 128-bit (Q) versus 64-bit (D) vector type.
/// \param type         instruction class (VMOV/VMVN/other); some cmode
///                     encodings are only valid for a subset of instructions.
/// \returns a target constant holding the combined OpCmode/Imm encoding, or
///          SDValue() if no modified-immediate encoding exists.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 SDLoc dl, EVT &VT, bool is128Bits,
                                 NEONModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8.  However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK.  Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    // The "nnff" forms below may also use undef bits as the required 0xff
    // filler, hence the (SplatBits | SplatUndef) test.
    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32.  A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat.  But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Each immediate bit selects a whole byte; undef bytes are allowed to
    // become 0xff, but a byte that is partially set cannot be encoded.
    uint64_t BitMask = 0xff;
    uint64_t Val = 0;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Val |= BitMask;
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    if (DAG.getDataLayout().isBigEndian())
      // swap higher and lower 32 bit word
      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isNEONModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
5048 
/// LowerConstantFP - Lower a floating-point constant, preferring register
/// materialization (VMOV.f32/f64 immediate, or a NEON VMOV/VMVN immediate
/// splat with an extract) over a constant-pool load.  Returns SDValue() to
/// fall back to the default (constant pool) lowering.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  // All immediate forms below require at least VFP3.
  if (!ST->hasVFP3())
    return SDValue();

  bool IsDouble = Op.getValueType() == MVT::f64;
  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);

  // Use the default (constant pool) lowering for double constants when we have
  // an SP-only FPU
  // NOTE(review): this line uses the Subtarget member while the rest of the
  // function uses the ST parameter — presumably they are the same subtarget;
  // confirm and unify.
  if (IsDouble && Subtarget->isFPOnlySP())
    return SDValue();

  // Try splatting with a VMOV.f32...
  // getFP32Imm/getFP64Imm return the 8-bit VFP immediate encoding, or -1 if
  // the value is not representable.
  APFloat FPVal = CFP->getValueAPF();
  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);

  if (ImmVal != -1) {
    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
      // We have code in place to select a valid ConstantFP already, no need to
      // do any mangling.
      return Op;
    }

    // It's a float and we are trying to use NEON operations where
    // possible. Lower it to a splat followed by an extract.
    SDLoc DL(Op);
    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // The rest of our options are NEON only, make sure that's allowed before
  // proceeding..
  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
    return SDValue();

  EVT VMovVT;
  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();

  // It wouldn't really be worth bothering for doubles except for one very
  // important value, which does happen to match: 0.0. So make sure we don't do
  // anything stupid.
  // (The NEON immediate below only encodes a 32-bit splat, so a double must
  // have identical high and low words.)
  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
    return SDValue();

  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32
  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  return SDValue();
}
5133 
5134 // check if an VEXT instruction can handle the shuffle mask when the
5135 // vector sources of the shuffle are the same.
5136 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5137   unsigned NumElts = VT.getVectorNumElements();
5138 
5139   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5140   if (M[0] < 0)
5141     return false;
5142 
5143   Imm = M[0];
5144 
5145   // If this is a VEXT shuffle, the immediate value is the index of the first
5146   // element.  The other shuffle indices must be the successive elements after
5147   // the first one.
5148   unsigned ExpectedElt = Imm;
5149   for (unsigned i = 1; i < NumElts; ++i) {
5150     // Increment the expected index.  If it wraps around, just follow it
5151     // back to index zero and keep going.
5152     ++ExpectedElt;
5153     if (ExpectedElt == NumElts)
5154       ExpectedElt = 0;
5155 
5156     if (M[i] < 0) continue; // ignore UNDEF indices
5157     if (ExpectedElt != static_cast<unsigned>(M[i]))
5158       return false;
5159   }
5160 
5161   return true;
5162 }
5163 
5164 
5165 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
5166                        bool &ReverseVEXT, unsigned &Imm) {
5167   unsigned NumElts = VT.getVectorNumElements();
5168   ReverseVEXT = false;
5169 
5170   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5171   if (M[0] < 0)
5172     return false;
5173 
5174   Imm = M[0];
5175 
5176   // If this is a VEXT shuffle, the immediate value is the index of the first
5177   // element.  The other shuffle indices must be the successive elements after
5178   // the first one.
5179   unsigned ExpectedElt = Imm;
5180   for (unsigned i = 1; i < NumElts; ++i) {
5181     // Increment the expected index.  If it wraps around, it may still be
5182     // a VEXT but the source vectors must be swapped.
5183     ExpectedElt += 1;
5184     if (ExpectedElt == NumElts * 2) {
5185       ExpectedElt = 0;
5186       ReverseVEXT = true;
5187     }
5188 
5189     if (M[i] < 0) continue; // ignore UNDEF indices
5190     if (ExpectedElt != static_cast<unsigned>(M[i]))
5191       return false;
5192   }
5193 
5194   // Adjust the index value if the source operands will be swapped.
5195   if (ReverseVEXT)
5196     Imm -= NumElts;
5197 
5198   return true;
5199 }
5200 
5201 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
5202 /// instruction with the specified blocksize.  (The order of the elements
5203 /// within each block of the vector is reversed.)
5204 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5205   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
5206          "Only possible block sizes for VREV are: 16, 32, 64");
5207 
5208   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5209   if (EltSz == 64)
5210     return false;
5211 
5212   unsigned NumElts = VT.getVectorNumElements();
5213   unsigned BlockElts = M[0] + 1;
5214   // If the first shuffle index is UNDEF, be optimistic.
5215   if (M[0] < 0)
5216     BlockElts = BlockSize / EltSz;
5217 
5218   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5219     return false;
5220 
5221   for (unsigned i = 0; i < NumElts; ++i) {
5222     if (M[i] < 0) continue; // ignore UNDEF indices
5223     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5224       return false;
5225   }
5226 
5227   return true;
5228 }
5229 
5230 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5231   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5232   // range, then 0 is placed into the resulting vector. So pretty much any mask
5233   // of 8 elements can work here.
5234   return VT == MVT::v8i8 && M.size() == 8;
5235 }
5236 
5237 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5238 // checking that pairs of elements in the shuffle mask represent the same index
5239 // in each vector, incrementing the expected index by 2 at each step.
5240 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5241 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5242 //  v2={e,f,g,h}
5243 // WhichResult gives the offset for each element in the mask based on which
5244 // of the two results it belongs to.
5245 //
5246 // The transpose can be represented either as:
5247 // result1 = shufflevector v1, v2, result1_shuffle_mask
5248 // result2 = shufflevector v1, v2, result2_shuffle_mask
5249 // where v1/v2 and the shuffle masks have the same number of elements
5250 // (here WhichResult (see below) indicates which result is being checked)
5251 //
5252 // or as:
5253 // results = shufflevector v1, v2, shuffle_mask
5254 // where both results are returned in one vector and the shuffle mask has twice
5255 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
5256 // want to check the low half and high half of the shuffle mask as if it were
5257 // the other case
5258 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5259   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5260   if (EltSz == 64)
5261     return false;
5262 
5263   unsigned NumElts = VT.getVectorNumElements();
5264   if (M.size() != NumElts && M.size() != NumElts*2)
5265     return false;
5266 
5267   // If the mask is twice as long as the input vector then we need to check the
5268   // upper and lower parts of the mask with a matching value for WhichResult
5269   // FIXME: A mask with only even values will be rejected in case the first
5270   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
5271   // M[0] is used to determine WhichResult
5272   for (unsigned i = 0; i < M.size(); i += NumElts) {
5273     if (M.size() == NumElts * 2)
5274       WhichResult = i / NumElts;
5275     else
5276       WhichResult = M[i] == 0 ? 0 : 1;
5277     for (unsigned j = 0; j < NumElts; j += 2) {
5278       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5279           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
5280         return false;
5281     }
5282   }
5283 
5284   if (M.size() == NumElts*2)
5285     WhichResult = 0;
5286 
5287   return true;
5288 }
5289 
5290 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
5291 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5292 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
5293 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5294   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5295   if (EltSz == 64)
5296     return false;
5297 
5298   unsigned NumElts = VT.getVectorNumElements();
5299   if (M.size() != NumElts && M.size() != NumElts*2)
5300     return false;
5301 
5302   for (unsigned i = 0; i < M.size(); i += NumElts) {
5303     if (M.size() == NumElts * 2)
5304       WhichResult = i / NumElts;
5305     else
5306       WhichResult = M[i] == 0 ? 0 : 1;
5307     for (unsigned j = 0; j < NumElts; j += 2) {
5308       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5309           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
5310         return false;
5311     }
5312   }
5313 
5314   if (M.size() == NumElts*2)
5315     WhichResult = 0;
5316 
5317   return true;
5318 }
5319 
5320 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
5321 // that the mask elements are either all even and in steps of size 2 or all odd
5322 // and in steps of size 2.
5323 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
5324 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
5325 //  v2={e,f,g,h}
5326 // Requires similar checks to that of isVTRNMask with
5327 // respect the how results are returned.
5328 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5329   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5330   if (EltSz == 64)
5331     return false;
5332 
5333   unsigned NumElts = VT.getVectorNumElements();
5334   if (M.size() != NumElts && M.size() != NumElts*2)
5335     return false;
5336 
5337   for (unsigned i = 0; i < M.size(); i += NumElts) {
5338     WhichResult = M[i] == 0 ? 0 : 1;
5339     for (unsigned j = 0; j < NumElts; ++j) {
5340       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
5341         return false;
5342     }
5343   }
5344 
5345   if (M.size() == NumElts*2)
5346     WhichResult = 0;
5347 
5348   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5349   if (VT.is64BitVector() && EltSz == 32)
5350     return false;
5351 
5352   return true;
5353 }
5354 
5355 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
5356 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5357 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
5358 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5359   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5360   if (EltSz == 64)
5361     return false;
5362 
5363   unsigned NumElts = VT.getVectorNumElements();
5364   if (M.size() != NumElts && M.size() != NumElts*2)
5365     return false;
5366 
5367   unsigned Half = NumElts / 2;
5368   for (unsigned i = 0; i < M.size(); i += NumElts) {
5369     WhichResult = M[i] == 0 ? 0 : 1;
5370     for (unsigned j = 0; j < NumElts; j += Half) {
5371       unsigned Idx = WhichResult;
5372       for (unsigned k = 0; k < Half; ++k) {
5373         int MIdx = M[i + j + k];
5374         if (MIdx >= 0 && (unsigned) MIdx != Idx)
5375           return false;
5376         Idx += 2;
5377       }
5378     }
5379   }
5380 
5381   if (M.size() == NumElts*2)
5382     WhichResult = 0;
5383 
5384   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5385   if (VT.is64BitVector() && EltSz == 32)
5386     return false;
5387 
5388   return true;
5389 }
5390 
5391 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
5392 // that pairs of elements of the shufflemask represent the same index in each
5393 // vector incrementing sequentially through the vectors.
5394 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
5395 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
5396 //  v2={e,f,g,h}
5397 // Requires similar checks to that of isVTRNMask with respect the how results
5398 // are returned.
5399 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5400   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5401   if (EltSz == 64)
5402     return false;
5403 
5404   unsigned NumElts = VT.getVectorNumElements();
5405   if (M.size() != NumElts && M.size() != NumElts*2)
5406     return false;
5407 
5408   for (unsigned i = 0; i < M.size(); i += NumElts) {
5409     WhichResult = M[i] == 0 ? 0 : 1;
5410     unsigned Idx = WhichResult * NumElts / 2;
5411     for (unsigned j = 0; j < NumElts; j += 2) {
5412       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5413           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
5414         return false;
5415       Idx += 1;
5416     }
5417   }
5418 
5419   if (M.size() == NumElts*2)
5420     WhichResult = 0;
5421 
5422   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5423   if (VT.is64BitVector() && EltSz == 32)
5424     return false;
5425 
5426   return true;
5427 }
5428 
5429 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
5430 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5431 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5432 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5433   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
5434   if (EltSz == 64)
5435     return false;
5436 
5437   unsigned NumElts = VT.getVectorNumElements();
5438   if (M.size() != NumElts && M.size() != NumElts*2)
5439     return false;
5440 
5441   for (unsigned i = 0; i < M.size(); i += NumElts) {
5442     WhichResult = M[i] == 0 ? 0 : 1;
5443     unsigned Idx = WhichResult * NumElts / 2;
5444     for (unsigned j = 0; j < NumElts; j += 2) {
5445       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5446           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
5447         return false;
5448       Idx += 1;
5449     }
5450   }
5451 
5452   if (M.size() == NumElts*2)
5453     WhichResult = 0;
5454 
5455   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5456   if (VT.is64BitVector() && EltSz == 32)
5457     return false;
5458 
5459   return true;
5460 }
5461 
5462 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
5463 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
5464 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
5465                                            unsigned &WhichResult,
5466                                            bool &isV_UNDEF) {
5467   isV_UNDEF = false;
5468   if (isVTRNMask(ShuffleMask, VT, WhichResult))
5469     return ARMISD::VTRN;
5470   if (isVUZPMask(ShuffleMask, VT, WhichResult))
5471     return ARMISD::VUZP;
5472   if (isVZIPMask(ShuffleMask, VT, WhichResult))
5473     return ARMISD::VZIP;
5474 
5475   isV_UNDEF = true;
5476   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5477     return ARMISD::VTRN;
5478   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5479     return ARMISD::VUZP;
5480   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5481     return ARMISD::VZIP;
5482 
5483   return 0;
5484 }
5485 
5486 /// \return true if this is a reverse operation on an vector.
5487 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
5488   unsigned NumElts = VT.getVectorNumElements();
5489   // Make sure the mask has the right size.
5490   if (NumElts != M.size())
5491       return false;
5492 
5493   // Look for <15, ..., 3, -1, 1, 0>.
5494   for (unsigned i = 0; i != NumElts; ++i)
5495     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
5496       return false;
5497 
5498   return true;
5499 }
5500 
5501 // If N is an integer constant that can be moved into a register in one
5502 // instruction, return an SDValue of such a constant (will become a MOV
5503 // instruction).  Otherwise return null.
5504 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
5505                                      const ARMSubtarget *ST, SDLoc dl) {
5506   uint64_t Val;
5507   if (!isa<ConstantSDNode>(N))
5508     return SDValue();
5509   Val = cast<ConstantSDNode>(N)->getZExtValue();
5510 
5511   if (ST->isThumb1Only()) {
5512     if (Val <= 255 || ~Val <= 255)
5513       return DAG.getConstant(Val, dl, MVT::i32);
5514   } else {
5515     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
5516       return DAG.getConstant(Val, dl, MVT::i32);
5517   }
5518   return SDValue();
5519 }
5520 
5521 // If this is a case we can't handle, return null and let the default
5522 // expansion code take care of it.
5523 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
5524                                              const ARMSubtarget *ST) const {
5525   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
5526   SDLoc dl(Op);
5527   EVT VT = Op.getValueType();
5528 
5529   APInt SplatBits, SplatUndef;
5530   unsigned SplatBitSize;
5531   bool HasAnyUndefs;
5532   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
5533     if (SplatBitSize <= 64) {
5534       // Check if an immediate VMOV works.
5535       EVT VmovVT;
5536       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
5537                                       SplatUndef.getZExtValue(), SplatBitSize,
5538                                       DAG, dl, VmovVT, VT.is128BitVector(),
5539                                       VMOVModImm);
5540       if (Val.getNode()) {
5541         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
5542         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5543       }
5544 
5545       // Try an immediate VMVN.
5546       uint64_t NegatedImm = (~SplatBits).getZExtValue();
5547       Val = isNEONModifiedImm(NegatedImm,
5548                                       SplatUndef.getZExtValue(), SplatBitSize,
5549                                       DAG, dl, VmovVT, VT.is128BitVector(),
5550                                       VMVNModImm);
5551       if (Val.getNode()) {
5552         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
5553         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5554       }
5555 
5556       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
5557       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
5558         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
5559         if (ImmVal != -1) {
5560           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
5561           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
5562         }
5563       }
5564     }
5565   }
5566 
5567   // Scan through the operands to see if only one value is used.
5568   //
5569   // As an optimisation, even if more than one value is used it may be more
5570   // profitable to splat with one value then change some lanes.
5571   //
5572   // Heuristically we decide to do this if the vector has a "dominant" value,
5573   // defined as splatted to more than half of the lanes.
5574   unsigned NumElts = VT.getVectorNumElements();
5575   bool isOnlyLowElement = true;
5576   bool usesOnlyOneValue = true;
5577   bool hasDominantValue = false;
5578   bool isConstant = true;
5579 
5580   // Map of the number of times a particular SDValue appears in the
5581   // element list.
5582   DenseMap<SDValue, unsigned> ValueCounts;
5583   SDValue Value;
5584   for (unsigned i = 0; i < NumElts; ++i) {
5585     SDValue V = Op.getOperand(i);
5586     if (V.isUndef())
5587       continue;
5588     if (i > 0)
5589       isOnlyLowElement = false;
5590     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
5591       isConstant = false;
5592 
5593     ValueCounts.insert(std::make_pair(V, 0));
5594     unsigned &Count = ValueCounts[V];
5595 
5596     // Is this value dominant? (takes up more than half of the lanes)
5597     if (++Count > (NumElts / 2)) {
5598       hasDominantValue = true;
5599       Value = V;
5600     }
5601   }
5602   if (ValueCounts.size() != 1)
5603     usesOnlyOneValue = false;
5604   if (!Value.getNode() && ValueCounts.size() > 0)
5605     Value = ValueCounts.begin()->first;
5606 
5607   if (ValueCounts.size() == 0)
5608     return DAG.getUNDEF(VT);
5609 
5610   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
5611   // Keep going if we are hitting this case.
5612   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
5613     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
5614 
5615   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5616 
5617   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
5618   // i32 and try again.
5619   if (hasDominantValue && EltSize <= 32) {
5620     if (!isConstant) {
5621       SDValue N;
5622 
5623       // If we are VDUPing a value that comes directly from a vector, that will
5624       // cause an unnecessary move to and from a GPR, where instead we could
5625       // just use VDUPLANE. We can only do this if the lane being extracted
5626       // is at a constant index, as the VDUP from lane instructions only have
5627       // constant-index forms.
5628       ConstantSDNode *constIndex;
5629       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5630           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
5631         // We need to create a new undef vector to use for the VDUPLANE if the
5632         // size of the vector from which we get the value is different than the
5633         // size of the vector that we need to create. We will insert the element
5634         // such that the register coalescer will remove unnecessary copies.
5635         if (VT != Value->getOperand(0).getValueType()) {
5636           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
5637                              VT.getVectorNumElements();
5638           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5639                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
5640                         Value, DAG.getConstant(index, dl, MVT::i32)),
5641                            DAG.getConstant(index, dl, MVT::i32));
5642         } else
5643           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5644                         Value->getOperand(0), Value->getOperand(1));
5645       } else
5646         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
5647 
5648       if (!usesOnlyOneValue) {
5649         // The dominant value was splatted as 'N', but we now have to insert
5650         // all differing elements.
5651         for (unsigned I = 0; I < NumElts; ++I) {
5652           if (Op.getOperand(I) == Value)
5653             continue;
5654           SmallVector<SDValue, 3> Ops;
5655           Ops.push_back(N);
5656           Ops.push_back(Op.getOperand(I));
5657           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
5658           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
5659         }
5660       }
5661       return N;
5662     }
5663     if (VT.getVectorElementType().isFloatingPoint()) {
5664       SmallVector<SDValue, 8> Ops;
5665       for (unsigned i = 0; i < NumElts; ++i)
5666         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5667                                   Op.getOperand(i)));
5668       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
5669       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
5670       Val = LowerBUILD_VECTOR(Val, DAG, ST);
5671       if (Val.getNode())
5672         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5673     }
5674     if (usesOnlyOneValue) {
5675       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
5676       if (isConstant && Val.getNode())
5677         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
5678     }
5679   }
5680 
5681   // If all elements are constants and the case above didn't get hit, fall back
5682   // to the default expansion, which will generate a load from the constant
5683   // pool.
5684   if (isConstant)
5685     return SDValue();
5686 
5687   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
5688   if (NumElts >= 4) {
5689     SDValue shuffle = ReconstructShuffle(Op, DAG);
5690     if (shuffle != SDValue())
5691       return shuffle;
5692   }
5693 
5694   // Vectors with 32- or 64-bit elements can be built by directly assigning
5695   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
5696   // will be legalized.
5697   if (EltSize >= 32) {
5698     // Do the expansion with floating-point types, since that is what the VFP
5699     // registers are defined to use, and since i64 is not legal.
5700     EVT EltVT = EVT::getFloatingPointVT(EltSize);
5701     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
5702     SmallVector<SDValue, 8> Ops;
5703     for (unsigned i = 0; i < NumElts; ++i)
5704       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
5705     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
5706     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5707   }
5708 
5709   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
5710   // know the default expansion would otherwise fall back on something even
5711   // worse. For a vector with one or two non-undef values, that's
5712   // scalar_to_vector for the elements followed by a shuffle (provided the
5713   // shuffle is valid for the target) and materialization element by element
5714   // on the stack followed by a load for everything else.
5715   if (!isConstant && !usesOnlyOneValue) {
5716     SDValue Vec = DAG.getUNDEF(VT);
5717     for (unsigned i = 0 ; i < NumElts; ++i) {
5718       SDValue V = Op.getOperand(i);
5719       if (V.isUndef())
5720         continue;
5721       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
5722       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
5723     }
5724     return Vec;
5725   }
5726 
5727   return SDValue();
5728 }
5729 
5730 // Gather data to see if the operation can be modelled as a
5731 // shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Bookkeeping for one distinct vector feeding this BUILD_VECTOR through
  // EXTRACT_VECTOR_ELT operands.
  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt;   // Lowest lane of Vec extracted by any operand.
    unsigned MaxElt;   // Highest lane of Vec extracted by any operand.

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase;
    int WindowScale;

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
    ShuffleSourceInfo(SDValue Vec)
        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
          WindowScale(1) {}
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  // ResMultiplier = number of shuffle lanes covered by one result element;
  // NumElts is recomputed in units of the smallest element from here on.
  unsigned ResMultiplier =
      VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      // Only handle sources exactly half the result width; anything else
      // bails out.
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    // Likewise, a wider source must be exactly twice the result width.
    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    // Rescale the window so lane indices stay expressed in ShuffleVT lanes.
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  DEBUG(
    for (auto Src : Sources)
      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
  );

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getVectorElementType().getSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    // Second-source lanes are offset by NumElts in the shuffle mask.
    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT))
    return SDValue();

  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], &Mask[0]);
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
5932 
5933 /// isShuffleMaskLegal - Targets can use this to indicate that they only
5934 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
5935 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
5936 /// are assumed to be legal.
5937 bool
5938 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
5939                                       EVT VT) const {
5940   if (VT.getVectorNumElements() == 4 &&
5941       (VT.is128BitVector() || VT.is64BitVector())) {
5942     unsigned PFIndexes[4];
5943     for (unsigned i = 0; i != 4; ++i) {
5944       if (M[i] < 0)
5945         PFIndexes[i] = 8;
5946       else
5947         PFIndexes[i] = M[i];
5948     }
5949 
5950     // Compute the index in the perfect shuffle table.
5951     unsigned PFTableIndex =
5952       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
5953     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5954     unsigned Cost = (PFEntry >> 30);
5955 
5956     if (Cost <= 4)
5957       return true;
5958   }
5959 
5960   bool ReverseVEXT, isV_UNDEF;
5961   unsigned Imm, WhichResult;
5962 
5963   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5964   return (EltSize >= 32 ||
5965           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
5966           isVREVMask(M, VT, 64) ||
5967           isVREVMask(M, VT, 32) ||
5968           isVREVMask(M, VT, 16) ||
5969           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
5970           isVTBLMask(M, VT) ||
5971           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
5972           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
5973 }
5974 
5975 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
5976 /// the specified operations to build the shuffle.
5977 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
5978                                       SDValue RHS, SelectionDAG &DAG,
5979                                       SDLoc dl) {
5980   unsigned OpNum = (PFEntry >> 26) & 0x0F;
5981   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
5982   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
5983 
5984   enum {
5985     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
5986     OP_VREV,
5987     OP_VDUP0,
5988     OP_VDUP1,
5989     OP_VDUP2,
5990     OP_VDUP3,
5991     OP_VEXT1,
5992     OP_VEXT2,
5993     OP_VEXT3,
5994     OP_VUZPL, // VUZP, left result
5995     OP_VUZPR, // VUZP, right result
5996     OP_VZIPL, // VZIP, left result
5997     OP_VZIPR, // VZIP, right result
5998     OP_VTRNL, // VTRN, left result
5999     OP_VTRNR  // VTRN, right result
6000   };
6001 
6002   if (OpNum == OP_COPY) {
6003     if (LHSID == (1*9+2)*9+3) return LHS;
6004     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
6005     return RHS;
6006   }
6007 
6008   SDValue OpLHS, OpRHS;
6009   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
6010   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
6011   EVT VT = OpLHS.getValueType();
6012 
6013   switch (OpNum) {
6014   default: llvm_unreachable("Unknown shuffle opcode!");
6015   case OP_VREV:
6016     // VREV divides the vector in half and swaps within the half.
6017     if (VT.getVectorElementType() == MVT::i32 ||
6018         VT.getVectorElementType() == MVT::f32)
6019       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
6020     // vrev <4 x i16> -> VREV32
6021     if (VT.getVectorElementType() == MVT::i16)
6022       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
6023     // vrev <4 x i8> -> VREV16
6024     assert(VT.getVectorElementType() == MVT::i8);
6025     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
6026   case OP_VDUP0:
6027   case OP_VDUP1:
6028   case OP_VDUP2:
6029   case OP_VDUP3:
6030     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6031                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
6032   case OP_VEXT1:
6033   case OP_VEXT2:
6034   case OP_VEXT3:
6035     return DAG.getNode(ARMISD::VEXT, dl, VT,
6036                        OpLHS, OpRHS,
6037                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
6038   case OP_VUZPL:
6039   case OP_VUZPR:
6040     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
6041                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
6042   case OP_VZIPL:
6043   case OP_VZIPR:
6044     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
6045                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
6046   case OP_VTRNL:
6047   case OP_VTRNR:
6048     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
6049                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
6050   }
6051 }
6052 
6053 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
6054                                        ArrayRef<int> ShuffleMask,
6055                                        SelectionDAG &DAG) {
6056   // Check to see if we can use the VTBL instruction.
6057   SDValue V1 = Op.getOperand(0);
6058   SDValue V2 = Op.getOperand(1);
6059   SDLoc DL(Op);
6060 
6061   SmallVector<SDValue, 8> VTBLMask;
6062   for (ArrayRef<int>::iterator
6063          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
6064     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
6065 
6066   if (V2.getNode()->isUndef())
6067     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
6068                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6069 
6070   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
6071                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6072 }
6073 
6074 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
6075                                                       SelectionDAG &DAG) {
6076   SDLoc DL(Op);
6077   SDValue OpLHS = Op.getOperand(0);
6078   EVT VT = OpLHS.getValueType();
6079 
6080   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
6081          "Expect an v8i16/v16i8 type");
6082   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
6083   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
6084   // extract the first 8 bytes into the top double word and the last 8 bytes
6085   // into the bottom double word. The v8i16 case is similar.
6086   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
6087   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
6088                      DAG.getConstant(ExtractNum, DL, MVT::i32));
6089 }
6090 
/// Lower a VECTOR_SHUFFLE to target-specific NEON nodes when the mask matches
/// a pattern the lowering recognizes (VDUP/VDUPLANE, VEXT, VREV, the
/// two-result VZIP/VUZP/VTRN ops, reverse masks, or VTBL for v8i8), falling
/// back to the perfect-shuffle table for 4-element vectors and to
/// ARMISD::BUILD_VECTOR expansion for 32/64-bit elements.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection.  This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  if (EltSize <= 32) {
    // Splat masks: prefer a plain VDUP from a scalar when the source is (or
    // behaves like) a SCALAR_TO_VECTOR; otherwise use VDUPLANE.
    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    // Two-operand VEXT: a sliding window across V1:V2 (or V2:V1 when the
    // mask is recognized in reversed form).
    bool ReverseVEXT;
    unsigned Imm;
    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // Single-operand VEXT: rotate V1 against itself.
    if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult;
    bool isV_UNDEF;
    if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
            ShuffleMask, VT, WhichResult, isV_UNDEF)) {
      if (isV_UNDEF)
        V2 = V1;
      return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
          .getValue(WhichResult);
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  // No pattern matched: leave the shuffle for the default expansion.
  return SDValue();
}
6263 
6264 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6265   // INSERT_VECTOR_ELT is legal only for immediate indexes.
6266   SDValue Lane = Op.getOperand(2);
6267   if (!isa<ConstantSDNode>(Lane))
6268     return SDValue();
6269 
6270   return Op;
6271 }
6272 
6273 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6274   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
6275   SDValue Lane = Op.getOperand(1);
6276   if (!isa<ConstantSDNode>(Lane))
6277     return SDValue();
6278 
6279   SDValue Vec = Op.getOperand(0);
6280   if (Op.getValueType() == MVT::i32 &&
6281       Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
6282     SDLoc dl(Op);
6283     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
6284   }
6285 
6286   return Op;
6287 }
6288 
6289 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6290   // The only time a CONCAT_VECTORS operation can have legal types is when
6291   // two 64-bit vectors are concatenated to a 128-bit vector.
6292   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
6293          "unexpected CONCAT_VECTORS");
6294   SDLoc dl(Op);
6295   SDValue Val = DAG.getUNDEF(MVT::v2f64);
6296   SDValue Op0 = Op.getOperand(0);
6297   SDValue Op1 = Op.getOperand(1);
6298   if (!Op0.isUndef())
6299     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6300                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
6301                       DAG.getIntPtrConstant(0, dl));
6302   if (!Op1.isUndef())
6303     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6304                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
6305                       DAG.getIntPtrConstant(1, dl));
6306   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
6307 }
6308 
6309 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
6310 /// element has been zero/sign-extended, depending on the isSigned parameter,
6311 /// from an integer type half its size.
6312 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
6313                                    bool isSigned) {
6314   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
6315   EVT VT = N->getValueType(0);
6316   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
6317     SDNode *BVN = N->getOperand(0).getNode();
6318     if (BVN->getValueType(0) != MVT::v4i32 ||
6319         BVN->getOpcode() != ISD::BUILD_VECTOR)
6320       return false;
6321     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6322     unsigned HiElt = 1 - LoElt;
6323     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
6324     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
6325     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
6326     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
6327     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
6328       return false;
6329     if (isSigned) {
6330       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
6331           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
6332         return true;
6333     } else {
6334       if (Hi0->isNullValue() && Hi1->isNullValue())
6335         return true;
6336     }
6337     return false;
6338   }
6339 
6340   if (N->getOpcode() != ISD::BUILD_VECTOR)
6341     return false;
6342 
6343   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
6344     SDNode *Elt = N->getOperand(i).getNode();
6345     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
6346       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
6347       unsigned HalfSize = EltSize / 2;
6348       if (isSigned) {
6349         if (!isIntN(HalfSize, C->getSExtValue()))
6350           return false;
6351       } else {
6352         if (!isUIntN(HalfSize, C->getZExtValue()))
6353           return false;
6354       }
6355       continue;
6356     }
6357     return false;
6358   }
6359 
6360   return true;
6361 }
6362 
6363 /// isSignExtended - Check if a node is a vector value that is sign-extended
6364 /// or a constant BUILD_VECTOR with sign-extended elements.
6365 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
6366   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
6367     return true;
6368   if (isExtendedBUILD_VECTOR(N, DAG, true))
6369     return true;
6370   return false;
6371 }
6372 
6373 /// isZeroExtended - Check if a node is a vector value that is zero-extended
6374 /// or a constant BUILD_VECTOR with zero-extended elements.
6375 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
6376   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
6377     return true;
6378   if (isExtendedBUILD_VECTOR(N, DAG, false))
6379     return true;
6380   return false;
6381 }
6382 
6383 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
6384   if (OrigVT.getSizeInBits() >= 64)
6385     return OrigVT;
6386 
6387   assert(OrigVT.isSimple() && "Expecting a simple value type");
6388 
6389   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
6390   switch (OrigSimpleTy) {
6391   default: llvm_unreachable("Unexpected Vector Type");
6392   case MVT::v2i8:
6393   case MVT::v2i16:
6394      return MVT::v2i32;
6395   case MVT::v4i8:
6396     return  MVT::v4i16;
6397   }
6398 }
6399 
6400 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
6401 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
6402 /// We insert the required extension here to get the vector to fill a D register.
6403 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
6404                                             const EVT &OrigTy,
6405                                             const EVT &ExtTy,
6406                                             unsigned ExtOpcode) {
6407   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
6408   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
6409   // 64-bits we need to insert a new extension so that it will be 64-bits.
6410   assert(ExtTy.is128BitVector() && "Unexpected extension size");
6411   if (OrigTy.getSizeInBits() >= 64)
6412     return N;
6413 
6414   // Must extend size to at least 64 bits to be used as an operand for VMULL.
6415   EVT NewVT = getExtensionTo64Bits(OrigTy);
6416 
6417   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
6418 }
6419 
6420 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
6421 /// does not do any sign/zero extension. If the original vector is less
6422 /// than 64 bits, an appropriate extension will be added after the load to
6423 /// reach a total size of 64 bits. We have to add the extension separately
6424 /// because ARM does not have a sign/zero extending load for vectors.
6425 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
6426   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
6427 
6428   // The load already has the right type.
6429   if (ExtendedTy == LD->getMemoryVT())
6430     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
6431                 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
6432                 LD->isNonTemporal(), LD->isInvariant(),
6433                 LD->getAlignment());
6434 
6435   // We need to create a zextload/sextload. We cannot just create a load
6436   // followed by a zext/zext node because LowerMUL is also run during normal
6437   // operation legalization where we can't create illegal types.
6438   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
6439                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
6440                         LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
6441                         LD->isNonTemporal(), LD->getAlignment());
6442 }
6443 
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  // Explicit extension node: drop it, re-extending only if the unextended
  // value is narrower than a D register.
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  // Extending load: replace with a non-extending load of the memory type.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    return SkipLoadExtensionForVMULL(LD, DAG);

  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    // Keep only the low 32-bit half of each 64-bit element; which i32 is
    // "low" depends on endianness.
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
6488 
6489 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
6490   unsigned Opcode = N->getOpcode();
6491   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6492     SDNode *N0 = N->getOperand(0).getNode();
6493     SDNode *N1 = N->getOperand(1).getNode();
6494     return N0->hasOneUse() && N1->hasOneUse() &&
6495       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
6496   }
6497   return false;
6498 }
6499 
6500 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
6501   unsigned Opcode = N->getOpcode();
6502   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6503     SDNode *N0 = N->getOperand(0).getNode();
6504     SDNode *N1 = N->getOperand(1).getNode();
6505     return N0->hasOneUse() && N1->hasOneUse() &&
6506       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
6507   }
6508   return false;
6509 }
6510 
/// LowerMUL - Custom-lower a 128-bit integer vector multiply by recognizing
/// operands that are (directly or via constants/loads) sign- or
/// zero-extended, so the multiply can be selected as VMULLs/VMULLu, or as a
/// VMULL pair combined through the original add/sub when one operand is an
/// extended add/sub (the MLA path).
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize: put the add/sub in N0 and the plain extend in N1.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this.  It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    // Simple case: strip the extensions from both operands and emit a
    // single VMULL.
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Re-apply N0's original add/sub on top of the two VMULL results.
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
6585 
/// LowerSDIV_v4i8 - Signed-divide two v4i16 vectors whose lanes hold
/// sign-extended 8-bit values (see LowerSDIV), using the NEON float
/// reciprocal-estimate trick. Because the inputs are i8-ranged, no
/// Newton-Raphson refinement step is needed — only a fixed bias.
static SDValue
LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps.  This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}
6616 
/// LowerSDIV_v4i16 - Signed-divide two v4i16 vectors using the NEON float
/// reciprocal-estimate trick with a single Newton-Raphson refinement step
/// (VRECPE followed by one VRECPS), then a fixed bias before converting
/// back to integer.
static SDValue
LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step.  This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
6655 
6656 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
6657   EVT VT = Op.getValueType();
6658   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
6659          "unexpected type for custom-lowering ISD::SDIV");
6660 
6661   SDLoc dl(Op);
6662   SDValue N0 = Op.getOperand(0);
6663   SDValue N1 = Op.getOperand(1);
6664   SDValue N2, N3;
6665 
6666   if (VT == MVT::v8i8) {
6667     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
6668     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
6669 
6670     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6671                      DAG.getIntPtrConstant(4, dl));
6672     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6673                      DAG.getIntPtrConstant(4, dl));
6674     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
6675                      DAG.getIntPtrConstant(0, dl));
6676     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
6677                      DAG.getIntPtrConstant(0, dl));
6678 
6679     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
6680     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
6681 
6682     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
6683     N0 = LowerCONCAT_VECTORS(N0, DAG);
6684 
6685     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
6686     return N0;
6687   }
6688   return LowerSDIV_v4i16(N0, N1, dl, DAG);
6689 }
6690 
/// LowerUDIV - Custom-lower unsigned vector division for v4i16 and v8i8
/// using the NEON float reciprocal-estimate trick. The v8i8 case reuses the
/// signed v4i16 helper: after zero-extension the 8-bit values are
/// non-negative and fit comfortably in the signed 16-bit range.
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen to v8i16, then split each operand into low/high v4i16 halves.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG);

    // Narrow the v8i16 quotients back to v8i8 with a saturating narrow.
    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
6766 
6767 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
6768   EVT VT = Op.getNode()->getValueType(0);
6769   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
6770 
6771   unsigned Opc;
6772   bool ExtraOp = false;
6773   switch (Op.getOpcode()) {
6774   default: llvm_unreachable("Invalid code");
6775   case ISD::ADDC: Opc = ARMISD::ADDC; break;
6776   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
6777   case ISD::SUBC: Opc = ARMISD::SUBC; break;
6778   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
6779   }
6780 
6781   if (!ExtraOp)
6782     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
6783                        Op.getOperand(1));
6784   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
6785                      Op.getOperand(1), Op.getOperand(2));
6786 }
6787 
/// LowerFSINCOS - Lower an FSINCOS node into a call to the Darwin-specific
/// __sincos_stret / __sincosf_stret entry point. Under the APCS ABI the
/// {sin, cos} pair is returned through an sret stack slot that we then load
/// from; otherwise the call's direct result is used.
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
    int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    // The sret pointer is the first (hidden) argument.
    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.isSExt = false;
    Entry.isZExt = false;
    Entry.isSRet = true;
    Args.push_back(Entry);
    // The call itself then returns void; results come back via SRet.
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  // The actual float/double operand.
  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.isSExt = false;
  Entry.isZExt = false;
  Args.push_back(Entry);

  const char *LibcallName =
      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args), 0)
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  if (!ShouldUseSRet)
    return CallResult.first;

  // Load the two results back out of the sret slot: sin at offset 0 ...
  SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
                                MachinePointerInfo(), false, false, false, 0);

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
                                MachinePointerInfo(), false, false, false, 0);

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}
6863 
/// LowerWindowsDIVLibCall - Emit a call to one of the Windows runtime
/// division helpers (__rt_sdiv, __rt_udiv, __rt_sdiv64, __rt_udiv64) for a
/// 32- or 64-bit integer division, chained after the caller-provided Chain
/// (typically a divide-by-zero check).
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  // Pick the helper by signedness and width.
  const char *Name = nullptr;
  if (Signed)
    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  else
    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  ARMTargetLowering::ArgListTy Args;

  // Push operand 1 (divisor) before operand 0 (dividend): the Windows ARM
  // runtime helpers take the divisor as their first argument.
  for (auto AI : {1, 0}) {
    ArgListEntry Arg;
    Arg.Node = Op.getOperand(AI);
    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
    Args.push_back(Arg);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
    .setChain(Chain)
    .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
               ES, std::move(Args), 0);

  return LowerCallTo(CLI).first;
}
6900 
6901 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
6902                                             bool Signed) const {
6903   assert(Op.getValueType() == MVT::i32 &&
6904          "unexpected type for custom lowering DIV");
6905   SDLoc dl(Op);
6906 
6907   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
6908                                DAG.getEntryNode(), Op.getOperand(1));
6909 
6910   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
6911 }
6912 
/// ExpandDIV_Windows - Expand a 64-bit integer division on Windows into a
/// divide-by-zero check followed by a runtime helper call, splitting the
/// i64 result into two i32 values for the type legalizer.
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  // The divisor is zero iff (lo | hi) == 0, so feed the OR of its halves to
  // the divide-by-zero check.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
                           DAG.getConstant(0, dl, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1),
                           DAG.getConstant(1, dl, MVT::i32));
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi);

  SDValue DBZCHK =
      DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or);

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  // Split the i64 quotient into low/high i32 halves for the legalizer.
  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(Lower);
  Results.push_back(Upper);
}
6942 
6943 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
6944   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
6945     // Acquire/Release load/store is not legal for targets without a dmb or
6946     // equivalent available.
6947     return SDValue();
6948 
6949   // Monotonic load/store is legal for all targets.
6950   return Op;
6951 }
6952 
/// ReplaceREADCYCLECOUNTER - Lower READCYCLECOUNTER to an MRC read of the
/// 32-bit PMCCNTR cycle counter, widening the result to the i64 the generic
/// node produces (high half is zero).
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //    mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getConstant(15, DL, MVT::i32),  // coprocessor p15
                    DAG.getConstant(0, DL, MVT::i32),   // opc1
                    DAG.getConstant(9, DL, MVT::i32),   // CRn = c9
                    DAG.getConstant(13, DL, MVT::i32),  // CRm = c13
                    DAG.getConstant(0, DL, MVT::i32)    // opc2
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  // Pair the 32-bit count with a zero high half to form the i64 result.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}
6975 
6976 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
6977   SDLoc dl(V.getNode());
6978   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
6979   SDValue VHi = DAG.getAnyExtOrTrunc(
6980       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
6981       dl, MVT::i32);
6982   SDValue RegClass =
6983       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
6984   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
6985   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
6986   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
6987   return SDValue(
6988       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
6989 }
6990 
/// ReplaceCMP_SWAP_64Results - Lower a 64-bit ATOMIC_CMP_SWAP to the
/// CMP_SWAP_64 pseudo, packing the expected and new values into GPRPair
/// registers and unpacking the result pair into two i32 values plus the
/// output chain.
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                       SmallVectorImpl<SDValue> & Results,
                                       SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  // Operands: pointer, expected value (pair), new value (pair), chain.
  SDValue Ops[] = {N->getOperand(1),
                   createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  // Attach the original memory operand so the machine instruction keeps the
  // atomic memory reference.
  MachineFunction &MF = DAG.getMachineFunction();
  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);

  // Unpack the GPRPair result into the two i32 halves, then the chain.
  Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
                                               SDValue(CmpSwap, 0)));
  Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
                                               SDValue(CmpSwap, 0)));
  Results.push_back(SDValue(CmpSwap, 2));
}
7015 
/// Central dispatch for custom lowering: route each operation the ARM backend
/// marked as Custom to its dedicated lowering routine. Every case must be kept
/// in sync with the setOperationAction(..., Custom) calls made elsewhere in
/// this file; an opcode that reaches the default case is a backend bug.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress:
    // Global addresses are materialized differently per object format
    // (COFF/ELF/Mach-O relocations and indirection differ).
    switch (Subtarget->getTargetTriple().getObjectFormat()) {
    default: llvm_unreachable("unknown object format");
    case Triple::COFF:
      return LowerGlobalAddressWindows(Op, DAG);
    case Triple::ELF:
      return LowerGlobalAddressELF(Op, DAG);
    case Triple::MachO:
      return LowerGlobalAddressDarwin(Op, DAG);
    }
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT:        return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
  case ISD::VASTART:       return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
  case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
  case ISD::SETCCE:        return LowerSETCCE(Op, DAG);
  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL:           return LowerMUL(Op, DAG);
  case ISD::SDIV:
    // Windows routes integer division through a runtime helper with a
    // divide-by-zero check.
    if (Subtarget->isTargetWindows())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
    return LowerXALUO(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    // Only custom-lowered for Windows-Itanium (stack probing); any other
    // target reaching here is a configuration error.
    if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  // NOTE(review): WIN__DBZCHK deliberately returns no replacement here —
  // presumably it is handled later (custom inserter); confirm before changing.
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}
7105 
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
/// Two styles of case coexist below: cases that push their replacement values
/// directly into \p Results and return, and cases that set \p Res and fall
/// through to the common push at the bottom.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    // Pushes its expanded values into Results itself; Res stays null so the
    // trailing push below is a no-op.
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG);
    break;
  case ISD::SRL:
  case ISD::SRA:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    // The div/rem expansion yields two values (quotient and remainder);
    // push both explicitly.
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    // i64 division is only custom-expanded for the Windows runtime helpers.
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  }
  // Common exit for the fall-through cases above.
  if (Res.getNode())
    Results.push_back(Res);
}
7151 
7152 //===----------------------------------------------------------------------===//
7153 //                           ARM Scheduler Hooks
7154 //===----------------------------------------------------------------------===//
7155 
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
///
/// Concretely: materialize the address of \p DispatchBB (via a constant-pool
/// entry and a PC-relative add) and store it into the jump-buffer slot of the
/// function context at frame index \p FI, so a longjmp-style unwind lands in
/// the dispatch block. Three emission sequences exist for Thumb2, Thumb1 and
/// ARM mode respectively.
void ARMTargetLowering::
SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                       MachineBasicBlock *DispatchBB, int FI) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function *F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  // Create a constant-pool entry holding the dispatch block's address,
  // PC-relative. PCAdj accounts for the pipeline offset of the PC read
  // (4 in Thumb, 8 in ARM mode).
  unsigned PCLabelId = AFI->createPICLabelUId();
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, 4);

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, 4);

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addMemOperand(CPMMO));
    // Set the low bit because of thumb mode.
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    AddDefaultCC(
      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
                     .addReg(NewVReg1, RegState::Kill)
                     .addImm(0x01)));
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
      .addReg(NewVReg2, RegState::Kill)
      .addImm(PCLabelId);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
                   .addReg(NewVReg3, RegState::Kill)
                   .addFrameIndex(FI)
                   .addImm(36)  // &jbuf[1] :: pc
                   .addMemOperand(FIMMOSt));
  } else if (isThumb) {
    // Thumb1: no register-offset store and no ORR-immediate, so the low bit
    // is set via a separate mov/orrs pair and the slot address is computed
    // into a register first.
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addMemOperand(CPMMO));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
      .addReg(NewVReg1, RegState::Kill)
      .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addImm(1));
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addReg(NewVReg2, RegState::Kill)
                   .addReg(NewVReg3, RegState::Kill));
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
            .addFrameIndex(FI)
            .addImm(36); // &jbuf[1] :: pc
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
                   .addReg(NewVReg4, RegState::Kill)
                   .addReg(NewVReg5, RegState::Kill)
                   .addImm(0)
                   .addMemOperand(FIMMOSt));
  } else {
    // ARM mode: no Thumb bit to set.
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
                   .addConstantPoolIndex(CPI)
                   .addImm(0)
                   .addMemOperand(CPMMO));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
                   .addReg(NewVReg1, RegState::Kill)
                   .addImm(PCLabelId));
    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
                   .addReg(NewVReg2, RegState::Kill)
                   .addFrameIndex(FI)
                   .addImm(36)  // &jbuf[1] :: pc
                   .addMemOperand(FIMMOSt));
  }
}
7272 
/// Build the SjLj exception dispatch machinery for this function: collect all
/// landing pads keyed by call-site number, create an inline jump table over
/// them, and emit a dispatch block that loads the call-site index from the
/// function context and indirect-branches to the matching landing pad (or
/// traps on an out-of-range index). All invoke blocks are rewired to the new
/// dispatch block, which becomes the function's single landing pad.
void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo *MFI = MF->getFrameInfo();
  // Frame slot holding the SjLj function context.
  int FI = MFI->getFunctionContextIndex();

  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                        : &ARM::GPRnopcRegClass;

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  MachineModuleInfo &MMI = MF->getMMI();
  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
       ++BB) {
    if (!BB->isEHPad()) continue;

    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
    // pad.
    for (MachineBasicBlock::iterator
           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
      if (!II->isEHLabel()) continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MMI.hasCallSiteLandingPad(Sym)) continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator
             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
      break;
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  // Call-site numbers start at 1, so index 0 of the jump table corresponds
  // to call site 1.
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      // Remember every predecessor of a landing pad — those are the invoke
      // blocks that must be rewired to the dispatch block below.
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  // The trap block is reached when the loaded call-site index is out of the
  // jump table's range.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert and MBBs.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers.  This results in all
  // registers being marked as clobbered.
  MIB.addRegMask(RI.getNoPreservedMask());

  // Emit the range check + jump-table dispatch. Three variants: Thumb2,
  // Thumb1, and ARM mode. Each loads the call-site index from the function
  // context, compares it against the landing-pad count (branching to the
  // trap block on unsigned-higher), and indirect-branches via the table.
  unsigned NumLPads = LPadList.size();
  if (Subtarget->isThumb2()) {
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
                   .addFrameIndex(FI)
                   .addImm(4)
                   .addMemOperand(FIMMOLd));

    if (NumLPads < 256) {
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
                     .addReg(NewVReg1)
                     .addImm(LPadList.size()));
    } else {
      // Count doesn't fit a compare-immediate; materialize it with
      // movw/movt and do a register compare.
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
                     .addImm(NumLPads & 0xFFFF));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
                       .addReg(VReg1)
                       .addImm(NumLPads >> 16));
      }

      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
                     .addReg(NewVReg1)
                     .addReg(VReg2));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
                   .addJumpTableIndex(MJTI));

    // Table entries are 4 bytes: index << 2 added to the table base.
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    AddDefaultCC(
      AddDefaultPred(
        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));

    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
      .addReg(NewVReg4, RegState::Kill)
      .addReg(NewVReg1)
      .addJumpTableIndex(MJTI);
  } else if (Subtarget->isThumb()) {
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
                   .addFrameIndex(FI)
                   .addImm(1)   // word offset 1 == byte offset 4
                   .addMemOperand(FIMMOLd));

    if (NumLPads < 256) {
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
                     .addReg(NewVReg1)
                     .addImm(NumLPads));
    } else {
      // Thumb1 can't materialize large immediates inline; load the count
      // from a constant-pool entry instead.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
                     .addReg(VReg1, RegState::Define)
                     .addConstantPoolIndex(Idx));
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
                     .addReg(NewVReg1)
                     .addReg(VReg1));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addReg(NewVReg1)
                   .addImm(2));

    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
                   .addJumpTableIndex(MJTI));

    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
                   .addReg(ARM::CPSR, RegState::Define)
                   .addReg(NewVReg2, RegState::Kill)
                   .addReg(NewVReg3));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);

    // Thumb1 has no load-into-PC jump-table branch, so load the entry and
    // branch through a register.
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
                   .addReg(NewVReg4, RegState::Kill)
                   .addImm(0)
                   .addMemOperand(JTMMOLd));

    unsigned NewVReg6 = NewVReg5;
    if (RelocM == Reloc::PIC_) {
      // PIC jump-table entries are offsets from the table base; add it back.
      NewVReg6 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
                     .addReg(ARM::CPSR, RegState::Define)
                     .addReg(NewVReg5, RegState::Kill)
                     .addReg(NewVReg3));
    }

    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
      .addReg(NewVReg6, RegState::Kill)
      .addJumpTableIndex(MJTI);
  } else {
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
                   .addFrameIndex(FI)
                   .addImm(4)
                   .addMemOperand(FIMMOLd));

    if (NumLPads < 256) {
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
                     .addReg(NewVReg1)
                     .addImm(NumLPads));
    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
      // movw (and movt if needed) to build the count, then compare.
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
                     .addImm(NumLPads & 0xFFFF));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
                       .addReg(VReg1)
                       .addImm(NumLPads >> 16));
      }

      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
                     .addReg(NewVReg1)
                     .addReg(VReg2));
    } else {
      // No movw available: load the count from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
                     .addReg(VReg1, RegState::Define)
                     .addConstantPoolIndex(Idx)
                     .addImm(0));
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
                     .addReg(NewVReg1)
                     .addReg(VReg1, RegState::Kill));
    }

    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    // Scale the index by 4 (table entries are words), add the table base,
    // and load the destination address.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    AddDefaultCC(
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
                     .addReg(NewVReg1)
                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
                   .addJumpTableIndex(MJTI));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    AddDefaultPred(
      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
      .addReg(NewVReg3, RegState::Kill)
      .addReg(NewVReg4)
      .addImm(0)
      .addMemOperand(JTMMOLd));

    if (RelocM == Reloc::PIC_) {
      // PIC entries are base-relative; add the table base before branching.
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
        .addReg(NewVReg5, RegState::Kill)
        .addReg(NewVReg4)
        .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
        .addReg(NewVReg5, RegState::Kill)
        .addJumpTableIndex(MJTI);
    }
  }

  // Add the jump table entries as successors to the MBB.
  // SeenMBBs de-duplicates landing pads that appear for several call sites.
  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  for (std::vector<MachineBasicBlock*>::iterator
         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock*, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {

    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
                                                  BB->succ_end());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      // Record registers the call already mentions so we don't add
      // duplicate operands below.
      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        // Only add registers the current instruction set can actually
        // address as GPRs.
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI->eraseFromParent();
}
7660 
7661 static
7662 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
7663   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
7664        E = MBB->succ_end(); I != E; ++I)
7665     if (*I != Succ)
7666       return *I;
7667   llvm_unreachable("Expecting a BB with two successors!");
7668 }
7669 
7670 /// Return the load opcode for a given load size. If load size >= 8,
7671 /// neon opcode will be returned.
7672 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
7673   if (LdSize >= 8)
7674     return LdSize == 16 ? ARM::VLD1q32wb_fixed
7675                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
7676   if (IsThumb1)
7677     return LdSize == 4 ? ARM::tLDRi
7678                        : LdSize == 2 ? ARM::tLDRHi
7679                                      : LdSize == 1 ? ARM::tLDRBi : 0;
7680   if (IsThumb2)
7681     return LdSize == 4 ? ARM::t2LDR_POST
7682                        : LdSize == 2 ? ARM::t2LDRH_POST
7683                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
7684   return LdSize == 4 ? ARM::LDR_POST_IMM
7685                      : LdSize == 2 ? ARM::LDRH_POST
7686                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
7687 }
7688 
7689 /// Return the store opcode for a given store size. If store size >= 8,
7690 /// neon opcode will be returned.
7691 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
7692   if (StSize >= 8)
7693     return StSize == 16 ? ARM::VST1q32wb_fixed
7694                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
7695   if (IsThumb1)
7696     return StSize == 4 ? ARM::tSTRi
7697                        : StSize == 2 ? ARM::tSTRHi
7698                                      : StSize == 1 ? ARM::tSTRBi : 0;
7699   if (IsThumb2)
7700     return StSize == 4 ? ARM::t2STR_POST
7701                        : StSize == 2 ? ARM::t2STRH_POST
7702                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
7703   return StSize == 4 ? ARM::STR_POST_IMM
7704                      : StSize == 2 ? ARM::STRH_POST
7705                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
7706 }
7707 
/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
///
/// \param LdSize   unit size in bytes (1, 2, 4, 8 or 16).
/// \param Data     virtual register that receives the loaded value.
/// \param AddrIn   virtual register holding the address to load from.
/// \param AddrOut  virtual register defined with AddrIn + LdSize.
static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
                       const TargetInstrInfo *TII, DebugLoc dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    // NEON VLD1 with fixed writeback: the updated address is an extra def
    // operand; the writeback amount is implied by the opcode (imm is 0).
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
                       .addImm(0));
  } else if (IsThumb1) {
    // load + update AddrIn
    // Thumb1 has no post-indexed load, so emit a plain load followed by a
    // separate tADDi8 that produces AddrOut = AddrIn + LdSize.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
                       .addReg(AddrIn).addImm(0));
    MachineInstrBuilder MIB =
        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
    // tADDi8 requires the mandatory Thumb1 S-bit (cc_out) operand.
    MIB = AddDefaultT1CC(MIB);
    MIB.addReg(AddrIn).addImm(LdSize);
    AddDefaultPred(MIB);
  } else if (IsThumb2) {
    // Thumb2 post-indexed load: updated address def, base, then offset imm.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
                       .addImm(LdSize));
  } else { // arm
    // ARM post-indexed load takes an extra (here unused) offset register
    // operand before the immediate.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
                       .addReg(0).addImm(LdSize));
  }
}
7739 
/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
///
/// \param StSize   unit size in bytes (1, 2, 4, 8 or 16).
/// \param Data     virtual register holding the value to store.
/// \param AddrIn   virtual register holding the address to store to.
/// \param AddrOut  virtual register defined with AddrIn + StSize.
static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
                       const TargetInstrInfo *TII, DebugLoc dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    // NEON VST1 with fixed writeback: AddrOut is the instruction's primary
    // def; the writeback amount is implied by the opcode (imm is 0).
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
                       .addReg(AddrIn).addImm(0).addReg(Data));
  } else if (IsThumb1) {
    // store + update AddrIn
    // Thumb1 has no post-indexed store, so emit a plain store followed by a
    // separate tADDi8 that produces AddrOut = AddrIn + StSize.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
                       .addReg(AddrIn).addImm(0));
    MachineInstrBuilder MIB =
        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
    // tADDi8 requires the mandatory Thumb1 S-bit (cc_out) operand.
    MIB = AddDefaultT1CC(MIB);
    MIB.addReg(AddrIn).addImm(StSize);
    AddDefaultPred(MIB);
  } else if (IsThumb2) {
    // Thumb2 post-indexed store: value, base, then offset immediate.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
                       .addReg(Data).addReg(AddrIn).addImm(StSize));
  } else { // arm
    // ARM post-indexed store takes an extra (here unused) offset register
    // operand before the immediate.
    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
                       .addReg(Data).addReg(AddrIn).addReg(0)
                       .addImm(StSize));
  }
}
7769 
/// Expand a COPY_STRUCT_BYVAL_I32 pseudo into either a fully unrolled
/// sequence of load/store pairs (small copies) or a copy loop plus a
/// byte-at-a-time epilogue for the remainder (large copies).
MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr *MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  unsigned dest = MI->getOperand(0).getReg();
  unsigned src = MI->getOperand(1).getReg();
  unsigned SizeVal = MI->getOperand(2).getImm();
  unsigned Align = MI->getOperand(3).getImm();
  DebugLoc dl = MI->getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();

  // Choose the largest copy unit the source/dest alignment permits.
  if (Align & 1) {
    UnitSize = 1;
  } else if (Align & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Align % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Align % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  // BytesLeft is the tail that doesn't fill a whole unit; it is copied
  // byte-by-byte after the unit-sized copies.
  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    // Fully unrolled: each iteration creates fresh vregs so the copies stay
    // in SSA form, threading the updated addresses through srcIn/destIn.
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI->eraseFromParent();   // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  //   ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd.
  unsigned varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt(*MF)) {
    // MOVW (+ MOVT if the count needs the high half). When MOVT is needed,
    // MOVW writes a temporary vreg so that varEnd has a single definition.
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    AddDefaultPred(BuildMI(BB, dl,
                           TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16),
                           Vtmp).addImm(LoopSize & 0xFFFF));

    if ((LoopSize & 0xFFFF0000) != 0)
      AddDefaultPred(BuildMI(BB, dl,
                             TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16),
                             varEnd)
                         .addReg(Vtmp)
                         .addImm(LoopSize >> 16));
  } else {
    // No MOVW/MOVT available: materialize the loop count from the constant
    // pool instead.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
    if (Align == 0)
      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

    if (IsThumb1)
      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
          varEnd, RegState::Define).addConstantPoolIndex(Idx));
    else
      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  unsigned varLoop = MRI.createVirtualRegister(TRC);
  unsigned varPhi = MRI.createVirtualRegister(TRC);
  unsigned srcLoop = MRI.createVirtualRegister(TRC);
  unsigned srcPhi = MRI.createVirtualRegister(TRC);
  unsigned destLoop = MRI.createVirtualRegister(TRC);
  unsigned destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
    .addReg(varLoop).addMBB(loopMBB)
    .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
    .addReg(srcLoop).addMBB(loopMBB)
    .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
    .addReg(destLoop).addMBB(loopMBB)
    .addReg(dest).addMBB(entryBB);

  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    // tSUBi8 carries the mandatory Thumb1 S-bit, so it already sets CPSR
    // for the conditional branch below.
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
    MIB = AddDefaultT1CC(MIB);
    MIB.addReg(varPhi).addImm(UnitSize);
    AddDefaultPred(MIB);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
    // Turn the optional cc_out operand (index 5) into a live CPSR def so
    // this becomes SUBS and feeds the BNE below.
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  MachineInstr *StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    unsigned srcOut = MRI.createVirtualRegister(TRC);
    unsigned destOut = MRI.createVirtualRegister(TRC);
    unsigned scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI->eraseFromParent();   // The instruction is gone now.
  return BB;
}
8003 
/// Lower the WIN__CHKSTK pseudo: call the Windows __chkstk stack probe and
/// then subtract the probed byte count (returned in R4) from SP.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4.  This will not
  // clober any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it.  Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required.  As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  switch (TM.getCodeModel()) {
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Default:
  case CodeModel::Kernel:
    // Direct BL to __chkstk. R4 is both killed (input word count) and
    // redefined (output byte adjustment); R12/IP is marked as a dead def
    // per the clobber discussion above.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
      .addImm((unsigned)ARMCC::AL).addReg(0)
      .addExternalSymbol("__chkstk")
      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large:
  case CodeModel::JITDefault: {
    // Large code model: materialize the address of __chkstk into a register
    // and call through BLX so the call has unlimited range.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
      .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
      .addImm((unsigned)ARMCC::AL).addReg(0)
      .addReg(Reg, RegState::Kill)
      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  // SP -= R4: apply the probed adjustment returned by __chkstk.
  AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
                                      ARM::SP)
                         .addReg(ARM::SP, RegState::Kill)
                         .addReg(ARM::R4, RegState::Kill)
                         .setMIFlags(MachineInstr::FrameSetup)));

  MI->eraseFromParent();
  return MBB;
}
8071 
/// Lower the WIN__DBZCHK pseudo: branch to a trapping block (t2UDF #249)
/// when the divisor register is zero, otherwise continue in ContBB.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  // Split MBB at the pseudo: everything after MI (and all successor edges)
  // moves into the new continuation block.
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Out-of-line trap block; 249 is the immediate the Windows environment
  // uses here for the divide-by-zero UDF encoding.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249);
  MBB->addSuccessor(TrapBB);

  // CBZ divisor, TrapBB; fall through (via t2B) to the continuation.
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
      .addReg(MI->getOperand(0).getReg())
      .addMBB(TrapBB);
  AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB));
  MBB->addSuccessor(ContBB);

  MI->eraseFromParent();
  return ContBB;
}
8099 
/// Expand pseudo instructions that were marked usesCustomInserter during
/// isel. Returns the block containing the instruction following the
/// expanded pseudo (which may be a newly created block).
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();
  switch (MI->getOpcode()) {
  default: {
    MI->dump();
    llvm_unreachable("Unexpected instr type to insert");
  }
  // The Thumb2 pre-indexed stores have the same MI operands, they just
  // define them differently in the .td files from the isel patterns, so
  // they need pseudos.
  case ARM::t2STR_preidx:
    MI->setDesc(TII->get(ARM::t2STR_PRE));
    return BB;
  case ARM::t2STRB_preidx:
    MI->setDesc(TII->get(ARM::t2STRB_PRE));
    return BB;
  case ARM::t2STRH_preidx:
    MI->setDesc(TII->get(ARM::t2STRH_PRE));
    return BB;

  case ARM::STRi_preidx:
  case ARM::STRBi_preidx: {
    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
    // Decode the offset.
    // The pseudo carries an addrmode2-encoded offset; unpack it into a
    // plain signed immediate for the _PRE_IMM form.
    unsigned Offset = MI->getOperand(4).getImm();
    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
    Offset = ARM_AM::getAM2Offset(Offset);
    if (isSub)
      Offset = -Offset;

    MachineMemOperand *MMO = *MI->memoperands_begin();
    BuildMI(*BB, MI, dl, TII->get(NewOpc))
      .addOperand(MI->getOperand(0))  // Rn_wb
      .addOperand(MI->getOperand(1))  // Rt
      .addOperand(MI->getOperand(2))  // Rn
      .addImm(Offset)                 // offset (skip GPR==zero_reg)
      .addOperand(MI->getOperand(5))  // pred
      .addOperand(MI->getOperand(6))
      .addMemOperand(MMO);
    MI->eraseFromParent();
    return BB;
  }
  case ARM::STRr_preidx:
  case ARM::STRBr_preidx:
  case ARM::STRH_preidx: {
    unsigned NewOpc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("unexpected opcode!");
    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
    }
    // Register-offset forms have identical operand lists; just re-emit all
    // operands under the real opcode.
    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
      MIB.addOperand(MI->getOperand(i));
    MI->eraseFromParent();
    return BB;
  }

  case ARM::tMOVCCr_pseudo: {
    // To "insert" a SELECT_CC instruction, we actually have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // destination vreg to set, the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = ++BB->getIterator();

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB  = BB;
    MachineFunction *F = BB->getParent();
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    // Conditional branch straight to the sink when the condition selects
    // the "true" value (operand 2); otherwise fall through copy0MBB.
    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl,
            TII->get(ARM::PHI), MI->getOperand(0).getReg())
      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }

  case ARM::BCCi64:
  case ARM::BCCZi64: {
    // If there is an unconditional branch to the other successor, remove it.
    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());

    // Compare both parts that make up the double comparison separately for
    // equality.
    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;

    unsigned LHS1 = MI->getOperand(1).getReg();
    unsigned LHS2 = MI->getOperand(2).getReg();
    if (RHSisZero) {
      // First CMP sets CPSR; second CMP is predicated EQ so CPSR only stays
      // "equal" if both halves compare equal.
      AddDefaultPred(BuildMI(BB, dl,
                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                     .addReg(LHS1).addImm(0));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(LHS2).addImm(0)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    } else {
      unsigned RHS1 = MI->getOperand(3).getReg();
      unsigned RHS2 = MI->getOperand(4).getReg();
      AddDefaultPred(BuildMI(BB, dl,
                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
                     .addReg(LHS1).addReg(RHS1));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
        .addReg(LHS2).addReg(RHS2)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    }

    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
    // The emitted branch is always EQ; for an NE pseudo, swap the targets.
    if (MI->getOperand(0).getImm() == ARMCC::NE)
      std::swap(destMBB, exitMBB);

    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
    if (isThumb2)
      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
    else
      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);

    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }

  case ARM::Int_eh_sjlj_setjmp:
  case ARM::Int_eh_sjlj_setjmp_nofp:
  case ARM::tInt_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp_nofp:
    // Nothing to expand here; kept as pseudos until later.
    return BB;

  case ARM::Int_eh_sjlj_setup_dispatch:
    EmitSjLjDispatchBlock(MI, BB);
    return BB;

  case ARM::ABS:
  case ARM::t2ABS: {
    // To insert an ABS instruction, we have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // source vreg to test against 0, the destination vreg to set,
    // the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    // It transforms
    //     V1 = ABS V0
    // into
    //     V2 = MOVS V0
    //     BCC                      (branch to SinkBB if V0 >= 0)
    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
    //     SinkBB: V1 = PHI(V2, V3)
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator BBI = ++BB->getIterator();
    MachineFunction *Fn = BB->getParent();
    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
    Fn->insert(BBI, RSBBB);
    Fn->insert(BBI, SinkBB);

    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
    unsigned int ABSDstReg = MI->getOperand(0).getReg();
    bool ABSSrcKIll = MI->getOperand(1).isKill();
    bool isThumb2 = Subtarget->isThumb2();
    MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if source register is the SP or
    // PC and if destination register is the SP, so restrict register class
    unsigned NewRsbDstReg =
      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    SinkBB->splice(SinkBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // fall through to SinkMBB
    RSBBB->addSuccessor(SinkBB);

    // insert a cmp at the end of BB
    AddDefaultPred(BuildMI(BB, dl,
                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                   .addReg(ABSSrcReg).addImm(0));

    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
    BuildMI(BB, dl,
      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // insert rsbri in RSBBB
    // Note: BCC and rsbri will be converted into predicated rsbmi
    // by if-conversion pass
    BuildMI(*RSBBB, RSBBB->begin(), dl,
      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
      .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);

    // insert PHI in SinkBB,
    // reuse ABSDstReg to not change uses of ABS instruction
    BuildMI(*SinkBB, SinkBB->begin(), dl,
      TII->get(ARM::PHI), ABSDstReg)
      .addReg(NewRsbDstReg).addMBB(RSBBB)
      .addReg(ABSSrcReg).addMBB(BB);

    // remove ABS instruction
    MI->eraseFromParent();

    // return last added BB
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    ++NumLoopByVals;
    return EmitStructByval(MI, BB);
  case ARM::WIN__CHKSTK:
    return EmitLowered__chkstk(MI, BB);
  case ARM::WIN__DBZCHK:
    return EmitLowered__dbzchk(MI, BB);
  }
}
8356 
8357 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers
8358 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
8359 /// instead of as a custom inserter because we need the use list from the SDNode.
8360 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
8361                                    MachineInstr *MI, const SDNode *Node) {
8362   bool isThumb1 = Subtarget->isThumb1Only();
8363 
8364   DebugLoc DL = MI->getDebugLoc();
8365   MachineFunction *MF = MI->getParent()->getParent();
8366   MachineRegisterInfo &MRI = MF->getRegInfo();
8367   MachineInstrBuilder MIB(*MF, MI);
8368 
8369   // If the new dst/src is unused mark it as dead.
8370   if (!Node->hasAnyUseOfValue(0)) {
8371     MI->getOperand(0).setIsDead(true);
8372   }
8373   if (!Node->hasAnyUseOfValue(1)) {
8374     MI->getOperand(1).setIsDead(true);
8375   }
8376 
8377   // The MEMCPY both defines and kills the scratch registers.
8378   for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) {
8379     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
8380                                                          : &ARM::GPRRegClass);
8381     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
8382   }
8383 }
8384 
/// Post-isel fixup hook. Attaches scratch registers to MEMCPY, and for
/// flag-setting pseudos (ADCS/SBCS/RSBS/RSCS) converts the implicit CPSR
/// def coming out of isel into the instruction's optional cc_out operand.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
  if (MI->getOpcode() == ARM::MEMCPY) {
    attachMEMCPYScratchRegs(Subtarget, MI, Node);
    return;
  }

  const MCInstrDesc *MCID = &MI->getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
  if (NewOpc) {
    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
    MCID = &TII->get(NewOpc);

    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
           "converted opcode should be the same except for cc_out");

    MI->setDesc(*MCID);

    // Add the optional cc_out operand
    // Starts as noreg; may be upgraded to a CPSR def below.
    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
  }
  // cc_out is by convention the last explicit operand.
  unsigned ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  // Implicit operands live past the explicit ones, so start scanning there.
  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      MI->RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    // Dead CPSR: leave cc_out as noreg so the instruction doesn't set flags.
    assert(!MI->getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI->getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}
8454 
8455 //===----------------------------------------------------------------------===//
8456 //                           ARM Optimization Hooks
8457 //===----------------------------------------------------------------------===//
8458 
8459 // Helper function that checks if N is a null or all ones constant.
8460 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
8461   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
8462 }
8463 
8464 // Return true if N is conditionally 0 or all ones.
8465 // Detects these expressions where cc is an i1 value:
8466 //
8467 //   (select cc 0, y)   [AllOnes=0]
8468 //   (select cc y, 0)   [AllOnes=0]
8469 //   (zext cc)          [AllOnes=0]
8470 //   (sext cc)          [AllOnes=0/1]
8471 //   (select cc -1, y)  [AllOnes=1]
8472 //   (select cc y, -1)  [AllOnes=1]
8473 //
8474 // Invert is set when N is the null/all ones constant when CC is false.
8475 // OtherOp is set to the alternative value of N.
8476 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
8477                                        SDValue &CC, bool &Invert,
8478                                        SDValue &OtherOp,
8479                                        SelectionDAG &DAG) {
8480   switch (N->getOpcode()) {
8481   default: return false;
8482   case ISD::SELECT: {
8483     CC = N->getOperand(0);
8484     SDValue N1 = N->getOperand(1);
8485     SDValue N2 = N->getOperand(2);
8486     if (isZeroOrAllOnes(N1, AllOnes)) {
8487       Invert = false;
8488       OtherOp = N2;
8489       return true;
8490     }
8491     if (isZeroOrAllOnes(N2, AllOnes)) {
8492       Invert = true;
8493       OtherOp = N1;
8494       return true;
8495     }
8496     return false;
8497   }
8498   case ISD::ZERO_EXTEND:
8499     // (zext cc) can never be the all ones value.
8500     if (AllOnes)
8501       return false;
8502     // Fall through.
8503   case ISD::SIGN_EXTEND: {
8504     SDLoc dl(N);
8505     EVT VT = N->getValueType(0);
8506     CC = N->getOperand(0);
8507     if (CC.getValueType() != MVT::i1)
8508       return false;
8509     Invert = !AllOnes;
8510     if (AllOnes)
8511       // When looking for an AllOnes constant, N is an sext, and the 'other'
8512       // value is 0.
8513       OtherOp = DAG.getConstant(0, dl, VT);
8514     else if (N->getOpcode() == ISD::ZERO_EXTEND)
8515       // When looking for a 0 constant, N can be zext or sext.
8516       OtherOp = DAG.getConstant(1, dl, VT);
8517     else
8518       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
8519                                 VT);
8520     return true;
8521   }
8522   }
8523 }
8524 
// Combine a constant select operand into its use:
//
//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
// is null, or all ones when AllOnes is set.
//
// Also recognize sext/zext from i1:
//
//   (add (zext cc), x) -> (select cc (add x, 1), x)
//   (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  // Bail out unless Slct is a select (or i1 zext/sext) with a null/all-ones
  // arm; on success CCOp/SwapSelectOps/NonConstantVal describe that select.
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}
8573 
8574 // Attempt combineSelectAndUse on each operand of a commutative operator N.
8575 static
8576 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
8577                                        TargetLowering::DAGCombinerInfo &DCI) {
8578   SDValue N0 = N->getOperand(0);
8579   SDValue N1 = N->getOperand(1);
8580   if (N0.getNode()->hasOneUse())
8581     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
8582       return Result;
8583   if (N1.getNode()->hasOneUse())
8584     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
8585       return Result;
8586   return SDValue();
8587 }
8588 
// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
// (only after legalization).
//
// Matches an add of two BUILD_VECTORs whose lanes are the even and odd
// elements of one common source vector:
//   (add (build_vector (extract_elt Vec, 0), (extract_elt Vec, 2), ...),
//        (build_vector (extract_elt Vec, 1), (extract_elt Vec, 3), ...))
// and rewrites it as a single pairwise-add-long intrinsic on Vec, followed by
// a truncate/extend back to the original result type.
static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {

  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and odd or even
  // index such that we have a pair wise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  // Running expected even-lane index; the loop advances it by 2 per pair.
  unsigned nextIndex = 0;

  // For each operands to the ADD which are BUILD_VECTORs,
  // check to see if each of their operands are an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify its the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify its correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  // Always use the signed form; the extra (sign-extended) high half of each
  // widened lane is discarded by the TRUNCATE below when the result type is
  // narrower, so the choice of sign does not affect the retained bits.
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  // vpaddl produces lanes twice as wide as its input lanes.
  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  // Convert the intrinsic's result back to the type of the original add.
  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}
8681 
8682 static SDValue findMUL_LOHI(SDValue V) {
8683   if (V->getOpcode() == ISD::UMUL_LOHI ||
8684       V->getOpcode() == ISD::SMUL_LOHI)
8685     return V;
8686   return SDValue();
8687 }
8688 
// Fold a U/SMUL_LOHI whose results feed a glued ADDC/ADDE pair into a single
// ARMISD::UMLAL/SMLAL node (64-bit multiply-accumulate).
static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {

  // The MLAL forms used below are not available in Thumb1.
  if (Subtarget->isThumb1Only()) return SDValue();

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                /          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :glue  /
  //                  \      /
  //                    ADDE   <- hiAdd
  //
  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
  SDValue AddcOp0 = AddcNode->getOperand(0);
  SDValue AddcOp1 = AddcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcOp0.getNode() == AddcOp1.getNode())
    return SDValue();

  assert(AddcNode->getNumValues() == 2 &&
         AddcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that we have a glued ADDC node.
  if (AddcNode->getValueType(1) != MVT::Glue)
    return SDValue();

  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  // Look for the glued ADDE.
  SDNode* AddeNode = AddcNode->getGluedUser();
  if (!AddeNode)
    return SDValue();

  // Make sure it is really an ADDE.
  if (AddeNode->getOpcode() != ISD::ADDE)
    return SDValue();

  assert(AddeNode->getNumOperands() == 3 &&
         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
         "ADDE node has the wrong inputs");

  // Check for the triangle shape.
  SDValue AddeOp0 = AddeNode->getOperand(0);
  SDValue AddeOp1 = AddeNode->getOperand(1);

  // Make sure that the ADDE operands are not coming from the same node.
  if (AddeOp0.getNode() == AddeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeOp0);
  if (MULOp == SDValue())
   MULOp = findMUL_LOHI(AddeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue* HiAdd = nullptr;
  SDValue* LoMul = nullptr;
  SDValue* LowAdd = nullptr;

  // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
    return SDValue();

  // The ADDE operand that is not the MUL's high result is the high addend.
  if (IsLeftOperandMUL)
    HiAdd = &AddeOp1;
  else
    HiAdd = &AddeOp0;


  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC we are checking.

  if (AddcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcOp0;
    LowAdd = &AddcOp1;
  }
  if (AddcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcOp1;
    LowAdd = &AddcOp0;
  }

  // Bail out when the ADDC does not consume this MUL's low result.
  if (!LoMul)
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));
  Ops.push_back(*LowAdd);
  Ops.push_back(*HiAdd);

  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}
8824 
8825 /// PerformADDCCombine - Target-specific dag combine transform from
8826 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
8827 static SDValue PerformADDCCombine(SDNode *N,
8828                                  TargetLowering::DAGCombinerInfo &DCI,
8829                                  const ARMSubtarget *Subtarget) {
8830 
8831   return AddCombineTo64bitMLAL(N, DCI, Subtarget);
8832 
8833 }
8834 
8835 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
8836 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
8837 /// called with the default operands, and if that fails, with commuted
8838 /// operands.
8839 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
8840                                           TargetLowering::DAGCombinerInfo &DCI,
8841                                           const ARMSubtarget *Subtarget){
8842 
8843   // Attempt to create vpaddl for this add.
8844   if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
8845     return Result;
8846 
8847   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
8848   if (N0.getNode()->hasOneUse())
8849     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
8850       return Result;
8851   return SDValue();
8852 }
8853 
8854 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
8855 ///
8856 static SDValue PerformADDCombine(SDNode *N,
8857                                  TargetLowering::DAGCombinerInfo &DCI,
8858                                  const ARMSubtarget *Subtarget) {
8859   SDValue N0 = N->getOperand(0);
8860   SDValue N1 = N->getOperand(1);
8861 
8862   // First try with the default operand order.
8863   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
8864     return Result;
8865 
8866   // If that didn't work, try again with the operands commuted.
8867   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
8868 }
8869 
8870 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
8871 ///
8872 static SDValue PerformSUBCombine(SDNode *N,
8873                                  TargetLowering::DAGCombinerInfo &DCI) {
8874   SDValue N0 = N->getOperand(0);
8875   SDValue N1 = N->getOperand(1);
8876 
8877   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
8878   if (N1.getNode()->hasOneUse())
8879     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
8880       return Result;
8881 
8882   return SDValue();
8883 }
8884 
8885 /// PerformVMULCombine
8886 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
8887 /// special multiplier accumulator forwarding.
8888 ///   vmul d3, d0, d2
8889 ///   vmla d3, d1, d2
8890 /// is faster than
8891 ///   vadd d3, d0, d1
8892 ///   vmul d3, d3, d2
8893 //  However, for (A + B) * (A + B),
8894 //    vadd d2, d0, d1
8895 //    vmul d3, d0, d2
8896 //    vmla d3, d1, d2
8897 //  is slower than
8898 //    vadd d2, d0, d1
8899 //    vmul d3, d2, d2
8900 static SDValue PerformVMULCombine(SDNode *N,
8901                                   TargetLowering::DAGCombinerInfo &DCI,
8902                                   const ARMSubtarget *Subtarget) {
8903   if (!Subtarget->hasVMLxForwarding())
8904     return SDValue();
8905 
8906   SelectionDAG &DAG = DCI.DAG;
8907   SDValue N0 = N->getOperand(0);
8908   SDValue N1 = N->getOperand(1);
8909   unsigned Opcode = N0.getOpcode();
8910   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8911       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
8912     Opcode = N1.getOpcode();
8913     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8914         Opcode != ISD::FADD && Opcode != ISD::FSUB)
8915       return SDValue();
8916     std::swap(N0, N1);
8917   }
8918 
8919   if (N0 == N1)
8920     return SDValue();
8921 
8922   EVT VT = N->getValueType(0);
8923   SDLoc DL(N);
8924   SDValue N00 = N0->getOperand(0);
8925   SDValue N01 = N0->getOperand(1);
8926   return DAG.getNode(Opcode, DL, VT,
8927                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
8928                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
8929 }
8930 
// Try to replace (mul x, C) with a cheaper shift/add/sub sequence for
// constants of the form +/-(2^N +/- 1) scaled by a power of two.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  // Thumb1 has no flexible shifter operand; leave the mul alone.
  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  // Vector multiplies go to the VMUL distribution combine instead; only
  // scalar i32 is handled below.
  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  // Factor MulAmt as OddPart << ShiftAmt: handle the odd part with an
  // add/sub-of-shift and re-apply ShiftAmt as a final shl.
  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  // Clamp to a valid i32 shift (also guards MulAmt == 0, where
  // countTrailingZeros returns the full word width).
  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      // Negate via (sub 0, Res).
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);

    } else
      return SDValue();
  }

  // Re-apply the power-of-two factor stripped off above.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
9014 
9015 static SDValue PerformANDCombine(SDNode *N,
9016                                  TargetLowering::DAGCombinerInfo &DCI,
9017                                  const ARMSubtarget *Subtarget) {
9018 
9019   // Attempt to use immediate-form VBIC
9020   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9021   SDLoc dl(N);
9022   EVT VT = N->getValueType(0);
9023   SelectionDAG &DAG = DCI.DAG;
9024 
9025   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9026     return SDValue();
9027 
9028   APInt SplatBits, SplatUndef;
9029   unsigned SplatBitSize;
9030   bool HasAnyUndefs;
9031   if (BVN &&
9032       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9033     if (SplatBitSize <= 64) {
9034       EVT VbicVT;
9035       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
9036                                       SplatUndef.getZExtValue(), SplatBitSize,
9037                                       DAG, dl, VbicVT, VT.is128BitVector(),
9038                                       OtherModImm);
9039       if (Val.getNode()) {
9040         SDValue Input =
9041           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
9042         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
9043         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
9044       }
9045     }
9046   }
9047 
9048   if (!Subtarget->isThumb1Only()) {
9049     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
9050     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
9051       return Result;
9052   }
9053 
9054   return SDValue();
9055 }
9056 
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // (or x, splat-imm) -> VORRIMM when the splat fits a NEON modified-imm.
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
  }

  // The code below optimizes (or (and X, Y), Z).
  // The AND operand needs to have a single user to make these optimizations
  // profitable.
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                          HasAnyUndefs) && !HasAnyUndefs) {
            // Ensure that the bit width of the constants are the same and that
            // the splat arguments are logical inverses as per the pattern we
            // are trying to simplify.
            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
                SplatBits0 == ~SplatBits1) {
                // Canonicalize the vector type to make instruction selection
                // simpler.
                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                             N0->getOperand(1),
                                             N0->getOperand(0),
                                             N1->getOperand(0));
                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
            }
        }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.

  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The OR'd value must lie entirely outside the kept mask bits.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the inserted value down to field position 0 as BFI expects.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, Res, false);
  }

  return SDValue();
}
9253 
9254 static SDValue PerformXORCombine(SDNode *N,
9255                                  TargetLowering::DAGCombinerInfo &DCI,
9256                                  const ARMSubtarget *Subtarget) {
9257   EVT VT = N->getValueType(0);
9258   SelectionDAG &DAG = DCI.DAG;
9259 
9260   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9261     return SDValue();
9262 
9263   if (!Subtarget->isThumb1Only()) {
9264     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
9265     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
9266       return Result;
9267   }
9268 
9269   return SDValue();
9270 }
9271 
9272 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
9273 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
9274 // their position in "to" (Rd).
9275 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
9276   assert(N->getOpcode() == ARMISD::BFI);
9277 
9278   SDValue From = N->getOperand(1);
9279   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
9280   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
9281 
9282   // If the Base came from a SHR #C, we can deduce that it is really testing bit
9283   // #C in the base of the SHR.
9284   if (From->getOpcode() == ISD::SRL &&
9285       isa<ConstantSDNode>(From->getOperand(1))) {
9286     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
9287     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
9288     FromMask <<= Shift.getLimitedValue(31);
9289     From = From->getOperand(0);
9290   }
9291 
9292   return From;
9293 }
9294 
9295 // If A and B contain one contiguous set of bits, does A | B == A . B?
9296 //
9297 // Neither A nor B must be zero.
9298 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
9299   unsigned LastActiveBitInA =  A.countTrailingZeros();
9300   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
9301   return LastActiveBitInA - 1 == FirstActiveBitInB;
9302 }
9303 
// FindBFIToCombineWith - Given a BFI node N, walk the chain of BFIs feeding
// its destination operand and return the first BFI whose inserted bits come
// from the same base value as N's and abut N's bits (in both their "from" and
// "to" positions), so the two inserts could be merged into one.  Returns
// SDValue() if no such BFI is found or if a conflicting write makes further
// searching unsafe.
static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
  // if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  // CombinedToMask accumulates every destination bit written by N and by the
  // BFIs walked past so far; it is used to detect conflicting writes below.
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    // Check both orderings: our bits above the candidate's, and vice versa.
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}
9347 
/// PerformBFICombine - Target-specific dag combine xforms for ARMISD::BFI.
/// Removes a redundant AND feeding the inserted value, or merges two BFIs
/// that insert adjacent bit ranges from the same base value.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    // Operand 2 is the BFI's destination mask in inverted form; recover the
    // LSB position and width of the inserted field from it.
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    // The AND is redundant if it preserves every bit the BFI demands.
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the merged source field doesn't start at bit 0, shift the base right
    // so the field lines up with the low bits that BFI reads from.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
        ISD::SRL, dl, VT, From1,
        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    // BFI's mask operand is stored inverted, hence ~NewToMask.
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}
9404 
/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  // Only done for single-use, non-volatile f64 loads from a frame index.
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    // First i32 load reads from the original address with the original
    // alignment.
    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
                                 LD->getPointerInfo(), LD->isVolatile(),
                                 LD->isNonTemporal(), LD->isInvariant(),
                                 LD->getAlignment());

    // Second i32 load reads from address + 4; its alignment is capped at 4
    // and at half the original load's alignment.
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
                                 LD->getPointerInfo(), LD->isVolatile(),
                                 LD->isNonTemporal(), LD->isInvariant(),
                                 std::min(4U, LD->getAlignment() / 2));

    // Redirect users of the original load's chain to the second load's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    // On big-endian targets the result halves come back in the other order.
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}
9448 
9449 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
9450 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
9451 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
9452   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
9453   SDValue Op0 = N->getOperand(0);
9454   SDValue Op1 = N->getOperand(1);
9455   if (Op0.getOpcode() == ISD::BITCAST)
9456     Op0 = Op0.getOperand(0);
9457   if (Op1.getOpcode() == ISD::BITCAST)
9458     Op1 = Op1.getOperand(0);
9459   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
9460       Op0.getNode() == Op1.getNode() &&
9461       Op0.getResNo() == 0 && Op1.getResNo() == 1)
9462     return DAG.getNode(ISD::BITCAST, SDLoc(N),
9463                        N->getValueType(0), Op0.getOperand(0));
9464   return SDValue();
9465 }
9466 
9467 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
9468 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
9469 /// i64 vector to have f64 elements, since the value can then be loaded
9470 /// directly into a VFP register.
9471 static bool hasNormalLoadOperand(SDNode *N) {
9472   unsigned NumElts = N->getValueType(0).getVectorNumElements();
9473   for (unsigned i = 0; i < NumElts; ++i) {
9474     SDNode *Elt = N->getOperand(i).getNode();
9475     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
9476       return true;
9477   }
9478   return false;
9479 }
9480 
9481 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
9482 /// ISD::BUILD_VECTOR.
9483 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
9484                                           TargetLowering::DAGCombinerInfo &DCI,
9485                                           const ARMSubtarget *Subtarget) {
9486   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
9487   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
9488   // into a pair of GPRs, which is fine when the value is used as a scalar,
9489   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
9490   SelectionDAG &DAG = DCI.DAG;
9491   if (N->getNumOperands() == 2)
9492     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
9493       return RV;
9494 
9495   // Load i64 elements as f64 values so that type legalization does not split
9496   // them up into i32 values.
9497   EVT VT = N->getValueType(0);
9498   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
9499     return SDValue();
9500   SDLoc dl(N);
9501   SmallVector<SDValue, 8> Ops;
9502   unsigned NumElts = VT.getVectorNumElements();
9503   for (unsigned i = 0; i < NumElts; ++i) {
9504     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
9505     Ops.push_back(V);
9506     // Make the DAGCombiner fold the bitcast.
9507     DCI.AddToWorklist(V.getNode());
9508   }
9509   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
9510   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
9511   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
9512 }
9513 
/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR in something more vector friendly, i.e., that does not
  // force to use floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to a integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands are 32-bits (64-bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  // Start from an undef i32 vector and insert each (bitcast) element.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    // Undef lanes need no insert; leave the undef from getUNDEF in place.
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  // Cast back to the original floating-point vector type for the user.
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}
9605 
9606 /// PerformInsertEltCombine - Target-specific dag combine xforms for
9607 /// ISD::INSERT_VECTOR_ELT.
9608 static SDValue PerformInsertEltCombine(SDNode *N,
9609                                        TargetLowering::DAGCombinerInfo &DCI) {
9610   // Bitcast an i64 load inserted into a vector to f64.
9611   // Otherwise, the i64 value will be legalized to a pair of i32 values.
9612   EVT VT = N->getValueType(0);
9613   SDNode *Elt = N->getOperand(1).getNode();
9614   if (VT.getVectorElementType() != MVT::i64 ||
9615       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
9616     return SDValue();
9617 
9618   SelectionDAG &DAG = DCI.DAG;
9619   SDLoc dl(N);
9620   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
9621                                  VT.getVectorNumElements());
9622   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
9623   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
9624   // Make the DAGCombiner fold the bitcasts.
9625   DCI.AddToWorklist(Vec.getNode());
9626   DCI.AddToWorklist(V.getNode());
9627   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
9628                                Vec, V, N->getOperand(2));
9629   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
9630 }
9631 
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors.  That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector.  Do that
  // transformation here:
  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
  //   shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Both inputs must be two-operand CONCAT_VECTORS nodes.
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  // The high half of each concat must be undef for the rewrite to be valid.
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts/2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    // Elements that referenced either undef high half become undef (-1).
    int NewElt = -1;
    if (MaskElt < (int)HalfElts)
      // Low half of the first concat: index is unchanged.
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      // Low half of the second concat: remap into the high half of NewConcat.
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask.data());
}
9683 
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  // Intrinsics and stores carry the address at operand 2 (after chain and
  // intrinsic-id / stored-value); plain loads carry it at operand 1.
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle.
    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    // Lane operations only touch one element per vector.
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    // The increment is whichever ADD operand is not the address itself.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint64_t IncVal = CInc->getZExtValue();
      if (IncVal != NumBytes)
        continue;
    } else if (NumBytes >= 3 * 16) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrisics).  We need to change the
    //   memory type to match the explicit alignment.  That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    // Results are: NumVecs loaded vectors (loads only), the written-back
    // base address (i32), and the output chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value.  Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
                                           Ops, AlignedVecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is an non-standard-aligned LOAD, the first result is the loaded
    // value.  Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    // Replace the ADD with the write-back address result of the new node.
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}
9893 
9894 static SDValue PerformVLDCombine(SDNode *N,
9895                                  TargetLowering::DAGCombinerInfo &DCI) {
9896   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9897     return SDValue();
9898 
9899   return CombineBaseUpdate(N, DCI);
9900 }
9901 
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.  On success the replacement is registered through
/// DCI.CombineTo, so the caller only needs to return SDValue(N, 0).
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  // Map the vldN-lane intrinsic to its vldN-dup node and record how many
  // vector results it produces.
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.  If any other kind of use exists, the transform
  // would drop data, so bail out.
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  // Result types: one vector per loaded register, then the chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  // Keep only the chain and the intrinsic's address operand (operand 2);
  // the source-register and lane operands are no longer needed.
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.  Each VDUPLANE user is replaced directly with the
  // corresponding vector result of the new node.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}
9982 
9983 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
9984 /// ARMISD::VDUPLANE.
9985 static SDValue PerformVDUPLANECombine(SDNode *N,
9986                                       TargetLowering::DAGCombinerInfo &DCI) {
9987   SDValue Op = N->getOperand(0);
9988 
9989   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
9990   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
9991   if (CombineVLDDUP(N, DCI))
9992     return SDValue(N, 0);
9993 
9994   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
9995   // redundant.  Ignore bit_converts for now; element sizes are checked below.
9996   while (Op.getOpcode() == ISD::BITCAST)
9997     Op = Op.getOperand(0);
9998   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
9999     return SDValue();
10000 
10001   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
10002   unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
10003   // The canonical VMOV for a zero vector uses a 32-bit element size.
10004   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10005   unsigned EltBits;
10006   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
10007     EltSize = 8;
10008   EVT VT = N->getValueType(0);
10009   if (EltSize > VT.getVectorElementType().getSizeInBits())
10010     return SDValue();
10011 
10012   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
10013 }
10014 
10015 static SDValue PerformLOADCombine(SDNode *N,
10016                                   TargetLowering::DAGCombinerInfo &DCI) {
10017   EVT VT = N->getValueType(0);
10018 
10019   // If this is a legal vector load, try to combine it into a VLD1_UPD.
10020   if (ISD::isNormalLoad(N) && VT.isVector() &&
10021       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10022     return CombineBaseUpdate(N, DCI);
10023 
10024   return SDValue();
10025 }
10026 
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.  Handles, in order: widening truncating vector stores via a
/// shuffle, splitting VMOVDRR stores into two integer stores, bitcasting
/// i64 vector-extract stores to f64, and folding base-register updates
/// into vector stores.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  // Volatile stores must not be split or rewritten.
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
  // pack all of the elements in one place.  Next, store to memory in fewer
  // chunks.
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

    unsigned SizeRatio  = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDLoc DL(St);
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    // Gather the significant narrow sub-element of each wide element into
    // the low lanes.  On big-endian targets that sub-element is the last
    // one of each group rather than the first.
    for (unsigned i = 0; i < NumElems; ++i)
      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                          ? (i + 1) * SizeRatio - 1
                          : i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                DAG.getUNDEF(WideVec.getValueType()),
                                ShuffleVec.data());
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                        TLI.getPointerTy(DAG.getDataLayout()));
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory, advancing the pointer by
    // one store unit each iteration and collecting the chains.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I, DL));
      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                St->getPointerInfo(), St->isVolatile(),
                                St->isNonTemporal(), St->getAlignment());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    // Tie the individual store chains back together.
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG  &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    // Store the VMOVDRR operand that belongs at the lower address first;
    // which operand that is depends on endianness.
    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
                                  St->isNonTemporal(), St->getAlignment());

    // Then store the other operand at offset 4.  Its alignment is capped at
    // 4 bytes and can be no better than half the original alignment.
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
                        St->isNonTemporal(),
                        std::min(4U, St->getAlignment() / 2));
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->isVolatile(),
                        St->isNonTemporal(), St->getAlignment(),
                        St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}
10174 
/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  // The conversion's source must be a simple-typed vector FMUL...
  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  // ...whose second operand is a build_vector constant (validated as a
  // power-of-2 splat below).
  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more then 4 lanes, since these intructions
    // only support v2i32/v4i32 types.
    return SDValue();
  }

  // The multiplier must splat 2^C with 1 <= C <= 32; C becomes the
  // fraction-bits immediate of the fixed-point VCVT.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  // Narrow the i32 result back down when the original conversion produced a
  // smaller integer type.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}
10231 
/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  // The dividend must come from an int-to-fp conversion and the result must
  // be a simple-typed vector.
  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  // The divisor must be a build_vector constant (validated as a power-of-2
  // splat below).
  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more then 4 lanes, since these intructions
    // only support v2i32/v4i32 types.
    return SDValue();
  }

  // The divisor must splat 2^C with 1 <= C <= 32; C becomes the
  // fraction-bits immediate of the fixed-point VCVT.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  // Widen narrower integer inputs up to i32 first, preserving signedness.
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
10290 
10291 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
10292 /// operand of a vector shift operation, where all the elements of the
10293 /// build_vector must have the same constant integer value.
10294 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10295   // Ignore bit_converts.
10296   while (Op.getOpcode() == ISD::BITCAST)
10297     Op = Op.getOperand(0);
10298   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10299   APInt SplatBits, SplatUndef;
10300   unsigned SplatBitSize;
10301   bool HasAnyUndefs;
10302   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10303                                       HasAnyUndefs, ElementBits) ||
10304       SplatBitSize > ElementBits)
10305     return false;
10306   Cnt = SplatBits.getSExtValue();
10307   return true;
10308 }
10309 
10310 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10311 /// operand of a vector shift left operation.  That value must be in the range:
10312 ///   0 <= Value < ElementBits for a left shift; or
10313 ///   0 <= Value <= ElementBits for a long left shift.
10314 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10315   assert(VT.isVector() && "vector shift count is not a vector type");
10316   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
10317   if (! getVShiftImm(Op, ElementBits, Cnt))
10318     return false;
10319   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
10320 }
10321 
10322 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10323 /// operand of a vector shift right operation.  For a shift opcode, the value
10324 /// is positive, but for an intrinsic the value count must be negative. The
10325 /// absolute value must be in the range:
10326 ///   1 <= |Value| <= ElementBits for a right shift; or
10327 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
10328 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
10329                          int64_t &Cnt) {
10330   assert(VT.isVector() && "vector shift count is not a vector type");
10331   int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
10332   if (! getVShiftImm(Op, ElementBits, Cnt))
10333     return false;
10334   if (!isIntrinsic)
10335     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
10336   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
10337     Cnt = -Cnt;
10338     return true;
10339   }
10340   return false;
10341 }
10342 
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
/// Lowers NEON shift intrinsics whose shift amount is a constant splat to
/// the corresponding target-specific shift nodes.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First pass: validate the shift count for this intrinsic and, for the
    // plain shift intrinsics, choose between the left and right forms.
    // Intrinsics whose count cannot be an immediate bail out with SDValue();
    // intrinsics that require an immediate hit llvm_unreachable when the
    // count is invalid.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHL;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                     ARMISD::VSHRs : ARMISD::VSHRu);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second pass: map the remaining intrinsics to their target opcodes.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRs; break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRu; break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRN; break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLs; break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLu; break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsu; break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNs; break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNu; break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsu; break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNs; break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNu; break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsu; break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // Shift-and-insert: a left count selects VSLI, a right count VSRI.
    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLI;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRI;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}
10485 
10486 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
10487 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
10488 /// combining instead of DAG legalizing because the build_vectors for 64-bit
10489 /// vector element shift counts are generally not legal, and it is hard to see
10490 /// their values after they get legalized to loads from a constant pool.
10491 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
10492                                    const ARMSubtarget *ST) {
10493   EVT VT = N->getValueType(0);
10494   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
10495     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
10496     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
10497     SDValue N1 = N->getOperand(1);
10498     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
10499       SDValue N0 = N->getOperand(0);
10500       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
10501           DAG.MaskedValueIsZero(N0.getOperand(0),
10502                                 APInt::getHighBitsSet(32, 16)))
10503         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
10504     }
10505   }
10506 
10507   // Nothing to be done for scalar shifts.
10508   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10509   if (!VT.isVector() || !TLI.isTypeLegal(VT))
10510     return SDValue();
10511 
10512   assert(ST->hasNEON() && "unexpected vector shift");
10513   int64_t Cnt;
10514 
10515   switch (N->getOpcode()) {
10516   default: llvm_unreachable("unexpected shift opcode");
10517 
10518   case ISD::SHL:
10519     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
10520       SDLoc dl(N);
10521       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
10522                          DAG.getConstant(Cnt, dl, MVT::i32));
10523     }
10524     break;
10525 
10526   case ISD::SRA:
10527   case ISD::SRL:
10528     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
10529       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
10530                             ARMISD::VSHRs : ARMISD::VSHRu);
10531       SDLoc dl(N);
10532       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
10533                          DAG.getConstant(Cnt, dl, MVT::i32));
10534     }
10535   }
10536   return SDValue();
10537 }
10538 
10539 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
10540 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
10541 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
10542                                     const ARMSubtarget *ST) {
10543   SDValue N0 = N->getOperand(0);
10544 
10545   // Check for sign- and zero-extensions of vector extract operations of 8-
10546   // and 16-bit vector elements.  NEON supports these directly.  They are
10547   // handled during DAG combining because type legalization will promote them
10548   // to 32-bit types and it is messy to recognize the operations after that.
10549   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10550     SDValue Vec = N0.getOperand(0);
10551     SDValue Lane = N0.getOperand(1);
10552     EVT VT = N->getValueType(0);
10553     EVT EltVT = N0.getValueType();
10554     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10555 
10556     if (VT == MVT::i32 &&
10557         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
10558         TLI.isTypeLegal(Vec.getValueType()) &&
10559         isa<ConstantSDNode>(Lane)) {
10560 
10561       unsigned Opc = 0;
10562       switch (N->getOpcode()) {
10563       default: llvm_unreachable("unexpected opcode");
10564       case ISD::SIGN_EXTEND:
10565         Opc = ARMISD::VGETLANEs;
10566         break;
10567       case ISD::ZERO_EXTEND:
10568       case ISD::ANY_EXTEND:
10569         Opc = ARMISD::VGETLANEu;
10570         break;
10571       }
10572       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
10573     }
10574   }
10575 
10576   return SDValue();
10577 }
10578 
10579 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
10580                              APInt &KnownOne) {
10581   if (Op.getOpcode() == ARMISD::BFI) {
10582     // Conservatively, we can recurse down the first operand
10583     // and just mask out all affected bits.
10584     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
10585 
10586     // The operand to BFI is already a mask suitable for removing the bits it
10587     // sets.
10588     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
10589     APInt Mask = CI->getAPIntValue();
10590     KnownZero &= Mask;
10591     KnownOne &= Mask;
10592     return;
10593   }
10594   if (Op.getOpcode() == ARMISD::CMOV) {
10595     APInt KZ2(KnownZero.getBitWidth(), 0);
10596     APInt KO2(KnownOne.getBitWidth(), 0);
10597     computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
10598     computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
10599 
10600     KnownZero &= KZ2;
10601     KnownOne &= KO2;
10602     return;
10603   }
10604   return DAG.computeKnownBits(Op, KnownZero, KnownOne);
10605 }
10606 
/// PerformCMOVToBFICombine - Try to replace a CMOV selecting between y and
/// (y | CM), guarded by a single-bit test of x, with a chain of BFI
/// (bitfield-insert) instructions. Returns the replacement value, or an
/// empty SDValue if the pattern does not match or is unprofitable.
SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  // CMOV operands: (FalseVal, TrueVal, ARMcc, CCR, Cmp).
  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  // The compared value must be "x & CN" with CN a power of two, i.e. a
  // single-bit test.
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!AndC || !AndC->getAPIntValue().isPowerOf2())
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  // After canonicalization, Op1 is the value selected when the tested bit is
  // set: it must be "y | CM" with CM constant, and Op0 must be y itself.
  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  // We emit one BFI per set bit of CM, so bound the popcount per the cost
  // discussion above (3 for Thumb, 2 for ARM).
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  APInt KnownZero, KnownOne;
  computeKnownBits(DAG, Y, KnownZero, KnownOne);
  if ((OrCI & KnownZero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->getAPIntValue().logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    // Bring the tested bit of X down to bit 0 so each BFI inserts it.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  // Insert bit 0 of X into every position where CM has a set bit.
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}
10697 
10698 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
10699 SDValue
10700 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
10701   SDValue Cmp = N->getOperand(4);
10702   if (Cmp.getOpcode() != ARMISD::CMPZ)
10703     // Only looking at NE cases.
10704     return SDValue();
10705 
10706   EVT VT = N->getValueType(0);
10707   SDLoc dl(N);
10708   SDValue LHS = Cmp.getOperand(0);
10709   SDValue RHS = Cmp.getOperand(1);
10710   SDValue Chain = N->getOperand(0);
10711   SDValue BB = N->getOperand(1);
10712   SDValue ARMcc = N->getOperand(2);
10713   ARMCC::CondCodes CC =
10714     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
10715 
10716   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
10717   // -> (brcond Chain BB CC CPSR Cmp)
10718   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
10719       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
10720       LHS->getOperand(0)->hasOneUse()) {
10721     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
10722     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
10723     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
10724     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
10725     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
10726         (LHS01C && LHS01C->getZExtValue() == 1) &&
10727         (LHS1C && LHS1C->getZExtValue() == 1) &&
10728         (RHSC && RHSC->getZExtValue() == 0)) {
10729       return DAG.getNode(
10730           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
10731           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
10732     }
10733   }
10734 
10735   return SDValue();
10736 }
10737 
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  // CMOV operands: (FalseVal, TrueVal, ARMcc, CCR, Cmp).
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    // First try folding the whole select into a BFI sequence.
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    // On EQ, LHS == RHS == FalseVal, so selecting LHS instead of FalseVal
    // is equivalent and removes the extra copy.
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    // Mirror case: rebuild the compare with an NE condition and select
    // between LHS and the original false value.
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  if (Res.getNode()) {
    APInt KnownZero, KnownOne;
    DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
    // Capture demanded bits information that would be otherwise lost.
    // Known-zero masks below are the i32 complements of 1/8/16 low bits.
    if (KnownZero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (KnownZero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (KnownZero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
10823 
/// PerformDAGCombine - Dispatch the target-specific DAG combine matching the
/// node's opcode. Returns a replacement value, or an empty SDValue when no
/// combine applies.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // NEON load/store intrinsics go through the same combine as the
    // VLDxDUP nodes above; operand 1 holds the intrinsic ID.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}
10888 
10889 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
10890                                                           EVT VT) const {
10891   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
10892 }
10893 
10894 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
10895                                                        unsigned,
10896                                                        unsigned,
10897                                                        bool *Fast) const {
10898   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
10899   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
10900 
10901   switch (VT.getSimpleVT().SimpleTy) {
10902   default:
10903     return false;
10904   case MVT::i8:
10905   case MVT::i16:
10906   case MVT::i32: {
10907     // Unaligned access can use (for example) LRDB, LRDH, LDR
10908     if (AllowsUnaligned) {
10909       if (Fast)
10910         *Fast = Subtarget->hasV7Ops();
10911       return true;
10912     }
10913     return false;
10914   }
10915   case MVT::f64:
10916   case MVT::v2f64: {
10917     // For any little-endian targets with neon, we can support unaligned ld/st
10918     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
10919     // A big-endian target may also explicitly support unaligned accesses
10920     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
10921       if (Fast)
10922         *Fast = true;
10923       return true;
10924     }
10925     return false;
10926   }
10927   }
10928 }
10929 
// Returns true if both alignments satisfy AlignCheck; an alignment of zero
// means "unspecified" and is treated as acceptable.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  auto ok = [AlignCheck](unsigned Align) {
    return Align == 0 || Align % AlignCheck == 0;
  };
  return ok(SrcAlign) && ok(DstAlign);
}
10935 
10936 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
10937                                            unsigned DstAlign, unsigned SrcAlign,
10938                                            bool IsMemset, bool ZeroMemset,
10939                                            bool MemcpyStrSrc,
10940                                            MachineFunction &MF) const {
10941   const Function *F = MF.getFunction();
10942 
10943   // See if we can use NEON instructions for this...
10944   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
10945       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10946     bool Fast;
10947     if (Size >= 16 &&
10948         (memOpAlign(SrcAlign, DstAlign, 16) ||
10949          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
10950       return MVT::v2f64;
10951     } else if (Size >= 8 &&
10952                (memOpAlign(SrcAlign, DstAlign, 8) ||
10953                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
10954                  Fast))) {
10955       return MVT::f64;
10956     }
10957   }
10958 
10959   // Lowering to i32/i16 if the size permits.
10960   if (Size >= 4)
10961     return MVT::i32;
10962   else if (Size >= 2)
10963     return MVT::i16;
10964 
10965   // Let the target-independent logic figure it out.
10966   return MVT::Other;
10967 }
10968 
10969 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
10970   if (Val.getOpcode() != ISD::LOAD)
10971     return false;
10972 
10973   EVT VT1 = Val.getValueType();
10974   if (!VT1.isSimple() || !VT1.isInteger() ||
10975       !VT2.isSimple() || !VT2.isInteger())
10976     return false;
10977 
10978   switch (VT1.getSimpleVT().SimpleTy) {
10979   default: break;
10980   case MVT::i1:
10981   case MVT::i8:
10982   case MVT::i16:
10983     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
10984     return true;
10985   }
10986 
10987   return false;
10988 }
10989 
10990 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
10991   EVT VT = ExtVal.getValueType();
10992 
10993   if (!isTypeLegal(VT))
10994     return false;
10995 
10996   // Don't create a loadext if we can fold the extension into a wide/long
10997   // instruction.
10998   // If there's more than one user instruction, the loadext is desirable no
10999   // matter what.  There can be two uses by the same instruction.
11000   if (ExtVal->use_empty() ||
11001       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
11002     return true;
11003 
11004   SDNode *U = *ExtVal->use_begin();
11005   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
11006        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
11007     return false;
11008 
11009   return true;
11010 }
11011 
11012 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
11013   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11014     return false;
11015 
11016   if (!isTypeLegal(EVT::getEVT(Ty1)))
11017     return false;
11018 
11019   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
11020 
11021   // Assuming the caller doesn't have a zeroext or signext return parameter,
11022   // truncation all the way down to i1 is valid.
11023   return true;
11024 }
11025 
11026 
11027 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
11028   if (V < 0)
11029     return false;
11030 
11031   unsigned Scale = 1;
11032   switch (VT.getSimpleVT().SimpleTy) {
11033   default: return false;
11034   case MVT::i1:
11035   case MVT::i8:
11036     // Scale == 1;
11037     break;
11038   case MVT::i16:
11039     // Scale == 2;
11040     Scale = 2;
11041     break;
11042   case MVT::i32:
11043     // Scale == 4;
11044     Scale = 4;
11045     break;
11046   }
11047 
11048   if ((V & (Scale - 1)) != 0)
11049     return false;
11050   V /= Scale;
11051   return V == (V & ((1LL << 5) - 1));
11052 }
11053 
11054 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
11055                                       const ARMSubtarget *Subtarget) {
11056   bool isNeg = false;
11057   if (V < 0) {
11058     isNeg = true;
11059     V = - V;
11060   }
11061 
11062   switch (VT.getSimpleVT().SimpleTy) {
11063   default: return false;
11064   case MVT::i1:
11065   case MVT::i8:
11066   case MVT::i16:
11067   case MVT::i32:
11068     // + imm12 or - imm8
11069     if (isNeg)
11070       return V == (V & ((1LL << 8) - 1));
11071     return V == (V & ((1LL << 12) - 1));
11072   case MVT::f32:
11073   case MVT::f64:
11074     // Same as ARM mode. FIXME: NEON?
11075     if (!Subtarget->hasVFP2())
11076       return false;
11077     if ((V & 3) != 0)
11078       return false;
11079     V >>= 2;
11080     return V == (V & ((1LL << 8) - 1));
11081   }
11082 }
11083 
11084 /// isLegalAddressImmediate - Return true if the integer value can be used
11085 /// as the offset of the target addressing mode for load / store of the
11086 /// given type.
11087 static bool isLegalAddressImmediate(int64_t V, EVT VT,
11088                                     const ARMSubtarget *Subtarget) {
11089   if (V == 0)
11090     return true;
11091 
11092   if (!VT.isSimple())
11093     return false;
11094 
11095   if (Subtarget->isThumb1Only())
11096     return isLegalT1AddressImmediate(V, VT);
11097   else if (Subtarget->isThumb2())
11098     return isLegalT2AddressImmediate(V, VT, Subtarget);
11099 
11100   // ARM mode.
11101   if (V < 0)
11102     V = - V;
11103   switch (VT.getSimpleVT().SimpleTy) {
11104   default: return false;
11105   case MVT::i1:
11106   case MVT::i8:
11107   case MVT::i32:
11108     // +- imm12
11109     return V == (V & ((1LL << 12) - 1));
11110   case MVT::i16:
11111     // +- imm8
11112     return V == (V & ((1LL << 8) - 1));
11113   case MVT::f32:
11114   case MVT::f64:
11115     if (!Subtarget->hasVFP2()) // FIXME: NEON?
11116       return false;
11117     if ((V & 3) != 0)
11118       return false;
11119     V >>= 2;
11120     return V == (V & ((1LL << 8) - 1));
11121   }
11122 }
11123 
11124 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
11125                                                       EVT VT) const {
11126   int Scale = AM.Scale;
11127   if (Scale < 0)
11128     return false;
11129 
11130   switch (VT.getSimpleVT().SimpleTy) {
11131   default: return false;
11132   case MVT::i1:
11133   case MVT::i8:
11134   case MVT::i16:
11135   case MVT::i32:
11136     if (Scale == 1)
11137       return true;
11138     // r + r << imm
11139     Scale = Scale & ~1;
11140     return Scale == 2 || Scale == 4 || Scale == 8;
11141   case MVT::i64:
11142     // r + r
11143     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11144       return true;
11145     return false;
11146   case MVT::isVoid:
11147     // Note, we allow "void" uses (basically, uses that aren't loads or
11148     // stores), because arm allows folding a scale into many arithmetic
11149     // operations.  This should be made more precise and revisited later.
11150 
11151     // Allow r << imm, but the imm has to be a multiple of two.
11152     if (Scale & 1) return false;
11153     return isPowerOf2_32(Scale);
11154   }
11155 }
11156 
11157 /// isLegalAddressingMode - Return true if the addressing mode represented
11158 /// by AM is legal for this target, for a load/store of the specified type.
11159 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
11160                                               const AddrMode &AM, Type *Ty,
11161                                               unsigned AS) const {
11162   EVT VT = getValueType(DL, Ty, true);
11163   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
11164     return false;
11165 
11166   // Can never fold addr of global into load/store.
11167   if (AM.BaseGV)
11168     return false;
11169 
11170   switch (AM.Scale) {
11171   case 0:  // no scale reg, must be "r+i" or "r", or "i".
11172     break;
11173   case 1:
11174     if (Subtarget->isThumb1Only())
11175       return false;
11176     // FALL THROUGH.
11177   default:
11178     // ARM doesn't support any R+R*scale+imm addr modes.
11179     if (AM.BaseOffs)
11180       return false;
11181 
11182     if (!VT.isSimple())
11183       return false;
11184 
11185     if (Subtarget->isThumb2())
11186       return isLegalT2ScaledAddressingMode(AM, VT);
11187 
11188     int Scale = AM.Scale;
11189     switch (VT.getSimpleVT().SimpleTy) {
11190     default: return false;
11191     case MVT::i1:
11192     case MVT::i8:
11193     case MVT::i32:
11194       if (Scale < 0) Scale = -Scale;
11195       if (Scale == 1)
11196         return true;
11197       // r + r << imm
11198       return isPowerOf2_32(Scale & ~1);
11199     case MVT::i16:
11200     case MVT::i64:
11201       // r + r
11202       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11203         return true;
11204       return false;
11205 
11206     case MVT::isVoid:
11207       // Note, we allow "void" uses (basically, uses that aren't loads or
11208       // stores), because arm allows folding a scale into many arithmetic
11209       // operations.  This should be made more precise and revisited later.
11210 
11211       // Allow r << imm, but the imm has to be a multiple of two.
11212       if (Scale & 1) return false;
11213       return isPowerOf2_32(Scale);
11214     }
11215   }
11216   return true;
11217 }
11218 
11219 /// isLegalICmpImmediate - Return true if the specified immediate is legal
11220 /// icmp immediate, that is the target has icmp instructions which can compare
11221 /// a register against the immediate without having to materialize the
11222 /// immediate into a register.
11223 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
11224   // Thumb2 and ARM modes can use cmn for negative immediates.
11225   if (!Subtarget->isThumb())
11226     return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
11227   if (Subtarget->isThumb2())
11228     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
11229   // Thumb1 doesn't have cmn, and only 8-bit immediates.
11230   return Imm >= 0 && Imm <= 255;
11231 }
11232 
11233 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
11234 /// *or sub* immediate, that is the target has add or sub instructions which can
11235 /// add a register with the immediate without having to materialize the
11236 /// immediate into a register.
11237 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
11238   // Same encoding for add/sub, just flip the sign.
11239   int64_t AbsImm = std::abs(Imm);
11240   if (!Subtarget->isThumb())
11241     return ARM_AM::getSOImmVal(AbsImm) != -1;
11242   if (Subtarget->isThumb2())
11243     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
11244   // Thumb1 only has 8-bit unsigned immediate.
11245   return AbsImm >= 0 && AbsImm <= 255;
11246 }
11247 
/// getARMIndexedAddressParts - Split an address computation Ptr (an ADD or
/// SUB of a base and an offset) into Base/Offset operands usable by an
/// ARM-mode indexed load/store of type VT. Returns true on success; isInc
/// reports whether the base is incremented (true) or decremented (false).
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        // A small negative constant added to the base is re-expressed as a
        // positive decrement (range bound is 8 bits).
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        // Small negative add folded as a decrement (range bound is 12 bits).
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // If one ADD operand is a shift, put it in the Offset position so the
      // shift can be folded into the addressing mode.
      ARM_AM::ShiftOpc ShOpcVal=
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    // SUB: base is operand 0, offset is operand 1 (decrement).
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}
11306 
11307 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
11308                                      bool isSEXTLoad, SDValue &Base,
11309                                      SDValue &Offset, bool &isInc,
11310                                      SelectionDAG &DAG) {
11311   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
11312     return false;
11313 
11314   Base = Ptr->getOperand(0);
11315   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11316     int RHSC = (int)RHS->getZExtValue();
11317     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
11318       assert(Ptr->getOpcode() == ISD::ADD);
11319       isInc = false;
11320       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11321       return true;
11322     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
11323       isInc = Ptr->getOpcode() == ISD::ADD;
11324       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
11325       return true;
11326     }
11327   }
11328 
11329   return false;
11330 }
11331 
11332 /// getPreIndexedAddressParts - returns true by value, base pointer and
11333 /// offset pointer and addressing mode by reference if the node's address
11334 /// can be legally represented as pre-indexed load / store address.
11335 bool
11336 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
11337                                              SDValue &Offset,
11338                                              ISD::MemIndexedMode &AM,
11339                                              SelectionDAG &DAG) const {
11340   if (Subtarget->isThumb1Only())
11341     return false;
11342 
11343   EVT VT;
11344   SDValue Ptr;
11345   bool isSEXTLoad = false;
11346   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
11347     Ptr = LD->getBasePtr();
11348     VT  = LD->getMemoryVT();
11349     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
11350   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
11351     Ptr = ST->getBasePtr();
11352     VT  = ST->getMemoryVT();
11353   } else
11354     return false;
11355 
11356   bool isInc;
11357   bool isLegal = false;
11358   if (Subtarget->isThumb2())
11359     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
11360                                        Offset, isInc, DAG);
11361   else
11362     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
11363                                         Offset, isInc, DAG);
11364   if (!isLegal)
11365     return false;
11366 
11367   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
11368   return true;
11369 }
11370 
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Not supported in Thumb1 mode.
  if (Subtarget->isThumb1Only())
    return false;

  // N is the load/store; extract its memory type and base pointer.
  EVT VT;
  SDValue Ptr;
  bool isSEXTLoad = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT  = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT  = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  // Op is the ADD/SUB computing the updated address; split it into base and
  // offset using the mode-specific helper.
  bool isInc;
  bool isLegal = false;
  if (Subtarget->isThumb2())
    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                       isInc, DAG);
  else
    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
11421 
/// computeKnownBitsForTargetNode - Report known-zero/known-one bits for
/// ARM-specific nodes (and the ldrex/ldaex intrinsics) to the generic
/// known-bits analysis.
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      APInt &KnownZero,
                                                      APInt &KnownOne,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = KnownOne.getBitWidth();
  // Start from "nothing known".
  KnownZero = KnownOne = APInt(BitWidth, 0);
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // These nodes' second result is a boolean
    if (Op.getResNo() == 0)
      break;
    // Result 1 is a single flag bit: everything above bit 0 is zero.
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
    // If nothing is known about the first value, the intersection below
    // could never yield a known bit, so bail out early.
    if (KnownZero == 0 && KnownOne == 0) return;

    APInt KnownZeroRHS, KnownOneRHS;
    DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
    KnownZero &= KnownZeroRHS;
    KnownOne  &= KnownOneRHS;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    // Operand 1 of a chained intrinsic node holds the intrinsic ID.
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // The result is zero-extended from the memory width, so all bits
      // above the loaded width are known zero.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
  }
  }
}
11467 
11468 //===----------------------------------------------------------------------===//
11469 //                           ARM Inline Assembly Support
11470 //===----------------------------------------------------------------------===//
11471 
11472 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
11473   // Looking for "rev" which is V6+.
11474   if (!Subtarget->hasV6Ops())
11475     return false;
11476 
11477   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
11478   std::string AsmStr = IA->getAsmString();
11479   SmallVector<StringRef, 4> AsmPieces;
11480   SplitString(AsmStr, AsmPieces, ";\n");
11481 
11482   switch (AsmPieces.size()) {
11483   default: return false;
11484   case 1:
11485     AsmStr = AsmPieces[0];
11486     AsmPieces.clear();
11487     SplitString(AsmStr, AsmPieces, " \t,");
11488 
11489     // rev $0, $1
11490     if (AsmPieces.size() == 3 &&
11491         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
11492         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
11493       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
11494       if (Ty && Ty->getBitWidth() == 32)
11495         return IntrinsicLowering::LowerToByteSwap(CI);
11496     }
11497     break;
11498   }
11499 
11500   return false;
11501 }
11502 
11503 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
11504   // At this point, we have to lower this constraint to something else, so we
11505   // lower it to an "r" or "w". However, by doing this we will force the result
11506   // to be in register, while the X constraint is much more permissive.
11507   //
11508   // Although we are correct (we are free to emit anything, without
11509   // constraints), we might break use cases that would expect us to be more
11510   // efficient and emit something else.
11511   if (!Subtarget->hasVFP2())
11512     return "r";
11513   if (ConstraintVT.isFloatingPoint())
11514     return "w";
11515   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
11516      (ConstraintVT.getSizeInBits() == 64 ||
11517       ConstraintVT.getSizeInBits() == 128))
11518     return "w";
11519 
11520   return "r";
11521 }
11522 
11523 /// getConstraintType - Given a constraint letter, return the type of
11524 /// constraint it is for this target.
11525 ARMTargetLowering::ConstraintType
11526 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
11527   if (Constraint.size() == 1) {
11528     switch (Constraint[0]) {
11529     default:  break;
11530     case 'l': return C_RegisterClass;
11531     case 'w': return C_RegisterClass;
11532     case 'h': return C_RegisterClass;
11533     case 'x': return C_RegisterClass;
11534     case 't': return C_RegisterClass;
11535     case 'j': return C_Other; // Constant for movw.
11536       // An address with a single base register. Due to the way we
11537       // currently handle addresses it is the same as an 'r' memory constraint.
11538     case 'Q': return C_Memory;
11539     }
11540   } else if (Constraint.size() == 2) {
11541     switch (Constraint[0]) {
11542     default: break;
11543     // All 'U+' constraints are addresses.
11544     case 'U': return C_Memory;
11545     }
11546   }
11547   return TargetLowering::getConstraintType(Constraint);
11548 }
11549 
11550 /// Examine constraint type and operand type and determine a weight value.
11551 /// This object must already have been set up with the operand type
11552 /// and the current alternative constraint selected.
11553 TargetLowering::ConstraintWeight
11554 ARMTargetLowering::getSingleConstraintMatchWeight(
11555     AsmOperandInfo &info, const char *constraint) const {
11556   ConstraintWeight weight = CW_Invalid;
11557   Value *CallOperandVal = info.CallOperandVal;
11558     // If we don't have a value, we can't do a match,
11559     // but allow it at the lowest weight.
11560   if (!CallOperandVal)
11561     return CW_Default;
11562   Type *type = CallOperandVal->getType();
11563   // Look at the constraint type.
11564   switch (*constraint) {
11565   default:
11566     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11567     break;
11568   case 'l':
11569     if (type->isIntegerTy()) {
11570       if (Subtarget->isThumb())
11571         weight = CW_SpecificReg;
11572       else
11573         weight = CW_Register;
11574     }
11575     break;
11576   case 'w':
11577     if (type->isFloatingPointTy())
11578       weight = CW_Register;
11579     break;
11580   }
11581   return weight;
11582 }
11583 
11584 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
11585 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
11586     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11587   if (Constraint.size() == 1) {
11588     // GCC ARM Constraint Letters
11589     switch (Constraint[0]) {
11590     case 'l': // Low regs or general regs.
11591       if (Subtarget->isThumb())
11592         return RCPair(0U, &ARM::tGPRRegClass);
11593       return RCPair(0U, &ARM::GPRRegClass);
11594     case 'h': // High regs or no regs.
11595       if (Subtarget->isThumb())
11596         return RCPair(0U, &ARM::hGPRRegClass);
11597       break;
11598     case 'r':
11599       if (Subtarget->isThumb1Only())
11600         return RCPair(0U, &ARM::tGPRRegClass);
11601       return RCPair(0U, &ARM::GPRRegClass);
11602     case 'w':
11603       if (VT == MVT::Other)
11604         break;
11605       if (VT == MVT::f32)
11606         return RCPair(0U, &ARM::SPRRegClass);
11607       if (VT.getSizeInBits() == 64)
11608         return RCPair(0U, &ARM::DPRRegClass);
11609       if (VT.getSizeInBits() == 128)
11610         return RCPair(0U, &ARM::QPRRegClass);
11611       break;
11612     case 'x':
11613       if (VT == MVT::Other)
11614         break;
11615       if (VT == MVT::f32)
11616         return RCPair(0U, &ARM::SPR_8RegClass);
11617       if (VT.getSizeInBits() == 64)
11618         return RCPair(0U, &ARM::DPR_8RegClass);
11619       if (VT.getSizeInBits() == 128)
11620         return RCPair(0U, &ARM::QPR_8RegClass);
11621       break;
11622     case 't':
11623       if (VT == MVT::f32)
11624         return RCPair(0U, &ARM::SPRRegClass);
11625       break;
11626     }
11627   }
11628   if (StringRef("{cc}").equals_lower(Constraint))
11629     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
11630 
11631   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11632 }
11633 
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
/// All of the constraints handled here only accept immediates; each case
/// validates the target-specific legal range before emitting a constant.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    // All of these require a compile-time constant operand.
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits.  Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    // A "break" out of this inner switch accepts the constant; a "return"
    // rejects the operand without adding anything to Ops.
    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops())
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          // NOTE(review): the comment says "between -7 and 7" but the code
          // excludes 7 itself (CVal < 7, not <= 7) — confirm whether this
          // upper bound is intentional.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32.  This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    // The constant passed its range check: materialize it as a target
    // constant operand.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  // Fall back to the target-independent handling (e.g. symbolic operands).
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
11800 
11801 static RTLIB::Libcall getDivRemLibcall(
11802     const SDNode *N, MVT::SimpleValueType SVT) {
11803   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
11804           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
11805          "Unhandled Opcode in getDivRemLibcall");
11806   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
11807                   N->getOpcode() == ISD::SREM;
11808   RTLIB::Libcall LC;
11809   switch (SVT) {
11810   default: llvm_unreachable("Unexpected request for libcall!");
11811   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
11812   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
11813   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
11814   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
11815   }
11816   return LC;
11817 }
11818 
11819 static TargetLowering::ArgListTy getDivRemArgList(
11820     const SDNode *N, LLVMContext *Context) {
11821   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
11822           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
11823          "Unhandled Opcode in getDivRemArgList");
11824   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
11825                   N->getOpcode() == ISD::SREM;
11826   TargetLowering::ArgListTy Args;
11827   TargetLowering::ArgListEntry Entry;
11828   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
11829     EVT ArgVT = N->getOperand(i).getValueType();
11830     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
11831     Entry.Node = N->getOperand(i);
11832     Entry.Ty = ArgTy;
11833     Entry.isSExt = isSigned;
11834     Entry.isZExt = !isSigned;
11835     Args.push_back(Entry);
11836   }
11837   return Args;
11838 }
11839 
/// Lower [SU]DIVREM by emitting a call to the appropriate AEABI
/// __aeabi_{u,i}divmod helper, which returns both quotient and remainder.
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI()) &&
         "Register-based DivRem lowering only");
  unsigned Opcode = Op->getOpcode();
  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  bool isSigned = (Opcode == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());

  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
                                       VT.getSimpleVT().SimpleTy);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
                                                    DAG.getContext());

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // Model the {div, rem} pair as a two-element struct return.
  Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);

  SDLoc dl(Op);
  TargetLowering::CallLoweringInfo CLI(DAG);
  // setInRegister(): the aggregate result comes back in registers
  // rather than via an sret pointer.
  CLI.setDebugLoc(dl).setChain(InChain)
    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  // first = returned value(s); second = output chain (unused by callers).
  return CallInfo.first;
}
11872 
// Lowers REM using divmod helpers
// see RTABI section 4.2/4.3
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  // Build return types (div and rem): the helper returns both results,
  // modeled here as a two-element struct of the operand's integer type.
  std::vector<Type*> RetTyParams;
  Type *RetTyElement;

  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
  case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  }

  RetTyParams.push_back(RetTyElement);
  RetTyParams.push_back(RetTyElement);
  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  Type *RetTy = StructType::get(*DAG.getContext(), ret);

  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
                                                             SimpleTy);
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext());
  bool isSigned = N->getOpcode() == ISD::SREM;
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // Lower call
  CallLoweringInfo CLI(DAG);
  CLI.setChain(InChain)
     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0)
     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Return second (rem) result operand (first contains div)
  SDNode *ResNode = CallResult.first.getNode();
  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  return ResNode->getOperand(1);
}
11913 
11914 SDValue
11915 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
11916   assert(Subtarget->isTargetWindows() && "unsupported target platform");
11917   SDLoc DL(Op);
11918 
11919   // Get the inputs.
11920   SDValue Chain = Op.getOperand(0);
11921   SDValue Size  = Op.getOperand(1);
11922 
11923   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
11924                               DAG.getConstant(2, DL, MVT::i32));
11925 
11926   SDValue Flag;
11927   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
11928   Flag = Chain.getValue(1);
11929 
11930   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
11931   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
11932 
11933   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
11934   Chain = NewSP.getValue(1);
11935 
11936   SDValue Ops[2] = { NewSP, Chain };
11937   return DAG.getMergeValues(Ops, DL);
11938 }
11939 
11940 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
11941   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
11942          "Unexpected type for custom-lowering FP_EXTEND");
11943 
11944   RTLIB::Libcall LC;
11945   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
11946 
11947   SDValue SrcVal = Op.getOperand(0);
11948   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
11949                      SDLoc(Op)).first;
11950 }
11951 
11952 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
11953   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
11954          Subtarget->isFPOnlySP() &&
11955          "Unexpected type for custom-lowering FP_ROUND");
11956 
11957   RTLIB::Libcall LC;
11958   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
11959 
11960   SDValue SrcVal = Op.getOperand(0);
11961   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
11962                      SDLoc(Op)).first;
11963 }
11964 
11965 bool
11966 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
11967   // The ARM target isn't yet aware of offsets.
11968   return false;
11969 }
11970 
11971 bool ARM::isBitFieldInvertedMask(unsigned v) {
11972   if (v == 0xffffffff)
11973     return false;
11974 
11975   // there can be 1's on either or both "outsides", all the "inside"
11976   // bits must be 0's
11977   return isShiftedMask_32(~v);
11978 }
11979 
11980 /// isFPImmLegal - Returns true if the target can instruction select the
11981 /// specified FP immediate natively. If false, the legalizer will
11982 /// materialize the FP immediate as a load from a constant pool.
11983 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
11984   if (!Subtarget->hasVFP3())
11985     return false;
11986   if (VT == MVT::f32)
11987     return ARM_AM::getFP32Imm(Imm) != -1;
11988   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
11989     return ARM_AM::getFP64Imm(Imm) != -1;
11990   return false;
11991 }
11992 
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
/// Returns true (and fills in Info) for intrinsics that access memory.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // The last argument of these intrinsics carries the alignment.
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    Info.vol = false; // volatile loads with NEON intrinsics not supported
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    // Sum the sizes of the leading vector arguments; the pointer is arg 0
    // and trailing non-vector arguments (e.g. alignment) are skipped.
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    Info.vol = false; // volatile stores with NEON intrinsics not supported
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    // Exclusive accesses must not be reordered or duplicated.
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // For strex the value is arg 0 and the pointer is arg 1.
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd: {
    // Doubleword store-exclusive: value halves in args 0/1, pointer in arg 2.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 8;
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd: {
    // Doubleword load-exclusive from the pointer in arg 0.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 8;
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  default:
    break;
  }

  return false;
}
12106 
12107 /// \brief Returns true if it is beneficial to convert a load of a constant
12108 /// to just the constant itself.
12109 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12110                                                           Type *Ty) const {
12111   assert(Ty->isIntegerTy());
12112 
12113   unsigned Bits = Ty->getPrimitiveSizeInBits();
12114   if (Bits == 0 || Bits > 32)
12115     return false;
12116   return true;
12117 }
12118 
12119 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
12120                                         ARM_MB::MemBOpt Domain) const {
12121   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12122 
12123   // First, if the target has no DMB, see what fallback we can use.
12124   if (!Subtarget->hasDataBarrier()) {
12125     // Some ARMv6 cpus can support data barriers with an mcr instruction.
12126     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
12127     // here.
12128     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
12129       Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
12130       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
12131                         Builder.getInt32(0), Builder.getInt32(7),
12132                         Builder.getInt32(10), Builder.getInt32(5)};
12133       return Builder.CreateCall(MCR, args);
12134     } else {
12135       // Instead of using barriers, atomic accesses on these subtargets use
12136       // libcalls.
12137       llvm_unreachable("makeDMB on a target so old that it has no barriers");
12138     }
12139   } else {
12140     Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
12141     // Only a full system barrier exists in the M-class architectures.
12142     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
12143     Constant *CDomain = Builder.getInt32(Domain);
12144     return Builder.CreateCall(DMB, CDomain);
12145   }
12146 }
12147 
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// Emit the barrier (if any) required *before* an atomic access with the
// given ordering; returns nullptr when no leading fence is needed.
Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                         AtomicOrdering Ord, bool IsStore,
                                         bool IsLoad) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    // Seq_cst loads get their barrier from the trailing fence; only stores
    // need a leading one, so fall through to the release handling below.
    if (!IsStore)
      return nullptr; // Nothing to do
    /*FALLTHROUGH*/
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    // Swift uses the store-only barrier domain; other cores use the full
    // inner-shareable domain.
    if (Subtarget->isSwift())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
12173 
/// Emit the barrier (if any) required *after* an atomic access with ordering
/// \p Ord. Orderings with acquire semantics get a full inner-shareable DMB;
/// relaxed and release-only accesses need nothing here (the release barrier
/// is emitted by emitLeadingFence).
Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                          AtomicOrdering Ord, bool IsStore,
                                          bool IsLoad) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    // Anything acquire-or-stronger needs a full barrier after the access.
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}
12191 
12192 // Loads and stores less than 64-bits are already atomic; ones above that
12193 // are doomed anyway, so defer to the default libcall and blame the OS when
12194 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
12195 // anything for those.
12196 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
12197   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
12198   return (Size == 64) && !Subtarget->isMClass();
12199 }
12200 
12201 // Loads and stores less than 64-bits are already atomic; ones above that
12202 // are doomed anyway, so defer to the default libcall and blame the OS when
12203 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
12204 // anything for those.
12205 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
12206 // guarantee, see DDI0406C ARM architecture reference manual,
12207 // sections A8.8.72-74 LDRD)
12208 TargetLowering::AtomicExpansionKind
12209 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
12210   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
12211   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
12212                                                   : AtomicExpansionKind::None;
12213 }
12214 
12215 // For the real atomic operations, we have ldrex/strex up to 32 bits,
12216 // and up to 64 bits on the non-M profiles
12217 TargetLowering::AtomicExpansionKind
12218 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
12219   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
12220   return (Size <= (Subtarget->isMClass() ? 32U : 64U))
12221              ? AtomicExpansionKind::LLSC
12222              : AtomicExpansionKind::None;
12223 }
12224 
12225 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
12226     AtomicCmpXchgInst *AI) const {
12227   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
12228   // implement cmpxchg without spilling. If the address being exchanged is also
12229   // on the stack and close enough to the spill slot, this can lead to a
12230   // situation where the monitor always gets cleared and the atomic operation
12231   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
12232   return getTargetMachine().getOptLevel() != 0;
12233 }
12234 
/// Return whether atomic operations should be surrounded by explicit fences.
/// This simply forwards the target-wide InsertFencesForAtomic flag; the
/// particular instruction \p I is ignored.
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}
12239 
/// Use the special LOAD_STACK_GUARD node for loading the stack-protector
/// guard value.
// This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  return Subtarget->isTargetMachO();
}
12244 
12245 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
12246                                                   unsigned &Cost) const {
12247   // If we do not have NEON, vector types are not natively supported.
12248   if (!Subtarget->hasNEON())
12249     return false;
12250 
12251   // Floating point values and vector values map to the same register file.
12252   // Therefore, although we could do a store extract of a vector type, this is
12253   // better to leave at float as we have more freedom in the addressing mode for
12254   // those.
12255   if (VectorTy->isFPOrFPVectorTy())
12256     return false;
12257 
12258   // If the index is unknown at compile time, this is very expensive to lower
12259   // and it is not possible to combine the store with the extract.
12260   if (!isa<ConstantInt>(Idx))
12261     return false;
12262 
12263   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
12264   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
12265   // We can do a store + vector extract on any vector that fits perfectly in a D
12266   // or Q register.
12267   if (BitWidth == 64 || BitWidth == 128) {
12268     Cost = 0;
12269     return true;
12270   }
12271   return false;
12272 }
12273 
/// cttz is only cheap enough to speculate on subtargets with v6T2 ops
/// (v6T2 adds RBIT, which enables a branch-free CLZ-based lowering).
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}
12277 
/// ctlz is only considered cheap to speculate on subtargets with v6T2 ops.
bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}
12281 
/// Emit a load-linked (ldrex/ldaex) from \p Addr and return the loaded value.
///
/// \param Builder insertion point for the new IR.
/// \param Addr pointer operand; its pointee type determines the access width.
/// \param Ord atomic ordering — acquire-or-stronger selects the acquiring
///        (ldaex*) intrinsic variants.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);

    // The 64-bit intrinsic takes an i8* regardless of the pointee type.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // Big-endian targets return the halves in the opposite order.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  // Narrower accesses: the intrinsic is overloaded on the pointer type, and
  // its result is truncated/bitcast back to the pointee type.
  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}
12317 
12318 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
12319     IRBuilder<> &Builder) const {
12320   if (!Subtarget->hasV7Ops())
12321     return;
12322   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12323   Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
12324 }
12325 
/// Emit a store-conditional (strex/stlex) of \p Val to \p Addr and return the
/// intrinsic's status result.
///
/// \param Ord atomic ordering — release-or-stronger selects the releasing
///        (stlex*) intrinsic variants.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    // Split the 64-bit value into its two 32-bit halves.
    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    // Big-endian targets expect the halves in the opposite order.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    // The 64-bit intrinsic takes an i8* regardless of the pointee type.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  // Narrower accesses: the intrinsic is overloaded on the pointer type and
  // takes the value widened/bitcast to its first parameter type.
  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}
12358 
/// \brief Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
///
/// \param LI the wide load being replaced.
/// \param Shuffles the de-interleaving shufflevectors that use \p LI.
/// \param Indices for each shuffle, which interleaved sub-vector it extracts.
/// \param Factor the interleave factor (2, 3 or 4).
/// \returns true if the load was lowered, false to fall back.
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();
  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;

  // Skip if we do not have NEON and skip illegal vector types and vector types
  // with i64/f64 elements (vldN doesn't support i64/f64 elements).
  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
    return false;

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  // Intrinsic IDs indexed by (Factor - 2).
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};

  IRBuilder<> Builder(LI);
  SmallVector<Value *, 2> Ops;

  // The vldN intrinsics take the pointer as i8* plus an alignment operand.
  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
  Ops.push_back(Builder.getInt32(LI->getAlignment()));

  Type *Tys[] = { VecTy, Int8Ptr };
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

  // Replace uses of each shufflevector with the corresponding vector loaded
  // by ldN.
  for (unsigned i = 0; i < Shuffles.size(); i++) {
    ShuffleVectorInst *SV = Shuffles[i];
    unsigned Index = Indices[i];

    Value *SubVec = Builder.CreateExtractValue(VldN, Index);

    // Convert the integer vector to pointer vector if the element is pointer.
    if (EltTy->isPointerTy())
      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());

    SV->replaceAllUsesWith(SubVec);
  }

  return true;
}
12430 
12431 /// \brief Get a mask consisting of sequential integers starting from \p Start.
12432 ///
12433 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
12434 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
12435                                    unsigned NumElts) {
12436   SmallVector<Constant *, 16> Mask;
12437   for (unsigned i = 0; i < NumElts; i++)
12438     Mask.push_back(Builder.getInt32(Start + i));
12439 
12440   return ConstantVector::get(Mask);
12441 }
12442 
/// \brief Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// \param SI the wide store being replaced.
/// \param SVI the interleaving shufflevector feeding \p SI.
/// \param Factor the interleave factor (2, 3 or 4).
/// \returns true if the store was lowered, false to fall back.
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;

  // Skip if we do not have NEON and skip illegal vector types and vector types
  // with i64/f64 elements (vstN doesn't support i64/f64 elements).
  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
      EltIs64Bits)
    return false;

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, NumSubElts);
  }

  // Intrinsic IDs indexed by (Factor - 2).
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};
  SmallVector<Value *, 6> Ops;

  // The vstN intrinsics take the pointer as i8*, then the sub-vectors, then
  // an alignment operand.
  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));

  Type *Tys[] = { Int8Ptr, SubVecTy };
  Function *VstNFunc = Intrinsic::getDeclaration(
      SI->getModule(), StoreInts[Factor - 2], Tys);

  // Split the shufflevector operands into sub vectors for the new vstN call.
  for (unsigned i = 0; i < Factor; i++)
    Ops.push_back(Builder.CreateShuffleVector(
        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));

  Ops.push_back(Builder.getInt32(SI->getAlignment()));
  Builder.CreateCall(VstNFunc, Ops);
  return true;
}
12521 
/// Base element kind of a homogeneous aggregate (HA) for AAPCS-VFP argument
/// classification; see isHomogeneousAggregate below.
enum HABaseType {
  HA_UNKNOWN = 0, ///< No base type discovered yet.
  HA_FLOAT,       ///< All members are 'float'.
  HA_DOUBLE,      ///< All members are 'double'.
  HA_VECT64,      ///< All members are 64-bit vectors.
  HA_VECT128      ///< All members are 128-bit vectors.
};
12529 
/// Recursively determine whether \p Ty is a homogeneous aggregate: at most
/// four members that are all floats, all doubles, all 64-bit vectors, or all
/// 128-bit vectors.
///
/// \param Base in/out — base element kind discovered so far; the initial
///        caller passes HA_UNKNOWN, and a mismatching kind fails the check.
/// \param Members in/out — running count of base elements; the initial caller
///        must pass 0.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    // A struct is homogeneous if every field is, with the same base type.
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    // An array contributes (members per element) * (number of elements).
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      // First vector seen fixes the base kind; only 64/128-bit widths count.
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  // Note: any other scalar type leaves Members at its incoming value (0 on
  // the initial call) and fails the bounds check below.
  return (Members > 0 && Members <= 4);
}
12580 
12581 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
12582 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
12583 /// passing according to AAPCS rules.
12584 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
12585     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
12586   if (getEffectiveCallingConv(CallConv, isVarArg) !=
12587       CallingConv::ARM_AAPCS_VFP)
12588     return false;
12589 
12590   HABaseType Base = HA_UNKNOWN;
12591   uint64_t Members = 0;
12592   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
12593   DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
12594 
12595   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
12596   return IsHA || IsIntArray;
12597 }
12598 
12599 unsigned ARMTargetLowering::getExceptionPointerRegister(
12600     const Constant *PersonalityFn) const {
12601   // Platforms which do not use SjLj EH may return values in these registers
12602   // via the personality function.
12603   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
12604 }
12605 
12606 unsigned ARMTargetLowering::getExceptionSelectorRegister(
12607     const Constant *PersonalityFn) const {
12608   // Platforms which do not use SjLj EH may return values in these registers
12609   // via the personality function.
12610   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
12611 }
12612 
/// Mark the function containing \p Entry as using split CSR handling by
/// setting IsSplitCSR in its ARMFunctionInfo.
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
12618 
/// For split-CSR functions, save each callee-saved register that is handled
/// via copy into a fresh virtual register in \p Entry, and restore it before
/// the terminator of every block in \p Exits.
void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  // A null list means no registers are saved via copy for this function.
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  // The register list is null-terminated.
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
12659