1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that ARM uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARMISelLowering.h"
16 #include "ARMCallingConv.h"
17 #include "ARMConstantPoolValue.h"
18 #include "ARMMachineFunctionInfo.h"
19 #include "ARMPerfectShuffle.h"
20 #include "ARMSubtarget.h"
21 #include "ARMTargetMachine.h"
22 #include "ARMTargetObjectFile.h"
23 #include "MCTargetDesc/ARMAddressingModes.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
26 #include "llvm/ADT/StringSwitch.h"
27 #include "llvm/CodeGen/CallingConvLower.h"
28 #include "llvm/CodeGen/IntrinsicLowering.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFrameInfo.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
34 #include "llvm/CodeGen/MachineModuleInfo.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/SelectionDAG.h"
37 #include "llvm/IR/CallingConv.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/IR/DebugInfoMetadata.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/IR/IRBuilder.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/MC/MCSectionMachO.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <utility>
56 using namespace llvm;
57 
58 #define DEBUG_TYPE "arm-isel"
59 
60 STATISTIC(NumTailCalls, "Number of tail calls");
61 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
62 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
63 STATISTIC(NumConstpoolPromoted,
64   "Number of constants with their storage promoted into constant pools");
65 
66 static cl::opt<bool>
67 ARMInterworking("arm-interworking", cl::Hidden,
68   cl::desc("Enable / disable ARM interworking (for debugging only)"),
69   cl::init(true));
70 
71 static cl::opt<bool> EnableConstpoolPromotion(
72     "arm-promote-constant", cl::Hidden,
73     cl::desc("Enable / disable promotion of unnamed_addr constants into "
74              "constant pools"),
75     cl::init(true));
76 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
77     "arm-promote-constant-max-size", cl::Hidden,
78     cl::desc("Maximum size of constant to promote into a constant pool"),
79     cl::init(64));
80 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
81     "arm-promote-constant-max-total", cl::Hidden,
82     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
83     cl::init(128));
84 
85 namespace {
86   class ARMCCState : public CCState {
87   public:
88     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
89                SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
90                ParmContext PC)
91         : CCState(CC, isVarArg, MF, locs, C) {
92       assert(((PC == Call) || (PC == Prologue)) &&
93              "ARMCCState users must specify whether their context is call"
94              "or prologue generation.");
95       CallOrPrologue = PC;
96     }
97   };
98 }
99 
100 void ARMTargetLowering::InitLibcallCallingConvs() {
101   // The builtins on ARM always use AAPCS, irrespective of wheter C is AAPCS or
102   // AAPCS_VFP.
103   for (const auto LC : {
104            RTLIB::SHL_I16,
105            RTLIB::SHL_I32,
106            RTLIB::SHL_I64,
107            RTLIB::SHL_I128,
108            RTLIB::SRL_I16,
109            RTLIB::SRL_I32,
110            RTLIB::SRL_I64,
111            RTLIB::SRL_I128,
112            RTLIB::SRA_I16,
113            RTLIB::SRA_I32,
114            RTLIB::SRA_I64,
115            RTLIB::SRA_I128,
116            RTLIB::MUL_I8,
117            RTLIB::MUL_I16,
118            RTLIB::MUL_I32,
119            RTLIB::MUL_I64,
120            RTLIB::MUL_I128,
121            RTLIB::MULO_I32,
122            RTLIB::MULO_I64,
123            RTLIB::MULO_I128,
124            RTLIB::SDIV_I8,
125            RTLIB::SDIV_I16,
126            RTLIB::SDIV_I32,
127            RTLIB::SDIV_I64,
128            RTLIB::SDIV_I128,
129            RTLIB::UDIV_I8,
130            RTLIB::UDIV_I16,
131            RTLIB::UDIV_I32,
132            RTLIB::UDIV_I64,
133            RTLIB::UDIV_I128,
134            RTLIB::SREM_I8,
135            RTLIB::SREM_I16,
136            RTLIB::SREM_I32,
137            RTLIB::SREM_I64,
138            RTLIB::SREM_I128,
139            RTLIB::UREM_I8,
140            RTLIB::UREM_I16,
141            RTLIB::UREM_I32,
142            RTLIB::UREM_I64,
143            RTLIB::UREM_I128,
144            RTLIB::SDIVREM_I8,
145            RTLIB::SDIVREM_I16,
146            RTLIB::SDIVREM_I32,
147            RTLIB::SDIVREM_I64,
148            RTLIB::SDIVREM_I128,
149            RTLIB::UDIVREM_I8,
150            RTLIB::UDIVREM_I16,
151            RTLIB::UDIVREM_I32,
152            RTLIB::UDIVREM_I64,
153            RTLIB::UDIVREM_I128,
154            RTLIB::NEG_I32,
155            RTLIB::NEG_I64,
156            RTLIB::ADD_F32,
157            RTLIB::ADD_F64,
158            RTLIB::ADD_F80,
159            RTLIB::ADD_F128,
160            RTLIB::SUB_F32,
161            RTLIB::SUB_F64,
162            RTLIB::SUB_F80,
163            RTLIB::SUB_F128,
164            RTLIB::MUL_F32,
165            RTLIB::MUL_F64,
166            RTLIB::MUL_F80,
167            RTLIB::MUL_F128,
168            RTLIB::DIV_F32,
169            RTLIB::DIV_F64,
170            RTLIB::DIV_F80,
171            RTLIB::DIV_F128,
172            RTLIB::POWI_F32,
173            RTLIB::POWI_F64,
174            RTLIB::POWI_F80,
175            RTLIB::POWI_F128,
176            RTLIB::FPEXT_F64_F128,
177            RTLIB::FPEXT_F32_F128,
178            RTLIB::FPEXT_F32_F64,
179            RTLIB::FPEXT_F16_F32,
180            RTLIB::FPROUND_F32_F16,
181            RTLIB::FPROUND_F64_F16,
182            RTLIB::FPROUND_F80_F16,
183            RTLIB::FPROUND_F128_F16,
184            RTLIB::FPROUND_F64_F32,
185            RTLIB::FPROUND_F80_F32,
186            RTLIB::FPROUND_F128_F32,
187            RTLIB::FPROUND_F80_F64,
188            RTLIB::FPROUND_F128_F64,
189            RTLIB::FPTOSINT_F32_I32,
190            RTLIB::FPTOSINT_F32_I64,
191            RTLIB::FPTOSINT_F32_I128,
192            RTLIB::FPTOSINT_F64_I32,
193            RTLIB::FPTOSINT_F64_I64,
194            RTLIB::FPTOSINT_F64_I128,
195            RTLIB::FPTOSINT_F80_I32,
196            RTLIB::FPTOSINT_F80_I64,
197            RTLIB::FPTOSINT_F80_I128,
198            RTLIB::FPTOSINT_F128_I32,
199            RTLIB::FPTOSINT_F128_I64,
200            RTLIB::FPTOSINT_F128_I128,
201            RTLIB::FPTOUINT_F32_I32,
202            RTLIB::FPTOUINT_F32_I64,
203            RTLIB::FPTOUINT_F32_I128,
204            RTLIB::FPTOUINT_F64_I32,
205            RTLIB::FPTOUINT_F64_I64,
206            RTLIB::FPTOUINT_F64_I128,
207            RTLIB::FPTOUINT_F80_I32,
208            RTLIB::FPTOUINT_F80_I64,
209            RTLIB::FPTOUINT_F80_I128,
210            RTLIB::FPTOUINT_F128_I32,
211            RTLIB::FPTOUINT_F128_I64,
212            RTLIB::FPTOUINT_F128_I128,
213            RTLIB::SINTTOFP_I32_F32,
214            RTLIB::SINTTOFP_I32_F64,
215            RTLIB::SINTTOFP_I32_F80,
216            RTLIB::SINTTOFP_I32_F128,
217            RTLIB::SINTTOFP_I64_F32,
218            RTLIB::SINTTOFP_I64_F64,
219            RTLIB::SINTTOFP_I64_F80,
220            RTLIB::SINTTOFP_I64_F128,
221            RTLIB::SINTTOFP_I128_F32,
222            RTLIB::SINTTOFP_I128_F64,
223            RTLIB::SINTTOFP_I128_F80,
224            RTLIB::SINTTOFP_I128_F128,
225            RTLIB::UINTTOFP_I32_F32,
226            RTLIB::UINTTOFP_I32_F64,
227            RTLIB::UINTTOFP_I32_F80,
228            RTLIB::UINTTOFP_I32_F128,
229            RTLIB::UINTTOFP_I64_F32,
230            RTLIB::UINTTOFP_I64_F64,
231            RTLIB::UINTTOFP_I64_F80,
232            RTLIB::UINTTOFP_I64_F128,
233            RTLIB::UINTTOFP_I128_F32,
234            RTLIB::UINTTOFP_I128_F64,
235            RTLIB::UINTTOFP_I128_F80,
236            RTLIB::UINTTOFP_I128_F128,
237            RTLIB::OEQ_F32,
238            RTLIB::OEQ_F64,
239            RTLIB::OEQ_F128,
240            RTLIB::UNE_F32,
241            RTLIB::UNE_F64,
242            RTLIB::UNE_F128,
243            RTLIB::OGE_F32,
244            RTLIB::OGE_F64,
245            RTLIB::OGE_F128,
246            RTLIB::OLT_F32,
247            RTLIB::OLT_F64,
248            RTLIB::OLT_F128,
249            RTLIB::OLE_F32,
250            RTLIB::OLE_F64,
251            RTLIB::OLE_F128,
252            RTLIB::OGT_F32,
253            RTLIB::OGT_F64,
254            RTLIB::OGT_F128,
255            RTLIB::UO_F32,
256            RTLIB::UO_F64,
257            RTLIB::UO_F128,
258            RTLIB::O_F32,
259            RTLIB::O_F64,
260            RTLIB::O_F128,
261        })
262   setLibcallCallingConv(LC, CallingConv::ARM_AAPCS);
263 }
264 
265 // The APCS parameter registers.
266 static const MCPhysReg GPRArgRegs[] = {
267   ARM::R0, ARM::R1, ARM::R2, ARM::R3
268 };
269 
270 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
271                                        MVT PromotedBitwiseVT) {
272   if (VT != PromotedLdStVT) {
273     setOperationAction(ISD::LOAD, VT, Promote);
274     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
275 
276     setOperationAction(ISD::STORE, VT, Promote);
277     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
278   }
279 
280   MVT ElemTy = VT.getVectorElementType();
281   if (ElemTy != MVT::f64)
282     setOperationAction(ISD::SETCC, VT, Custom);
283   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
284   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
285   if (ElemTy == MVT::i32) {
286     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
287     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
288     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
289     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
290   } else {
291     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
292     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
293     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
294     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
295   }
296   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
297   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
298   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
299   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
300   setOperationAction(ISD::SELECT,            VT, Expand);
301   setOperationAction(ISD::SELECT_CC,         VT, Expand);
302   setOperationAction(ISD::VSELECT,           VT, Expand);
303   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
304   if (VT.isInteger()) {
305     setOperationAction(ISD::SHL, VT, Custom);
306     setOperationAction(ISD::SRA, VT, Custom);
307     setOperationAction(ISD::SRL, VT, Custom);
308   }
309 
310   // Promote all bit-wise operations.
311   if (VT.isInteger() && VT != PromotedBitwiseVT) {
312     setOperationAction(ISD::AND, VT, Promote);
313     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
314     setOperationAction(ISD::OR,  VT, Promote);
315     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
316     setOperationAction(ISD::XOR, VT, Promote);
317     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
318   }
319 
320   // Neon does not support vector divide/remainder operations.
321   setOperationAction(ISD::SDIV, VT, Expand);
322   setOperationAction(ISD::UDIV, VT, Expand);
323   setOperationAction(ISD::FDIV, VT, Expand);
324   setOperationAction(ISD::SREM, VT, Expand);
325   setOperationAction(ISD::UREM, VT, Expand);
326   setOperationAction(ISD::FREM, VT, Expand);
327 
328   if (!VT.isFloatingPoint() &&
329       VT != MVT::v2i64 && VT != MVT::v1i64)
330     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
331       setOperationAction(Opcode, VT, Legal);
332 }
333 
334 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
335   addRegisterClass(VT, &ARM::DPRRegClass);
336   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
337 }
338 
339 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
340   addRegisterClass(VT, &ARM::DPairRegClass);
341   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
342 }
343 
344 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
345                                      const ARMSubtarget &STI)
346     : TargetLowering(TM), Subtarget(&STI) {
347   RegInfo = Subtarget->getRegisterInfo();
348   Itins = Subtarget->getInstrItineraryData();
349 
350   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
351 
352   InitLibcallCallingConvs();
353 
354   if (Subtarget->isTargetMachO()) {
355     // Uses VFP for Thumb libfuncs if available.
356     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
357         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
358       static const struct {
359         const RTLIB::Libcall Op;
360         const char * const Name;
361         const ISD::CondCode Cond;
362       } LibraryCalls[] = {
363         // Single-precision floating-point arithmetic.
364         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
365         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
366         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
367         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
368 
369         // Double-precision floating-point arithmetic.
370         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
371         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
372         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
373         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
374 
375         // Single-precision comparisons.
376         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
377         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
378         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
379         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
380         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
381         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
382         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
383         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
384 
385         // Double-precision comparisons.
386         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
387         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
388         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
389         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
390         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
391         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
392         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
393         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
394 
395         // Floating-point to integer conversions.
396         // i64 conversions are done via library routines even when generating VFP
397         // instructions, so use the same ones.
398         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
399         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
400         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
401         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
402 
403         // Conversions between floating types.
404         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
405         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
406 
407         // Integer to floating-point conversions.
408         // i64 conversions are done via library routines even when generating VFP
409         // instructions, so use the same ones.
410         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
411         // e.g., __floatunsidf vs. __floatunssidfvfp.
412         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
413         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
414         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
415         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
416       };
417 
418       for (const auto &LC : LibraryCalls) {
419         setLibcallName(LC.Op, LC.Name);
420         if (LC.Cond != ISD::SETCC_INVALID)
421           setCmpLibcallCC(LC.Op, LC.Cond);
422       }
423     }
424 
425     // Set the correct calling convention for ARMv7k WatchOS. It's just
426     // AAPCS_VFP for functions as simple as libcalls.
427     if (Subtarget->isTargetWatchABI()) {
428       for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
429         setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
430     }
431   }
432 
433   // These libcalls are not available in 32-bit.
434   setLibcallName(RTLIB::SHL_I128, nullptr);
435   setLibcallName(RTLIB::SRL_I128, nullptr);
436   setLibcallName(RTLIB::SRA_I128, nullptr);
437 
438   // RTLIB
439   if (Subtarget->isAAPCS_ABI() &&
440       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
441        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
442     static const struct {
443       const RTLIB::Libcall Op;
444       const char * const Name;
445       const CallingConv::ID CC;
446       const ISD::CondCode Cond;
447     } LibraryCalls[] = {
448       // Double-precision floating-point arithmetic helper functions
449       // RTABI chapter 4.1.2, Table 2
450       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
451       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
452       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
453       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
454 
455       // Double-precision floating-point comparison helper functions
456       // RTABI chapter 4.1.2, Table 3
457       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
458       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
459       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
460       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
461       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
462       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
463       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
464       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
465 
466       // Single-precision floating-point arithmetic helper functions
467       // RTABI chapter 4.1.2, Table 4
468       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
469       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
470       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
471       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
472 
473       // Single-precision floating-point comparison helper functions
474       // RTABI chapter 4.1.2, Table 5
475       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
476       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
477       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
478       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
479       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
480       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
481       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
482       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
483 
484       // Floating-point to integer conversions.
485       // RTABI chapter 4.1.2, Table 6
486       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
487       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
488       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
489       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
490       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
491       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
492       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
493       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
494 
495       // Conversions between floating types.
496       // RTABI chapter 4.1.2, Table 7
497       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
498       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
499       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
500 
501       // Integer to floating-point conversions.
502       // RTABI chapter 4.1.2, Table 8
503       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
504       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
505       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
506       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
507       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
508       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
509       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
510       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
511 
512       // Long long helper functions
513       // RTABI chapter 4.2, Table 9
514       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
515       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
516       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
517       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
518 
519       // Integer division functions
520       // RTABI chapter 4.3.1
521       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
522       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
523       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
524       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
525       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
526       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
527       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
528       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
529     };
530 
531     for (const auto &LC : LibraryCalls) {
532       setLibcallName(LC.Op, LC.Name);
533       setLibcallCallingConv(LC.Op, LC.CC);
534       if (LC.Cond != ISD::SETCC_INVALID)
535         setCmpLibcallCC(LC.Op, LC.Cond);
536     }
537 
538     // EABI dependent RTLIB
539     if (TM.Options.EABIVersion == EABI::EABI4 ||
540         TM.Options.EABIVersion == EABI::EABI5) {
541       static const struct {
542         const RTLIB::Libcall Op;
543         const char *const Name;
544         const CallingConv::ID CC;
545         const ISD::CondCode Cond;
546       } MemOpsLibraryCalls[] = {
547         // Memory operations
548         // RTABI chapter 4.3.4
549         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
550         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
551         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
552       };
553 
554       for (const auto &LC : MemOpsLibraryCalls) {
555         setLibcallName(LC.Op, LC.Name);
556         setLibcallCallingConv(LC.Op, LC.CC);
557         if (LC.Cond != ISD::SETCC_INVALID)
558           setCmpLibcallCC(LC.Op, LC.Cond);
559       }
560     }
561   }
562 
563   if (Subtarget->isTargetWindows()) {
564     static const struct {
565       const RTLIB::Libcall Op;
566       const char * const Name;
567       const CallingConv::ID CC;
568     } LibraryCalls[] = {
569       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
570       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
571       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
572       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
573       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
574       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
575       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
576       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
577     };
578 
579     for (const auto &LC : LibraryCalls) {
580       setLibcallName(LC.Op, LC.Name);
581       setLibcallCallingConv(LC.Op, LC.CC);
582     }
583   }
584 
585   // Use divmod compiler-rt calls for iOS 5.0 and later.
586   if (Subtarget->isTargetWatchOS() ||
587       (Subtarget->isTargetIOS() &&
588        !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
589     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
590     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
591   }
592 
593   // The half <-> float conversion functions are always soft-float on
594   // non-watchos platforms, but are needed for some targets which use a
595   // hard-float calling convention by default.
596   if (!Subtarget->isTargetWatchABI()) {
597     if (Subtarget->isAAPCS_ABI()) {
598       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
599       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
600       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
601     } else {
602       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
603       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
604       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
605     }
606   }
607 
608   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
609   // a __gnu_ prefix (which is the default).
610   if (Subtarget->isTargetAEABI()) {
611     setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
612     setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
613     setLibcallName(RTLIB::FPEXT_F16_F32,   "__aeabi_h2f");
614   }
615 
616   if (Subtarget->isThumb1Only())
617     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
618   else
619     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
620   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
621       !Subtarget->isThumb1Only()) {
622     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
623     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
624   }
625 
626   for (MVT VT : MVT::vector_valuetypes()) {
627     for (MVT InnerVT : MVT::vector_valuetypes()) {
628       setTruncStoreAction(VT, InnerVT, Expand);
629       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
630       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
631       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
632     }
633 
634     setOperationAction(ISD::MULHS, VT, Expand);
635     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
636     setOperationAction(ISD::MULHU, VT, Expand);
637     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
638 
639     setOperationAction(ISD::BSWAP, VT, Expand);
640   }
641 
642   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
643   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
644 
645   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
646   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
647 
648   if (Subtarget->hasNEON()) {
649     addDRTypeForNEON(MVT::v2f32);
650     addDRTypeForNEON(MVT::v8i8);
651     addDRTypeForNEON(MVT::v4i16);
652     addDRTypeForNEON(MVT::v2i32);
653     addDRTypeForNEON(MVT::v1i64);
654 
655     addQRTypeForNEON(MVT::v4f32);
656     addQRTypeForNEON(MVT::v2f64);
657     addQRTypeForNEON(MVT::v16i8);
658     addQRTypeForNEON(MVT::v8i16);
659     addQRTypeForNEON(MVT::v4i32);
660     addQRTypeForNEON(MVT::v2i64);
661 
662     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
663     // neither Neon nor VFP support any arithmetic operations on it.
664     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
665     // supported for v4f32.
666     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
667     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
668     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
669     // FIXME: Code duplication: FDIV and FREM are expanded always, see
670     // ARMTargetLowering::addTypeForNEON method for details.
671     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
672     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
673     // FIXME: Create unittest.
674     // In another words, find a way when "copysign" appears in DAG with vector
675     // operands.
676     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
677     // FIXME: Code duplication: SETCC has custom operation action, see
678     // ARMTargetLowering::addTypeForNEON method for details.
679     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
680     // FIXME: Create unittest for FNEG and for FABS.
681     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
682     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
683     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
684     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
685     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
686     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
687     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
688     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
689     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
690     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
691     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
692     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
693     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
694     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
695     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
696     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
697     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
698     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
699     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
700 
701     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
702     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
703     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
704     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
705     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
706     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
707     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
708     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
709     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
710     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
711     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
712     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
713     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
714     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
715     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
716 
717     // Mark v2f32 intrinsics.
718     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
719     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
720     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
721     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
722     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
723     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
724     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
725     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
726     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
727     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
728     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
729     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
730     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
731     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
732     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
733 
734     // Neon does not support some operations on v1i64 and v2i64 types.
735     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
736     // Custom handling for some quad-vector types to detect VMULL.
737     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
738     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
739     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
740     // Custom handling for some vector types to avoid expensive expansions
741     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
742     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
743     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
744     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
745     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
746     // a destination type that is wider than the source, and nor does
747     // it have a FP_TO_[SU]INT instruction with a narrower destination than
748     // source.
749     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
750     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
751     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
752     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
753 
754     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
755     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
756 
757     // NEON does not have single instruction CTPOP for vectors with element
758     // types wider than 8-bits.  However, custom lowering can leverage the
759     // v8i8/v16i8 vcnt instruction.
760     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
761     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
762     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
763     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
764     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
765     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
766 
767     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
768     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
769 
770     // NEON does not have single instruction CTTZ for vectors.
771     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
772     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
773     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
774     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
775 
776     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
777     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
778     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
779     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
780 
781     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
782     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
783     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
784     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
785 
786     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
787     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
788     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
789     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
790 
791     // NEON only has FMA instructions as of VFP4.
792     if (!Subtarget->hasVFP4()) {
793       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
794       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
795     }
796 
797     setTargetDAGCombine(ISD::INTRINSIC_VOID);
798     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
799     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
800     setTargetDAGCombine(ISD::SHL);
801     setTargetDAGCombine(ISD::SRL);
802     setTargetDAGCombine(ISD::SRA);
803     setTargetDAGCombine(ISD::SIGN_EXTEND);
804     setTargetDAGCombine(ISD::ZERO_EXTEND);
805     setTargetDAGCombine(ISD::ANY_EXTEND);
806     setTargetDAGCombine(ISD::BUILD_VECTOR);
807     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
808     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
809     setTargetDAGCombine(ISD::STORE);
810     setTargetDAGCombine(ISD::FP_TO_SINT);
811     setTargetDAGCombine(ISD::FP_TO_UINT);
812     setTargetDAGCombine(ISD::FDIV);
813     setTargetDAGCombine(ISD::LOAD);
814 
815     // It is legal to extload from v4i8 to v4i16 or v4i32.
816     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
817                    MVT::v2i32}) {
818       for (MVT VT : MVT::integer_vector_valuetypes()) {
819         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
820         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
821         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
822       }
823     }
824   }
825 
826   // ARM and Thumb2 support UMLAL/SMLAL.
827   if (!Subtarget->isThumb1Only())
828     setTargetDAGCombine(ISD::ADDC);
829 
830   if (Subtarget->isFPOnlySP()) {
831     // When targeting a floating-point unit with only single-precision
832     // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
834     // loads and stores are provided by the hardware.
835     setOperationAction(ISD::FADD,       MVT::f64, Expand);
836     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
837     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
838     setOperationAction(ISD::FMA,        MVT::f64, Expand);
839     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
840     setOperationAction(ISD::FREM,       MVT::f64, Expand);
841     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
842     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
843     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
844     setOperationAction(ISD::FABS,       MVT::f64, Expand);
845     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
846     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
847     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
848     setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
849     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
850     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
851     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
852     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
853     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
854     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
855     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
856     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
857     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
858     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
859     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
860     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
861     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
862     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
863     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
864     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
865     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
866     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
867     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
868   }
869 
870   computeRegisterProperties(Subtarget->getRegisterInfo());
871 
872   // ARM does not have floating-point extending loads.
873   for (MVT VT : MVT::fp_valuetypes()) {
874     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
875     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
876   }
877 
878   // ... or truncating stores
879   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
880   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
881   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
882 
883   // ARM does not have i1 sign extending load.
884   for (MVT VT : MVT::integer_valuetypes())
885     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
886 
887   // ARM supports all 4 flavors of integer indexed load / store.
888   if (!Subtarget->isThumb1Only()) {
889     for (unsigned im = (unsigned)ISD::PRE_INC;
890          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
891       setIndexedLoadAction(im,  MVT::i1,  Legal);
892       setIndexedLoadAction(im,  MVT::i8,  Legal);
893       setIndexedLoadAction(im,  MVT::i16, Legal);
894       setIndexedLoadAction(im,  MVT::i32, Legal);
895       setIndexedStoreAction(im, MVT::i1,  Legal);
896       setIndexedStoreAction(im, MVT::i8,  Legal);
897       setIndexedStoreAction(im, MVT::i16, Legal);
898       setIndexedStoreAction(im, MVT::i32, Legal);
899     }
900   } else {
901     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
902     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
903     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
904   }
905 
906   setOperationAction(ISD::SADDO, MVT::i32, Custom);
907   setOperationAction(ISD::UADDO, MVT::i32, Custom);
908   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
909   setOperationAction(ISD::USUBO, MVT::i32, Custom);
910 
911   // i64 operation support.
912   setOperationAction(ISD::MUL,     MVT::i64, Expand);
913   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
914   if (Subtarget->isThumb1Only()) {
915     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
916     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
917   }
918   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
919       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
920     setOperationAction(ISD::MULHS, MVT::i32, Expand);
921 
922   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
923   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
924   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
925   setOperationAction(ISD::SRL,       MVT::i64, Custom);
926   setOperationAction(ISD::SRA,       MVT::i64, Custom);
927 
928   if (!Subtarget->isThumb1Only()) {
929     // FIXME: We should do this for Thumb1 as well.
930     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
931     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
932     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
933     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
934   }
935 
936   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
937     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
938 
939   // ARM does not have ROTL.
940   setOperationAction(ISD::ROTL, MVT::i32, Expand);
941   for (MVT VT : MVT::vector_valuetypes()) {
942     setOperationAction(ISD::ROTL, VT, Expand);
943     setOperationAction(ISD::ROTR, VT, Expand);
944   }
945   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
946   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
947   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
948     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
949 
950   // @llvm.readcyclecounter requires the Performance Monitors extension.
951   // Default to the 0 expansion on unsupported platforms.
952   // FIXME: Technically there are older ARM CPUs that have
953   // implementation-specific ways of obtaining this information.
954   if (Subtarget->hasPerfMon())
955     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
956 
  // BSWAP is only available on ARMv6 and later.
958   if (!Subtarget->hasV6Ops())
959     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
960 
961   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
962                                         : Subtarget->hasDivideInARMMode();
963   if (!hasDivide) {
964     // These are expanded into libcalls if the cpu doesn't have HW divider.
965     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
966     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
967   }
968 
969   if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
970     setOperationAction(ISD::SDIV, MVT::i32, Custom);
971     setOperationAction(ISD::UDIV, MVT::i32, Custom);
972 
973     setOperationAction(ISD::SDIV, MVT::i64, Custom);
974     setOperationAction(ISD::UDIV, MVT::i64, Custom);
975   }
976 
977   setOperationAction(ISD::SREM,  MVT::i32, Expand);
978   setOperationAction(ISD::UREM,  MVT::i32, Expand);
979   // Register based DivRem for AEABI (RTABI 4.2)
980   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
981       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
982       Subtarget->isTargetWindows()) {
983     setOperationAction(ISD::SREM, MVT::i64, Custom);
984     setOperationAction(ISD::UREM, MVT::i64, Custom);
985     HasStandaloneRem = false;
986 
987     for (const auto &LC :
988          {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32})
989       setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv"
990                                                       : "__aeabi_idivmod");
991     setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows()
992                                            ? "__rt_sdiv64"
993                                            : "__aeabi_ldivmod");
994     for (const auto &LC :
995          {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32})
996       setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv"
997                                                       : "__aeabi_uidivmod");
998     setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows()
999                                            ? "__rt_udiv64"
1000                                            : "__aeabi_uldivmod");
1001 
1002     setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
1003     setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
1004     setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
1005     setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
1006     setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
1007     setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
1008     setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
1009     setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
1010 
1011     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1012     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1013     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1014     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1015   } else {
1016     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1017     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1018   }
1019 
1020   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
1021     for (auto &VT : {MVT::f32, MVT::f64})
1022       setOperationAction(ISD::FPOWI, VT, Custom);
1023 
1024   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
1025   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
1026   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1027   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1028 
1029   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1030 
1031   // Use the default implementation.
1032   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
1033   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
1034   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
1035   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
1036   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
1037   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
1038 
1039   if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
1040     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1041   else
1042     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1043 
1044   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1045   // the default expansion.
1046   InsertFencesForAtomic = false;
1047   if (Subtarget->hasAnyDataBarrier() &&
1048       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1049     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1050     // to ldrex/strex loops already.
1051     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
1052     if (!Subtarget->isThumb() || !Subtarget->isMClass())
1053       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
1054 
1055     // On v8, we have particularly efficient implementations of atomic fences
1056     // if they can be combined with nearby atomic loads and stores.
1057     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
1058       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1059       InsertFencesForAtomic = true;
1060     }
1061   } else {
1062     // If there's anything we can use as a barrier, go through custom lowering
1063     // for ATOMIC_FENCE.
1064     // If target has DMB in thumb, Fences can be inserted.
1065     if (Subtarget->hasDataBarrier())
1066       InsertFencesForAtomic = true;
1067 
1068     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1069                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1070 
1071     // Set them all for expansion, which will force libcalls.
1072     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1073     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1074     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1075     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1076     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1077     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1078     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1079     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1080     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1081     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1082     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1083     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1084     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1085     // Unordered/Monotonic case.
1086     if (!InsertFencesForAtomic) {
1087       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1088       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1089     }
1090   }
1091 
1092   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1093 
1094   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1095   if (!Subtarget->hasV6Ops()) {
1096     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1097     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1098   }
1099   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1100 
1101   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1102       !Subtarget->isThumb1Only()) {
1103     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1104     // iff target supports vfp2.
1105     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1106     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1107   }
1108 
1109   // We want to custom lower some of our intrinsics.
1110   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1111   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1112   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1113   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1114   if (Subtarget->useSjLjEH())
1115     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1116 
1117   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1118   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1119   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1120   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1121   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1122   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1123   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1124   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1125   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1126 
1127   // Thumb-1 cannot currently select ARMISD::SUBE.
1128   if (!Subtarget->isThumb1Only())
1129     setOperationAction(ISD::SETCCE, MVT::i32, Custom);
1130 
1131   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
1132   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1133   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1134   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1135   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1136 
1137   // We don't support sin/cos/fmod/copysign/pow
1138   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1139   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1140   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1141   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1142   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1143   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1144   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1145   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1146   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1147       !Subtarget->isThumb1Only()) {
1148     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1149     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1150   }
1151   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1152   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1153 
1154   if (!Subtarget->hasVFP4()) {
1155     setOperationAction(ISD::FMA, MVT::f64, Expand);
1156     setOperationAction(ISD::FMA, MVT::f32, Expand);
1157   }
1158 
1159   // Various VFP goodness
1160   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1161     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1162     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
1163       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1164       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1165     }
1166 
1167     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1168     if (!Subtarget->hasFP16()) {
1169       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1170       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1171     }
1172   }
1173 
1174   // Combine sin / cos into one node or libcall if possible.
1175   if (Subtarget->hasSinCos()) {
1176     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1177     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1178     if (Subtarget->isTargetWatchABI()) {
1179       setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
1180       setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
1181     }
1182     if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
      // For iOS, we don't want the normal expansion of a libcall to
1184       // sincos. We want to issue a libcall to __sincos_stret.
1185       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1186       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1187     }
1188   }
1189 
1190   // FP-ARMv8 implements a lot of rounding-like FP operations.
1191   if (Subtarget->hasFPARMv8()) {
1192     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1193     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1194     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1195     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1196     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1197     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1198     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1199     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1200     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1201     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1202     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1203     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1204 
1205     if (!Subtarget->isFPOnlySP()) {
1206       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1207       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1208       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1209       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1210       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1211       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1212       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1213       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1214     }
1215   }
1216 
1217   if (Subtarget->hasNEON()) {
1218     // vmin and vmax aren't available in a scalar form, so we use
1219     // a NEON instruction with an undef lane instead.
1220     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
1221     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
1222     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
1223     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
1224     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
1225     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
1226   }
1227 
1228   // We have target-specific dag combine patterns for the following nodes:
1229   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1230   setTargetDAGCombine(ISD::ADD);
1231   setTargetDAGCombine(ISD::SUB);
1232   setTargetDAGCombine(ISD::MUL);
1233   setTargetDAGCombine(ISD::AND);
1234   setTargetDAGCombine(ISD::OR);
1235   setTargetDAGCombine(ISD::XOR);
1236 
1237   if (Subtarget->hasV6Ops())
1238     setTargetDAGCombine(ISD::SRL);
1239 
1240   setStackPointerRegisterToSaveRestore(ARM::SP);
1241 
1242   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1243       !Subtarget->hasVFP2())
1244     setSchedulingPreference(Sched::RegPressure);
1245   else
1246     setSchedulingPreference(Sched::Hybrid);
1247 
1248   //// temporary - rewrite interface to use type
1249   MaxStoresPerMemset = 8;
1250   MaxStoresPerMemsetOptSize = 4;
1251   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1252   MaxStoresPerMemcpyOptSize = 2;
1253   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1254   MaxStoresPerMemmoveOptSize = 2;
1255 
1256   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1257   // are at least 4 bytes aligned.
1258   setMinStackArgumentAlignment(4);
1259 
1260   // Prefer likely predicted branches to selects on out-of-order cores.
1261   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1262 
1263   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1264 }
1265 
1266 bool ARMTargetLowering::useSoftFloat() const {
1267   return Subtarget->useSoftFloat();
1268 }
1269 
1270 // FIXME: It might make sense to define the representative register class as the
1271 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1273 // SPR's representative would be DPR_VFP2. This should work well if register
1274 // pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
1276 // classes' representatives transitively. We have not implemented this because
1277 // of the difficulty prior to coalescing of modeling operand register classes
1278 // due to the common occurrence of cross class copies and subregister insertions
1279 // and extractions.
1280 std::pair<const TargetRegisterClass *, uint8_t>
1281 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1282                                            MVT VT) const {
1283   const TargetRegisterClass *RRC = nullptr;
1284   uint8_t Cost = 1;
1285   switch (VT.SimpleTy) {
1286   default:
1287     return TargetLowering::findRepresentativeClass(TRI, VT);
1288   // Use DPR as representative register class for all floating point
1289   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1290   // the cost is 1 for both f32 and f64.
1291   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1292   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1293     RRC = &ARM::DPRRegClass;
1294     // When NEON is used for SP, only half of the register file is available
1295     // because operations that define both SP and DP results will be constrained
1296     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1297     // coalescing by double-counting the SP regs. See the FIXME above.
1298     if (Subtarget->useNEONForSinglePrecisionFP())
1299       Cost = 2;
1300     break;
1301   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1302   case MVT::v4f32: case MVT::v2f64:
1303     RRC = &ARM::DPRRegClass;
1304     Cost = 2;
1305     break;
1306   case MVT::v4i64:
1307     RRC = &ARM::DPRRegClass;
1308     Cost = 4;
1309     break;
1310   case MVT::v8i64:
1311     RRC = &ARM::DPRRegClass;
1312     Cost = 8;
1313     break;
1314   }
1315   return std::make_pair(RRC, Cost);
1316 }
1317 
1318 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1319   switch ((ARMISD::NodeType)Opcode) {
1320   case ARMISD::FIRST_NUMBER:  break;
1321   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1322   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1323   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1324   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1325   case ARMISD::CALL:          return "ARMISD::CALL";
1326   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1327   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1328   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1329   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1330   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1331   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1332   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1333   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1334   case ARMISD::CMP:           return "ARMISD::CMP";
1335   case ARMISD::CMN:           return "ARMISD::CMN";
1336   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1337   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1338   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1339   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1340   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1341 
1342   case ARMISD::CMOV:          return "ARMISD::CMOV";
1343 
1344   case ARMISD::SSAT:          return "ARMISD::SSAT";
1345 
1346   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1347   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1348   case ARMISD::RRX:           return "ARMISD::RRX";
1349 
1350   case ARMISD::ADDC:          return "ARMISD::ADDC";
1351   case ARMISD::ADDE:          return "ARMISD::ADDE";
1352   case ARMISD::SUBC:          return "ARMISD::SUBC";
1353   case ARMISD::SUBE:          return "ARMISD::SUBE";
1354 
1355   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1356   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1357 
1358   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1359   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1360   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1361 
1362   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1363 
1364   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1365 
1366   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1367 
1368   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1369 
1370   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1371 
1372   case ARMISD::WIN__CHKSTK:   return "ARMISD:::WIN__CHKSTK";
1373   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1374 
1375   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1376   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1377   case ARMISD::VCGE:          return "ARMISD::VCGE";
1378   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1379   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1380   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1381   case ARMISD::VCGT:          return "ARMISD::VCGT";
1382   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1383   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1384   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1385   case ARMISD::VTST:          return "ARMISD::VTST";
1386 
1387   case ARMISD::VSHL:          return "ARMISD::VSHL";
1388   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1389   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1390   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1391   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1392   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1393   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1394   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1395   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1396   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1397   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1398   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1399   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1400   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1401   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1402   case ARMISD::VSLI:          return "ARMISD::VSLI";
1403   case ARMISD::VSRI:          return "ARMISD::VSRI";
1404   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1405   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1406   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1407   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1408   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1409   case ARMISD::VDUP:          return "ARMISD::VDUP";
1410   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1411   case ARMISD::VEXT:          return "ARMISD::VEXT";
1412   case ARMISD::VREV64:        return "ARMISD::VREV64";
1413   case ARMISD::VREV32:        return "ARMISD::VREV32";
1414   case ARMISD::VREV16:        return "ARMISD::VREV16";
1415   case ARMISD::VZIP:          return "ARMISD::VZIP";
1416   case ARMISD::VUZP:          return "ARMISD::VUZP";
1417   case ARMISD::VTRN:          return "ARMISD::VTRN";
1418   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1419   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1420   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1421   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1422   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1423   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1424   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1425   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1426   case ARMISD::BFI:           return "ARMISD::BFI";
1427   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1428   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1429   case ARMISD::VBSL:          return "ARMISD::VBSL";
1430   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1431   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1432   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1433   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1434   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1435   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1436   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1437   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1438   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1439   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1440   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1441   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1442   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1443   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1444   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1445   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1446   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1447   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1448   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1449   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1450   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1451   }
1452   return nullptr;
1453 }
1454 
1455 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1456                                           EVT VT) const {
1457   if (!VT.isVector())
1458     return getPointerTy(DL);
1459   return VT.changeVectorElementTypeToInteger();
1460 }
1461 
1462 /// getRegClassFor - Return the register class that should be used for the
1463 /// specified value type.
1464 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1465   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1466   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1467   // load / store 4 to 8 consecutive D registers.
1468   if (Subtarget->hasNEON()) {
1469     if (VT == MVT::v4i64)
1470       return &ARM::QQPRRegClass;
1471     if (VT == MVT::v8i64)
1472       return &ARM::QQQQPRRegClass;
1473   }
1474   return TargetLowering::getRegClassFor(VT);
1475 }
1476 
1477 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1478 // source/dest is aligned and the copy size is large enough. We therefore want
1479 // to align such objects passed to memory intrinsics.
1480 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1481                                                unsigned &PrefAlign) const {
1482   if (!isa<MemIntrinsic>(CI))
1483     return false;
1484   MinSize = 8;
1485   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1486   // cycle faster than 4-byte aligned LDM.
1487   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1488   return true;
1489 }
1490 
1491 // Create a fast isel object.
1492 FastISel *
1493 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1494                                   const TargetLibraryInfo *libInfo) const {
1495   return ARM::createFastISel(funcInfo, libInfo);
1496 }
1497 
1498 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1499   unsigned NumVals = N->getNumValues();
1500   if (!NumVals)
1501     return Sched::RegPressure;
1502 
1503   for (unsigned i = 0; i != NumVals; ++i) {
1504     EVT VT = N->getValueType(i);
1505     if (VT == MVT::Glue || VT == MVT::Other)
1506       continue;
1507     if (VT.isFloatingPoint() || VT.isVector())
1508       return Sched::ILP;
1509   }
1510 
1511   if (!N->isMachineOpcode())
1512     return Sched::RegPressure;
1513 
1514   // Load are scheduled for latency even if there instruction itinerary
1515   // is not available.
1516   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1517   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1518 
1519   if (MCID.getNumDefs() == 0)
1520     return Sched::RegPressure;
1521   if (!Itins->isEmpty() &&
1522       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1523     return Sched::ILP;
1524 
1525   return Sched::RegPressure;
1526 }
1527 
1528 //===----------------------------------------------------------------------===//
1529 // Lowering Code
1530 //===----------------------------------------------------------------------===//
1531 
1532 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1533 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1534   switch (CC) {
1535   default: llvm_unreachable("Unknown condition code!");
1536   case ISD::SETNE:  return ARMCC::NE;
1537   case ISD::SETEQ:  return ARMCC::EQ;
1538   case ISD::SETGT:  return ARMCC::GT;
1539   case ISD::SETGE:  return ARMCC::GE;
1540   case ISD::SETLT:  return ARMCC::LT;
1541   case ISD::SETLE:  return ARMCC::LE;
1542   case ISD::SETUGT: return ARMCC::HI;
1543   case ISD::SETUGE: return ARMCC::HS;
1544   case ISD::SETULT: return ARMCC::LO;
1545   case ISD::SETULE: return ARMCC::LS;
1546   }
1547 }
1548 
1549 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1550 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1551                         ARMCC::CondCodes &CondCode2) {
1552   CondCode2 = ARMCC::AL;
1553   switch (CC) {
1554   default: llvm_unreachable("Unknown FP condition!");
1555   case ISD::SETEQ:
1556   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1557   case ISD::SETGT:
1558   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1559   case ISD::SETGE:
1560   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1561   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1562   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1563   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1564   case ISD::SETO:   CondCode = ARMCC::VC; break;
1565   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1566   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1567   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1568   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1569   case ISD::SETLT:
1570   case ISD::SETULT: CondCode = ARMCC::LT; break;
1571   case ISD::SETLE:
1572   case ISD::SETULE: CondCode = ARMCC::LE; break;
1573   case ISD::SETNE:
1574   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1575   }
1576 }
1577 
1578 //===----------------------------------------------------------------------===//
1579 //                      Calling Convention Implementation
1580 //===----------------------------------------------------------------------===//
1581 
1582 #include "ARMGenCallingConv.inc"
1583 
1584 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1585 /// account presence of floating point hardware and calling convention
1586 /// limitations, such as support for variadic functions.
1587 CallingConv::ID
1588 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1589                                            bool isVarArg) const {
1590   switch (CC) {
1591   default:
1592     llvm_unreachable("Unsupported calling convention");
1593   case CallingConv::ARM_AAPCS:
1594   case CallingConv::ARM_APCS:
1595   case CallingConv::GHC:
1596     return CC;
1597   case CallingConv::PreserveMost:
1598     return CallingConv::PreserveMost;
1599   case CallingConv::ARM_AAPCS_VFP:
1600   case CallingConv::Swift:
1601     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1602   case CallingConv::C:
1603     if (!Subtarget->isAAPCS_ABI())
1604       return CallingConv::ARM_APCS;
1605     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1606              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1607              !isVarArg)
1608       return CallingConv::ARM_AAPCS_VFP;
1609     else
1610       return CallingConv::ARM_AAPCS;
1611   case CallingConv::Fast:
1612   case CallingConv::CXX_FAST_TLS:
1613     if (!Subtarget->isAAPCS_ABI()) {
1614       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1615         return CallingConv::Fast;
1616       return CallingConv::ARM_APCS;
1617     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1618       return CallingConv::ARM_AAPCS_VFP;
1619     else
1620       return CallingConv::ARM_AAPCS;
1621   }
1622 }
1623 
1624 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1625 /// CallingConvention.
1626 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1627                                                  bool Return,
1628                                                  bool isVarArg) const {
1629   switch (getEffectiveCallingConv(CC, isVarArg)) {
1630   default:
1631     llvm_unreachable("Unsupported calling convention");
1632   case CallingConv::ARM_APCS:
1633     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1634   case CallingConv::ARM_AAPCS:
1635     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1636   case CallingConv::ARM_AAPCS_VFP:
1637     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1638   case CallingConv::Fast:
1639     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1640   case CallingConv::GHC:
1641     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1642   case CallingConv::PreserveMost:
1643     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1644   }
1645 }
1646 
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain        Incoming chain; every register copy is threaded
///                     through it and the final chain is returned.
/// \param InFlag       Incoming glue; updated after every register copy.
/// \param CallConv     Calling convention of the call.
/// \param isVarArg     Whether the call is variadic.
/// \param Ins          Description of the values returned by the call.
/// \param dl           Debug location attached to the created nodes.
/// \param DAG          The selection DAG the nodes are created in.
/// \param InVals       Output: one SDValue is appended per lowered result.
/// \param isThisReturn When true, the first result is not copied from a
///                     register; \p ThisVal is pushed instead.
/// \param ThisVal      Value pushed for the first result when
///                     \p isThisReturn is set.
/// \returns The chain after the last register copy.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext(), Call);
  CCInfo.AnalyzeCallResult(Ins,
                           CCAssignFnForNode(CallConv, /* Return*/ true,
                                             isVarArg));

  // Copy all of the result registers out of their specified physreg.
  // Note that the custom-lowered cases below advance 'i' themselves, since
  // one returned value may occupy several consecutive entries of RVLocs.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64.
      // The f64 is read as two i32 register copies (this location and the
      // next one) which are then combined with VMOVDRR.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      // On targets that are not little-endian the two copies are exchanged
      // before being combined.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // The f64 built above becomes element 0 of the vector; element 1 is
        // assembled the same way from the following two locations.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Plain case: a single register copy in the location's own type.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    // Convert from the location type back to the value type if required.
    // Only Full and BCvt are expected here.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
1731 
1732 /// LowerMemOpCallTo - Store the argument to the stack.
1733 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1734                                             SDValue Arg, const SDLoc &dl,
1735                                             SelectionDAG &DAG,
1736                                             const CCValAssign &VA,
1737                                             ISD::ArgFlagsTy Flags) const {
1738   unsigned LocMemOffset = VA.getLocMemOffset();
1739   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1740   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1741                        StackPtr, PtrOff);
1742   return DAG.getStore(
1743       Chain, dl, Arg, PtrOff,
1744       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1745 }
1746 
1747 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1748                                          SDValue Chain, SDValue &Arg,
1749                                          RegsToPassVector &RegsToPass,
1750                                          CCValAssign &VA, CCValAssign &NextVA,
1751                                          SDValue &StackPtr,
1752                                          SmallVectorImpl<SDValue> &MemOpChains,
1753                                          ISD::ArgFlagsTy Flags) const {
1754 
1755   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1756                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1757   unsigned id = Subtarget->isLittle() ? 0 : 1;
1758   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1759 
1760   if (NextVA.isRegLoc())
1761     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1762   else {
1763     assert(NextVA.isMemLoc());
1764     if (!StackPtr.getNode())
1765       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1766                                     getPointerTy(DAG.getDataLayout()));
1767 
1768     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1769                                            dl, DAG, NextVA,
1770                                            Flags));
1771   }
1772 }
1773 
1774 /// LowerCall - Lowering a call into a callseq_start <-
1775 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
1776 /// nodes.
1777 SDValue
1778 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1779                              SmallVectorImpl<SDValue> &InVals) const {
1780   SelectionDAG &DAG                     = CLI.DAG;
1781   SDLoc &dl                             = CLI.DL;
1782   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1783   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1784   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1785   SDValue Chain                         = CLI.Chain;
1786   SDValue Callee                        = CLI.Callee;
1787   bool &isTailCall                      = CLI.IsTailCall;
1788   CallingConv::ID CallConv              = CLI.CallConv;
1789   bool doesNotRet                       = CLI.DoesNotReturn;
1790   bool isVarArg                         = CLI.IsVarArg;
1791 
1792   MachineFunction &MF = DAG.getMachineFunction();
1793   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1794   bool isThisReturn   = false;
1795   bool isSibCall      = false;
1796   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
1797 
1798   // Disable tail calls if they're not supported.
1799   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
1800     isTailCall = false;
1801 
1802   if (isTailCall) {
1803     // Check if it's really possible to do a tail call.
1804     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1805                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1806                                                    Outs, OutVals, Ins, DAG);
1807     if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
1808       report_fatal_error("failed to perform tail call elimination on a call "
1809                          "site marked musttail");
1810     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1811     // detected sibcalls.
1812     if (isTailCall) {
1813       ++NumTailCalls;
1814       isSibCall = true;
1815     }
1816   }
1817 
1818   // Analyze operands of the call, assigning locations to each operand.
1819   SmallVector<CCValAssign, 16> ArgLocs;
1820   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1821                     *DAG.getContext(), Call);
1822   CCInfo.AnalyzeCallOperands(Outs,
1823                              CCAssignFnForNode(CallConv, /* Return*/ false,
1824                                                isVarArg));
1825 
1826   // Get a count of how many bytes are to be pushed on the stack.
1827   unsigned NumBytes = CCInfo.getNextStackOffset();
1828 
1829   // For tail calls, memory operands are available in our caller's stack.
1830   if (isSibCall)
1831     NumBytes = 0;
1832 
1833   // Adjust the stack pointer for the new arguments...
1834   // These operations are automatically eliminated by the prolog/epilog pass
1835   if (!isSibCall)
1836     Chain = DAG.getCALLSEQ_START(Chain,
1837                                  DAG.getIntPtrConstant(NumBytes, dl, true), dl);
1838 
1839   SDValue StackPtr =
1840       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
1841 
1842   RegsToPassVector RegsToPass;
1843   SmallVector<SDValue, 8> MemOpChains;
1844 
1845   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1846   // of tail call optimization, arguments are handled later.
1847   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1848        i != e;
1849        ++i, ++realArgIdx) {
1850     CCValAssign &VA = ArgLocs[i];
1851     SDValue Arg = OutVals[realArgIdx];
1852     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1853     bool isByVal = Flags.isByVal();
1854 
1855     // Promote the value if needed.
1856     switch (VA.getLocInfo()) {
1857     default: llvm_unreachable("Unknown loc info!");
1858     case CCValAssign::Full: break;
1859     case CCValAssign::SExt:
1860       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1861       break;
1862     case CCValAssign::ZExt:
1863       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1864       break;
1865     case CCValAssign::AExt:
1866       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1867       break;
1868     case CCValAssign::BCvt:
1869       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1870       break;
1871     }
1872 
1873     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1874     if (VA.needsCustom()) {
1875       if (VA.getLocVT() == MVT::v2f64) {
1876         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1877                                   DAG.getConstant(0, dl, MVT::i32));
1878         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1879                                   DAG.getConstant(1, dl, MVT::i32));
1880 
1881         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1882                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1883 
1884         VA = ArgLocs[++i]; // skip ahead to next loc
1885         if (VA.isRegLoc()) {
1886           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1887                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1888         } else {
1889           assert(VA.isMemLoc());
1890 
1891           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1892                                                  dl, DAG, VA, Flags));
1893         }
1894       } else {
1895         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1896                          StackPtr, MemOpChains, Flags);
1897       }
1898     } else if (VA.isRegLoc()) {
1899       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1900         assert(VA.getLocVT() == MVT::i32 &&
1901                "unexpected calling convention register assignment");
1902         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1903                "unexpected use of 'returned'");
1904         isThisReturn = true;
1905       }
1906       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1907     } else if (isByVal) {
1908       assert(VA.isMemLoc());
1909       unsigned offset = 0;
1910 
1911       // True if this byval aggregate will be split between registers
1912       // and memory.
1913       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1914       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
1915 
1916       if (CurByValIdx < ByValArgsCount) {
1917 
1918         unsigned RegBegin, RegEnd;
1919         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1920 
1921         EVT PtrVT =
1922             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1923         unsigned int i, j;
1924         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1925           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
1926           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1927           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1928                                      MachinePointerInfo(),
1929                                      DAG.InferPtrAlignment(AddArg));
1930           MemOpChains.push_back(Load.getValue(1));
1931           RegsToPass.push_back(std::make_pair(j, Load));
1932         }
1933 
1934         // If parameter size outsides register area, "offset" value
1935         // helps us to calculate stack slot for remained part properly.
1936         offset = RegEnd - RegBegin;
1937 
1938         CCInfo.nextInRegsParam();
1939       }
1940 
1941       if (Flags.getByValSize() > 4*offset) {
1942         auto PtrVT = getPointerTy(DAG.getDataLayout());
1943         unsigned LocMemOffset = VA.getLocMemOffset();
1944         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1945         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
1946         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
1947         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
1948         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
1949                                            MVT::i32);
1950         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
1951                                             MVT::i32);
1952 
1953         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1954         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1955         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1956                                           Ops));
1957       }
1958     } else if (!isSibCall) {
1959       assert(VA.isMemLoc());
1960 
1961       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1962                                              dl, DAG, VA, Flags));
1963     }
1964   }
1965 
1966   if (!MemOpChains.empty())
1967     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
1968 
1969   // Build a sequence of copy-to-reg nodes chained together with token chain
1970   // and flag operands which copy the outgoing args into the appropriate regs.
1971   SDValue InFlag;
1972   // Tail call byval lowering might overwrite argument registers so in case of
1973   // tail call optimization the copies to registers are lowered later.
1974   if (!isTailCall)
1975     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1976       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1977                                RegsToPass[i].second, InFlag);
1978       InFlag = Chain.getValue(1);
1979     }
1980 
1981   // For tail calls lower the arguments to the 'real' stack slot.
1982   if (isTailCall) {
1983     // Force all the incoming stack arguments to be loaded from the stack
1984     // before any new outgoing arguments are stored to the stack, because the
1985     // outgoing stack slots may alias the incoming argument stack slots, and
1986     // the alias isn't otherwise explicit. This is slightly more conservative
1987     // than necessary, because it means that each store effectively depends
1988     // on every argument instead of just those arguments it would clobber.
1989 
1990     // Do not flag preceding copytoreg stuff together with the following stuff.
1991     InFlag = SDValue();
1992     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1993       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1994                                RegsToPass[i].second, InFlag);
1995       InFlag = Chain.getValue(1);
1996     }
1997     InFlag = SDValue();
1998   }
1999 
2000   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2001   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2002   // node so that legalize doesn't hack it.
2003   bool isDirect = false;
2004 
2005   const TargetMachine &TM = getTargetMachine();
2006   const Module *Mod = MF.getFunction()->getParent();
2007   const GlobalValue *GV = nullptr;
2008   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2009     GV = G->getGlobal();
2010   bool isStub =
2011       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2012 
2013   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2014   bool isLocalARMFunc = false;
2015   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2016   auto PtrVt = getPointerTy(DAG.getDataLayout());
2017 
2018   if (Subtarget->genLongCalls()) {
2019     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2020            "long-calls codegen is not position independent!");
2021     // Handle a global address or an external symbol. If it's not one of
2022     // those, the target's already in a register, so we don't need to do
2023     // anything extra.
2024     if (isa<GlobalAddressSDNode>(Callee)) {
2025       // Create a constant pool entry for the callee address
2026       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2027       ARMConstantPoolValue *CPV =
2028         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2029 
2030       // Get the address of the callee into a register
2031       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2032       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2033       Callee = DAG.getLoad(
2034           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2035           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2036     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2037       const char *Sym = S->getSymbol();
2038 
2039       // Create a constant pool entry for the callee address
2040       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2041       ARMConstantPoolValue *CPV =
2042         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2043                                       ARMPCLabelIndex, 0);
2044       // Get the address of the callee into a register
2045       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2046       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2047       Callee = DAG.getLoad(
2048           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2049           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2050     }
2051   } else if (isa<GlobalAddressSDNode>(Callee)) {
2052     // If we're optimizing for minimum size and the function is called three or
2053     // more times in this block, we can improve codesize by calling indirectly
2054     // as BLXr has a 16-bit encoding.
2055     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2056     auto *BB = CLI.CS->getParent();
2057     bool PreferIndirect =
2058         Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
2059         count_if(GV->users(), [&BB](const User *U) {
2060           return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
2061         }) > 2;
2062 
2063     if (!PreferIndirect) {
2064       isDirect = true;
2065       bool isDef = GV->isStrongDefinitionForLinker();
2066 
2067       // ARM call to a local ARM function is predicable.
2068       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2069       // tBX takes a register source operand.
2070       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2071         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2072         Callee = DAG.getNode(
2073             ARMISD::WrapperPIC, dl, PtrVt,
2074             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2075         Callee = DAG.getLoad(
2076             PtrVt, dl, DAG.getEntryNode(), Callee,
2077             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2078             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2079                                      MachineMemOperand::MOInvariant);
2080       } else if (Subtarget->isTargetCOFF()) {
2081         assert(Subtarget->isTargetWindows() &&
2082                "Windows is the only supported COFF target");
2083         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2084                                    ? ARMII::MO_DLLIMPORT
2085                                    : ARMII::MO_NO_FLAG;
2086         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
2087                                             TargetFlags);
2088         if (GV->hasDLLImportStorageClass())
2089           Callee =
2090               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2091                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2092                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2093       } else {
2094         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2095       }
2096     }
2097   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2098     isDirect = true;
2099     // tBX takes a register source operand.
2100     const char *Sym = S->getSymbol();
2101     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2102       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2103       ARMConstantPoolValue *CPV =
2104         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2105                                       ARMPCLabelIndex, 4);
2106       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2107       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2108       Callee = DAG.getLoad(
2109           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2110           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2111       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2112       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2113     } else {
2114       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2115     }
2116   }
2117 
2118   // FIXME: handle tail calls differently.
2119   unsigned CallOpc;
2120   if (Subtarget->isThumb()) {
2121     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2122       CallOpc = ARMISD::CALL_NOLINK;
2123     else
2124       CallOpc = ARMISD::CALL;
2125   } else {
2126     if (!isDirect && !Subtarget->hasV5TOps())
2127       CallOpc = ARMISD::CALL_NOLINK;
2128     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2129              // Emit regular call when code size is the priority
2130              !MF.getFunction()->optForMinSize())
2131       // "mov lr, pc; b _foo" to avoid confusing the RSP
2132       CallOpc = ARMISD::CALL_NOLINK;
2133     else
2134       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2135   }
2136 
2137   std::vector<SDValue> Ops;
2138   Ops.push_back(Chain);
2139   Ops.push_back(Callee);
2140 
2141   // Add argument registers to the end of the list so that they are known live
2142   // into the call.
2143   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2144     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2145                                   RegsToPass[i].second.getValueType()));
2146 
2147   // Add a register mask operand representing the call-preserved registers.
2148   if (!isTailCall) {
2149     const uint32_t *Mask;
2150     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2151     if (isThisReturn) {
2152       // For 'this' returns, use the R0-preserving mask if applicable
2153       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2154       if (!Mask) {
2155         // Set isThisReturn to false if the calling convention is not one that
2156         // allows 'returned' to be modeled in this way, so LowerCallResult does
2157         // not try to pass 'this' straight through
2158         isThisReturn = false;
2159         Mask = ARI->getCallPreservedMask(MF, CallConv);
2160       }
2161     } else
2162       Mask = ARI->getCallPreservedMask(MF, CallConv);
2163 
2164     assert(Mask && "Missing call preserved mask for calling convention");
2165     Ops.push_back(DAG.getRegisterMask(Mask));
2166   }
2167 
2168   if (InFlag.getNode())
2169     Ops.push_back(InFlag);
2170 
2171   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2172   if (isTailCall) {
2173     MF.getFrameInfo().setHasTailCall();
2174     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2175   }
2176 
2177   // Returns a chain and a flag for retval copy to use.
2178   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2179   InFlag = Chain.getValue(1);
2180 
2181   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2182                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2183   if (!Ins.empty())
2184     InFlag = Chain.getValue(1);
2185 
2186   // Handle result values, copying them out of physregs into vregs that we
2187   // return.
2188   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2189                          InVals, isThisReturn,
2190                          isThisReturn ? OutVals[0] : SDValue());
2191 }
2192 
2193 /// HandleByVal - Every parameter *after* a byval parameter is passed
2194 /// on the stack.  Remember the next parameter register to allocate,
2195 /// and then confiscate the rest of the parameter registers to insure
2196 /// this.
2197 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2198                                     unsigned Align) const {
2199   assert((State->getCallOrPrologue() == Prologue ||
2200           State->getCallOrPrologue() == Call) &&
2201          "unhandled ParmContext");
2202 
2203   // Byval (as with any stack) slots are always at least 4 byte aligned.
2204   Align = std::max(Align, 4U);
2205 
2206   unsigned Reg = State->AllocateReg(GPRArgRegs);
2207   if (!Reg)
2208     return;
2209 
2210   unsigned AlignInRegs = Align / 4;
2211   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2212   for (unsigned i = 0; i < Waste; ++i)
2213     Reg = State->AllocateReg(GPRArgRegs);
2214 
2215   if (!Reg)
2216     return;
2217 
2218   unsigned Excess = 4 * (ARM::R4 - Reg);
2219 
2220   // Special case when NSAA != SP and parameter size greater than size of
2221   // all remained GPR regs. In that case we can't split parameter, we must
2222   // send it to stack. We also must set NCRN to R4, so waste all
2223   // remained registers.
2224   const unsigned NSAAOffset = State->getNextStackOffset();
2225   if (NSAAOffset != 0 && Size > Excess) {
2226     while (State->AllocateReg(GPRArgRegs))
2227       ;
2228     return;
2229   }
2230 
2231   // First register for byval parameter is the first register that wasn't
2232   // allocated before this method call, so it would be "reg".
2233   // If parameter is small enough to be saved in range [reg, r4), then
2234   // the end (first after last) register would be reg + param-size-in-regs,
2235   // else parameter would be splitted between registers and stack,
2236   // end register would be r4 in this case.
2237   unsigned ByValRegBegin = Reg;
2238   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2239   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2240   // Note, first register is allocated in the beginning of function already,
2241   // allocate remained amount of registers we need.
2242   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2243     State->AllocateReg(GPRArgRegs);
2244   // A byval parameter that is split between registers and memory needs its
2245   // size truncated here.
2246   // In the case where the entire structure fits in registers, we set the
2247   // size in memory to zero.
2248   Size = std::max<int>(Size - Excess, 0);
2249 }
2250 
2251 /// MatchingStackOffset - Return true if the given stack call argument is
2252 /// already available in the same position (relatively) of the caller's
2253 /// incoming argument stack.
2254 static
2255 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2256                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2257                          const TargetInstrInfo *TII) {
2258   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2259   int FI = INT_MAX;
2260   if (Arg.getOpcode() == ISD::CopyFromReg) {
2261     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2262     if (!TargetRegisterInfo::isVirtualRegister(VR))
2263       return false;
2264     MachineInstr *Def = MRI->getVRegDef(VR);
2265     if (!Def)
2266       return false;
2267     if (!Flags.isByVal()) {
2268       if (!TII->isLoadFromStackSlot(*Def, FI))
2269         return false;
2270     } else {
2271       return false;
2272     }
2273   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2274     if (Flags.isByVal())
2275       // ByVal argument is passed in as a pointer but it's now being
2276       // dereferenced. e.g.
2277       // define @foo(%struct.X* %A) {
2278       //   tail call @bar(%struct.X* byval %A)
2279       // }
2280       return false;
2281     SDValue Ptr = Ld->getBasePtr();
2282     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2283     if (!FINode)
2284       return false;
2285     FI = FINode->getIndex();
2286   } else
2287     return false;
2288 
2289   assert(FI != INT_MAX);
2290   if (!MFI.isFixedObjectIndex(FI))
2291     return false;
2292   return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2293 }
2294 
2295 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2296 /// for tail call optimization. Targets which want to do tail call
2297 /// optimization should implement this function.
2298 bool
2299 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2300                                                      CallingConv::ID CalleeCC,
2301                                                      bool isVarArg,
2302                                                      bool isCalleeStructRet,
2303                                                      bool isCallerStructRet,
2304                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2305                                     const SmallVectorImpl<SDValue> &OutVals,
2306                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2307                                                      SelectionDAG& DAG) const {
2308   MachineFunction &MF = DAG.getMachineFunction();
2309   const Function *CallerF = MF.getFunction();
2310   CallingConv::ID CallerCC = CallerF->getCallingConv();
2311 
2312   assert(Subtarget->supportsTailCall());
2313 
2314   // Look for obvious safe cases to perform tail call optimization that do not
2315   // require ABI changes. This is what gcc calls sibcall.
2316 
2317   // Do not sibcall optimize vararg calls unless the call site is not passing
2318   // any arguments.
2319   if (isVarArg && !Outs.empty())
2320     return false;
2321 
2322   // Exception-handling functions need a special set of instructions to indicate
2323   // a return to the hardware. Tail-calling another function would probably
2324   // break this.
2325   if (CallerF->hasFnAttribute("interrupt"))
2326     return false;
2327 
2328   // Also avoid sibcall optimization if either caller or callee uses struct
2329   // return semantics.
2330   if (isCalleeStructRet || isCallerStructRet)
2331     return false;
2332 
2333   // Externally-defined functions with weak linkage should not be
2334   // tail-called on ARM when the OS does not support dynamic
2335   // pre-emption of symbols, as the AAELF spec requires normal calls
2336   // to undefined weak functions to be replaced with a NOP or jump to the
2337   // next instruction. The behaviour of branch instructions in this
2338   // situation (as used for tail calls) is implementation-defined, so we
2339   // cannot rely on the linker replacing the tail call with a return.
2340   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2341     const GlobalValue *GV = G->getGlobal();
2342     const Triple &TT = getTargetMachine().getTargetTriple();
2343     if (GV->hasExternalWeakLinkage() &&
2344         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2345       return false;
2346   }
2347 
2348   // Check that the call results are passed in the same way.
2349   LLVMContext &C = *DAG.getContext();
2350   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2351                                   CCAssignFnForNode(CalleeCC, true, isVarArg),
2352                                   CCAssignFnForNode(CallerCC, true, isVarArg)))
2353     return false;
2354   // The callee has to preserve all registers the caller needs to preserve.
2355   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2356   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2357   if (CalleeCC != CallerCC) {
2358     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2359     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2360       return false;
2361   }
2362 
2363   // If Caller's vararg or byval argument has been split between registers and
2364   // stack, do not perform tail call, since part of the argument is in caller's
2365   // local frame.
2366   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2367   if (AFI_Caller->getArgRegsSaveSize())
2368     return false;
2369 
2370   // If the callee takes no arguments then go on to check the results of the
2371   // call.
2372   if (!Outs.empty()) {
2373     // Check if stack adjustment is needed. For now, do not do this if any
2374     // argument is passed on the stack.
2375     SmallVector<CCValAssign, 16> ArgLocs;
2376     ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
2377     CCInfo.AnalyzeCallOperands(Outs,
2378                                CCAssignFnForNode(CalleeCC, false, isVarArg));
2379     if (CCInfo.getNextStackOffset()) {
2380       // Check if the arguments are already laid out in the right way as
2381       // the caller's fixed stack objects.
2382       MachineFrameInfo &MFI = MF.getFrameInfo();
2383       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2384       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2385       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2386            i != e;
2387            ++i, ++realArgIdx) {
2388         CCValAssign &VA = ArgLocs[i];
2389         EVT RegVT = VA.getLocVT();
2390         SDValue Arg = OutVals[realArgIdx];
2391         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2392         if (VA.getLocInfo() == CCValAssign::Indirect)
2393           return false;
2394         if (VA.needsCustom()) {
2395           // f64 and vector types are split into multiple registers or
2396           // register/stack-slot combinations.  The types will not match
2397           // the registers; give up on memory f64 refs until we figure
2398           // out what to do about this.
2399           if (!VA.isRegLoc())
2400             return false;
2401           if (!ArgLocs[++i].isRegLoc())
2402             return false;
2403           if (RegVT == MVT::v2f64) {
2404             if (!ArgLocs[++i].isRegLoc())
2405               return false;
2406             if (!ArgLocs[++i].isRegLoc())
2407               return false;
2408           }
2409         } else if (!VA.isRegLoc()) {
2410           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2411                                    MFI, MRI, TII))
2412             return false;
2413         }
2414       }
2415     }
2416 
2417     const MachineRegisterInfo &MRI = MF.getRegInfo();
2418     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2419       return false;
2420   }
2421 
2422   return true;
2423 }
2424 
2425 bool
2426 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2427                                   MachineFunction &MF, bool isVarArg,
2428                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2429                                   LLVMContext &Context) const {
2430   SmallVector<CCValAssign, 16> RVLocs;
2431   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2432   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2433                                                     isVarArg));
2434 }
2435 
2436 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2437                                     const SDLoc &DL, SelectionDAG &DAG) {
2438   const MachineFunction &MF = DAG.getMachineFunction();
2439   const Function *F = MF.getFunction();
2440 
2441   StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
2442 
2443   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2444   // version of the "preferred return address". These offsets affect the return
2445   // instruction if this is a return from PL1 without hypervisor extensions.
2446   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2447   //    SWI:     0      "subs pc, lr, #0"
2448   //    ABORT:   +4     "subs pc, lr, #4"
2449   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2450   // UNDEF varies depending on where the exception came from ARM or Thumb
2451   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2452 
2453   int64_t LROffset;
2454   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2455       IntKind == "ABORT")
2456     LROffset = 4;
2457   else if (IntKind == "SWI" || IntKind == "UNDEF")
2458     LROffset = 0;
2459   else
2460     report_fatal_error("Unsupported interrupt attribute. If present, value "
2461                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2462 
2463   RetOps.insert(RetOps.begin() + 1,
2464                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2465 
2466   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2467 }
2468 
2469 SDValue
2470 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2471                                bool isVarArg,
2472                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2473                                const SmallVectorImpl<SDValue> &OutVals,
2474                                const SDLoc &dl, SelectionDAG &DAG) const {
2475 
2476   // CCValAssign - represent the assignment of the return value to a location.
2477   SmallVector<CCValAssign, 16> RVLocs;
2478 
2479   // CCState - Info about the registers and stack slots.
2480   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2481                     *DAG.getContext(), Call);
2482 
2483   // Analyze outgoing return values.
2484   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
2485                                                isVarArg));
2486 
2487   SDValue Flag;
2488   SmallVector<SDValue, 4> RetOps;
2489   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2490   bool isLittleEndian = Subtarget->isLittle();
2491 
2492   MachineFunction &MF = DAG.getMachineFunction();
2493   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2494   AFI->setReturnRegsCount(RVLocs.size());
2495 
2496   // Copy the result values into the output registers.
2497   for (unsigned i = 0, realRVLocIdx = 0;
2498        i != RVLocs.size();
2499        ++i, ++realRVLocIdx) {
2500     CCValAssign &VA = RVLocs[i];
2501     assert(VA.isRegLoc() && "Can only return in registers!");
2502 
2503     SDValue Arg = OutVals[realRVLocIdx];
2504 
2505     switch (VA.getLocInfo()) {
2506     default: llvm_unreachable("Unknown loc info!");
2507     case CCValAssign::Full: break;
2508     case CCValAssign::BCvt:
2509       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2510       break;
2511     }
2512 
2513     if (VA.needsCustom()) {
2514       if (VA.getLocVT() == MVT::v2f64) {
2515         // Extract the first half and return it in two registers.
2516         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2517                                    DAG.getConstant(0, dl, MVT::i32));
2518         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2519                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2520 
2521         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2522                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
2523                                  Flag);
2524         Flag = Chain.getValue(1);
2525         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2526         VA = RVLocs[++i]; // skip ahead to next loc
2527         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2528                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
2529                                  Flag);
2530         Flag = Chain.getValue(1);
2531         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2532         VA = RVLocs[++i]; // skip ahead to next loc
2533 
2534         // Extract the 2nd half and fall through to handle it as an f64 value.
2535         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2536                           DAG.getConstant(1, dl, MVT::i32));
2537       }
2538       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2539       // available.
2540       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2541                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
2542       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2543                                fmrrd.getValue(isLittleEndian ? 0 : 1),
2544                                Flag);
2545       Flag = Chain.getValue(1);
2546       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2547       VA = RVLocs[++i]; // skip ahead to next loc
2548       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2549                                fmrrd.getValue(isLittleEndian ? 1 : 0),
2550                                Flag);
2551     } else
2552       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2553 
2554     // Guarantee that all emitted copies are
2555     // stuck together, avoiding something bad.
2556     Flag = Chain.getValue(1);
2557     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2558   }
2559   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2560   const MCPhysReg *I =
2561       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2562   if (I) {
2563     for (; *I; ++I) {
2564       if (ARM::GPRRegClass.contains(*I))
2565         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2566       else if (ARM::DPRRegClass.contains(*I))
2567         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
2568       else
2569         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2570     }
2571   }
2572 
2573   // Update chain and glue.
2574   RetOps[0] = Chain;
2575   if (Flag.getNode())
2576     RetOps.push_back(Flag);
2577 
2578   // CPUs which aren't M-class use a special sequence to return from
2579   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
2580   // though we use "subs pc, lr, #N").
2581   //
2582   // M-class CPUs actually use a normal return sequence with a special
2583   // (hardware-provided) value in LR, so the normal code path works.
2584   if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
2585       !Subtarget->isMClass()) {
2586     if (Subtarget->isThumb1Only())
2587       report_fatal_error("interrupt attribute is not supported in Thumb1");
2588     return LowerInterruptReturn(RetOps, dl, DAG);
2589   }
2590 
2591   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
2592 }
2593 
2594 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2595   if (N->getNumValues() != 1)
2596     return false;
2597   if (!N->hasNUsesOfValue(1, 0))
2598     return false;
2599 
2600   SDValue TCChain = Chain;
2601   SDNode *Copy = *N->use_begin();
2602   if (Copy->getOpcode() == ISD::CopyToReg) {
2603     // If the copy has a glue operand, we conservatively assume it isn't safe to
2604     // perform a tail call.
2605     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2606       return false;
2607     TCChain = Copy->getOperand(0);
2608   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2609     SDNode *VMov = Copy;
2610     // f64 returned in a pair of GPRs.
2611     SmallPtrSet<SDNode*, 2> Copies;
2612     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2613          UI != UE; ++UI) {
2614       if (UI->getOpcode() != ISD::CopyToReg)
2615         return false;
2616       Copies.insert(*UI);
2617     }
2618     if (Copies.size() > 2)
2619       return false;
2620 
2621     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2622          UI != UE; ++UI) {
2623       SDValue UseChain = UI->getOperand(0);
2624       if (Copies.count(UseChain.getNode()))
2625         // Second CopyToReg
2626         Copy = *UI;
2627       else {
2628         // We are at the top of this chain.
2629         // If the copy has a glue operand, we conservatively assume it
2630         // isn't safe to perform a tail call.
2631         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2632           return false;
2633         // First CopyToReg
2634         TCChain = UseChain;
2635       }
2636     }
2637   } else if (Copy->getOpcode() == ISD::BITCAST) {
2638     // f32 returned in a single GPR.
2639     if (!Copy->hasOneUse())
2640       return false;
2641     Copy = *Copy->use_begin();
2642     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2643       return false;
2644     // If the copy has a glue operand, we conservatively assume it isn't safe to
2645     // perform a tail call.
2646     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2647       return false;
2648     TCChain = Copy->getOperand(0);
2649   } else {
2650     return false;
2651   }
2652 
2653   bool HasRet = false;
2654   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2655        UI != UE; ++UI) {
2656     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2657         UI->getOpcode() != ARMISD::INTRET_FLAG)
2658       return false;
2659     HasRet = true;
2660   }
2661 
2662   if (!HasRet)
2663     return false;
2664 
2665   Chain = TCChain;
2666   return true;
2667 }
2668 
2669 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2670   if (!Subtarget->supportsTailCall())
2671     return false;
2672 
2673   auto Attr =
2674       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2675   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2676     return false;
2677 
2678   return true;
2679 }
2680 
2681 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2682 // and pass the lower and high parts through.
2683 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2684   SDLoc DL(Op);
2685   SDValue WriteValue = Op->getOperand(2);
2686 
2687   // This function is only supposed to be called for i64 type argument.
2688   assert(WriteValue.getValueType() == MVT::i64
2689           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2690 
2691   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2692                            DAG.getConstant(0, DL, MVT::i32));
2693   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2694                            DAG.getConstant(1, DL, MVT::i32));
2695   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2696   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2697 }
2698 
2699 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2700 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2701 // one of the above mentioned nodes. It has to be wrapped because otherwise
2702 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2703 // be used to form addressing mode. These wrapped nodes will be selected
2704 // into MOVi.
2705 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2706   EVT PtrVT = Op.getValueType();
2707   // FIXME there is no actual debug info here
2708   SDLoc dl(Op);
2709   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2710   SDValue Res;
2711   if (CP->isMachineConstantPoolEntry())
2712     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2713                                     CP->getAlignment());
2714   else
2715     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2716                                     CP->getAlignment());
2717   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2718 }
2719 
/// Report the jump table entry encoding used for ARM, which is always
/// MachineJumpTableInfo::EK_Inline.
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}
2723 
2724 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2725                                              SelectionDAG &DAG) const {
2726   MachineFunction &MF = DAG.getMachineFunction();
2727   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2728   unsigned ARMPCLabelIndex = 0;
2729   SDLoc DL(Op);
2730   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2731   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2732   SDValue CPAddr;
2733   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
2734   if (!IsPositionIndependent) {
2735     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2736   } else {
2737     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2738     ARMPCLabelIndex = AFI->createPICLabelUId();
2739     ARMConstantPoolValue *CPV =
2740       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2741                                       ARMCP::CPBlockAddress, PCAdj);
2742     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2743   }
2744   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2745   SDValue Result = DAG.getLoad(
2746       PtrVT, DL, DAG.getEntryNode(), CPAddr,
2747       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2748   if (!IsPositionIndependent)
2749     return Result;
2750   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2751   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2752 }
2753 
/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  // Continue on the load's output chain so the call is ordered after it.
  Chain = FuncTLVGet.getValue(1);

  // Record in the frame info that this function adjusts the stack (a call is
  // emitted below).
  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // The result is read back from R0, glued to the call node.
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
2821 
2822 SDValue
2823 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
2824                                                 SelectionDAG &DAG) const {
2825   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
2826 
2827   SDValue Chain = DAG.getEntryNode();
2828   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2829   SDLoc DL(Op);
2830 
2831   // Load the current TEB (thread environment block)
2832   SDValue Ops[] = {Chain,
2833                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
2834                    DAG.getConstant(15, DL, MVT::i32),
2835                    DAG.getConstant(0, DL, MVT::i32),
2836                    DAG.getConstant(13, DL, MVT::i32),
2837                    DAG.getConstant(0, DL, MVT::i32),
2838                    DAG.getConstant(2, DL, MVT::i32)};
2839   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
2840                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
2841 
2842   SDValue TEB = CurrentTEB.getValue(0);
2843   Chain = CurrentTEB.getValue(1);
2844 
2845   // Load the ThreadLocalStoragePointer from the TEB
2846   // A pointer to the TLS array is located at offset 0x2c from the TEB.
2847   SDValue TLSArray =
2848       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
2849   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
2850 
2851   // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
2852   // offset into the TLSArray.
2853 
2854   // Load the TLS index from the C runtime
2855   SDValue TLSIndex =
2856       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
2857   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
2858   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
2859 
2860   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
2861                               DAG.getConstant(2, DL, MVT::i32));
2862   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
2863                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
2864                             MachinePointerInfo());
2865 
2866   // Get the offset of the start of the .tls section (section base)
2867   const auto *GA = cast<GlobalAddressSDNode>(Op);
2868   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
2869   SDValue Offset = DAG.getLoad(
2870       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
2871                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
2872       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2873 
2874   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
2875 }
2876 
2877 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
2878 SDValue
2879 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2880                                                  SelectionDAG &DAG) const {
2881   SDLoc dl(GA);
2882   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2883   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2884   MachineFunction &MF = DAG.getMachineFunction();
2885   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2886   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2887   ARMConstantPoolValue *CPV =
2888     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2889                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2890   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2891   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2892   Argument = DAG.getLoad(
2893       PtrVT, dl, DAG.getEntryNode(), Argument,
2894       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2895   SDValue Chain = Argument.getValue(1);
2896 
2897   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2898   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2899 
2900   // call __tls_get_addr.
2901   ArgListTy Args;
2902   ArgListEntry Entry;
2903   Entry.Node = Argument;
2904   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2905   Args.push_back(Entry);
2906 
2907   // FIXME: is there useful debug info available here?
2908   TargetLowering::CallLoweringInfo CLI(DAG);
2909   CLI.setDebugLoc(dl).setChain(Chain)
2910     .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
2911                DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
2912 
2913   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2914   return CallResult.first;
2915 }
2916 
2917 // Lower ISD::GlobalTLSAddress using the "initial exec" or
2918 // "local exec" model.
2919 SDValue
2920 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2921                                         SelectionDAG &DAG,
2922                                         TLSModel::Model model) const {
2923   const GlobalValue *GV = GA->getGlobal();
2924   SDLoc dl(GA);
2925   SDValue Offset;
2926   SDValue Chain = DAG.getEntryNode();
2927   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2928   // Get the Thread Pointer
2929   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2930 
2931   if (model == TLSModel::InitialExec) {
2932     MachineFunction &MF = DAG.getMachineFunction();
2933     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2934     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2935     // Initial exec model.
2936     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2937     ARMConstantPoolValue *CPV =
2938       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2939                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2940                                       true);
2941     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2942     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2943     Offset = DAG.getLoad(
2944         PtrVT, dl, Chain, Offset,
2945         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2946     Chain = Offset.getValue(1);
2947 
2948     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2949     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2950 
2951     Offset = DAG.getLoad(
2952         PtrVT, dl, Chain, Offset,
2953         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2954   } else {
2955     // local exec model
2956     assert(model == TLSModel::LocalExec);
2957     ARMConstantPoolValue *CPV =
2958       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2959     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2960     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2961     Offset = DAG.getLoad(
2962         PtrVT, dl, Chain, Offset,
2963         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2964   }
2965 
2966   // The address of the thread local variable is the add of the thread
2967   // pointer with the offset of the variable.
2968   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2969 }
2970 
2971 SDValue
2972 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2973   if (Subtarget->isTargetDarwin())
2974     return LowerGlobalTLSAddressDarwin(Op, DAG);
2975 
2976   if (Subtarget->isTargetWindows())
2977     return LowerGlobalTLSAddressWindows(Op, DAG);
2978 
2979   // TODO: implement the "local dynamic" model
2980   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
2981   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2982   if (DAG.getTarget().Options.EmulatedTLS)
2983     return LowerToTLSEmulatedModel(GA, DAG);
2984 
2985   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2986 
2987   switch (model) {
2988     case TLSModel::GeneralDynamic:
2989     case TLSModel::LocalDynamic:
2990       return LowerToTLSGeneralDynamicModel(GA, DAG);
2991     case TLSModel::InitialExec:
2992     case TLSModel::LocalExec:
2993       return LowerToTLSExecModels(GA, DAG, model);
2994   }
2995   llvm_unreachable("bogus TLS model");
2996 }
2997 
2998 /// Return true if all users of V are within function F, looking through
2999 /// ConstantExprs.
3000 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3001   SmallVector<const User*,4> Worklist;
3002   for (auto *U : V->users())
3003     Worklist.push_back(U);
3004   while (!Worklist.empty()) {
3005     auto *U = Worklist.pop_back_val();
3006     if (isa<ConstantExpr>(U)) {
3007       for (auto *UU : U->users())
3008         Worklist.push_back(UU);
3009       continue;
3010     }
3011 
3012     auto *I = dyn_cast<Instruction>(U);
3013     if (!I || I->getParent()->getParent() != F)
3014       return false;
3015   }
3016   return true;
3017 }
3018 
3019 /// Return true if all users of V are within some (any) function, looking through
3020 /// ConstantExprs. In other words, are there any global constant users?
3021 static bool allUsersAreInFunctions(const Value *V) {
3022   SmallVector<const User*,4> Worklist;
3023   for (auto *U : V->users())
3024     Worklist.push_back(U);
3025   while (!Worklist.empty()) {
3026     auto *U = Worklist.pop_back_val();
3027     if (isa<ConstantExpr>(U)) {
3028       for (auto *UU : U->users())
3029         Worklist.push_back(UU);
3030       continue;
3031     }
3032 
3033     if (!isa<Instruction>(U))
3034       return false;
3035   }
3036   return true;
3037 }
3038 
3039 // Return true if T is an integer, float or an array/vector of either.
3040 static bool isSimpleType(Type *T) {
3041   if (T->isIntegerTy() || T->isFloatingPointTy())
3042     return true;
3043   Type *SubT = nullptr;
3044   if (T->isArrayTy())
3045     SubT = T->getArrayElementType();
3046   else if (T->isVectorTy())
3047     SubT = T->getVectorElementType();
3048   else
3049     return false;
3050   return SubT->isIntegerTy() || SubT->isFloatingPointTy();
3051 }
3052 
3053 static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
3054                                      EVT PtrVT, SDLoc dl) {
3055   // If we're creating a pool entry for a constant global with unnamed address,
3056   // and the global is small enough, we can emit it inline into the constant pool
3057   // to save ourselves an indirection.
3058   //
3059   // This is a win if the constant is only used in one function (so it doesn't
3060   // need to be duplicated) or duplicating the constant wouldn't increase code
3061   // size (implying the constant is no larger than 4 bytes).
3062   const Function *F = DAG.getMachineFunction().getFunction();
3063 
3064   // We rely on this decision to inline being idemopotent and unrelated to the
3065   // use-site. We know that if we inline a variable at one use site, we'll
3066   // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3067   // doesn't know about this optimization, so bail out if it's enabled else
3068   // we could decide to inline here (and thus never emit the GV) but require
3069   // the GV from fast-isel generated code.
3070   if (!EnableConstpoolPromotion ||
3071       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3072       return SDValue();
3073 
3074   auto *GVar = dyn_cast<GlobalVariable>(GV);
3075   if (!GVar || !GVar->hasInitializer() ||
3076       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3077       !GVar->hasLocalLinkage())
3078     return SDValue();
3079 
3080   // Ensure that we don't try and inline any type that contains pointers. If
3081   // we inline a value that contains relocations, we move the relocations from
3082   // .data to .text which is not ideal.
3083   auto *Init = GVar->getInitializer();
3084   if (!isSimpleType(Init->getType()))
3085     return SDValue();
3086 
3087   // The constant islands pass can only really deal with alignment requests
3088   // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3089   // any type wanting greater alignment requirements than 4 bytes. We also
3090   // can only promote constants that are multiples of 4 bytes in size or
3091   // are paddable to a multiple of 4. Currently we only try and pad constants
3092   // that are strings for simplicity.
3093   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3094   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3095   unsigned Align = GVar->getAlignment();
3096   unsigned RequiredPadding = 4 - (Size % 4);
3097   bool PaddingPossible =
3098     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3099   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
3100     return SDValue();
3101 
3102   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3103   MachineFunction &MF = DAG.getMachineFunction();
3104   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3105 
3106   // We can't bloat the constant pool too much, else the ConstantIslands pass
3107   // may fail to converge. If we haven't promoted this global yet (it may have
3108   // multiple uses), and promoting it would increase the constant pool size (Sz
3109   // > 4), ensure we have space to do so up to MaxTotal.
3110   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3111     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3112         ConstpoolPromotionMaxTotal)
3113       return SDValue();
3114 
3115   // This is only valid if all users are in a single function OR it has users
3116   // in multiple functions but it no larger than a pointer. We also check if
3117   // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
3118   // address taken.
3119   if (!allUsersAreInFunction(GVar, F) &&
3120       !(Size <= 4 && allUsersAreInFunctions(GVar)))
3121     return SDValue();
3122 
3123   // We're going to inline this global. Pad it out if needed.
3124   if (RequiredPadding != 4) {
3125     StringRef S = CDAInit->getAsString();
3126 
3127     SmallVector<uint8_t,16> V(S.size());
3128     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3129     while (RequiredPadding--)
3130       V.push_back(0);
3131     Init = ConstantDataArray::get(*DAG.getContext(), V);
3132   }
3133 
3134   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3135   SDValue CPAddr =
3136     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
3137   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3138     AFI->markGlobalAsPromotedToConstantPool(GVar);
3139     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3140                                       PaddedSize - 4);
3141   }
3142   ++NumConstpoolPromoted;
3143   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3144 }
3145 
3146 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3147                                                  SelectionDAG &DAG) const {
3148   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3149   SDLoc dl(Op);
3150   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3151   const TargetMachine &TM = getTargetMachine();
3152   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3153     GV = GA->getBaseObject();
3154   bool IsRO =
3155       (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
3156       isa<Function>(GV);
3157 
3158   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3159     if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
3160       return V;
3161 
3162   if (isPositionIndependent()) {
3163     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3164 
3165     MachineFunction &MF = DAG.getMachineFunction();
3166     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3167     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3168     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3169     SDLoc dl(Op);
3170     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3171     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
3172         GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
3173         UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
3174         /*AddCurrentAddress=*/UseGOT_PREL);
3175     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3176     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3177     SDValue Result = DAG.getLoad(
3178         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3179         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3180     SDValue Chain = Result.getValue(1);
3181     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3182     Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3183     if (UseGOT_PREL)
3184       Result =
3185           DAG.getLoad(PtrVT, dl, Chain, Result,
3186                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3187     return Result;
3188   } else if (Subtarget->isROPI() && IsRO) {
3189     // PC-relative.
3190     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3191     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3192     return Result;
3193   } else if (Subtarget->isRWPI() && !IsRO) {
3194     // SB-relative.
3195     ARMConstantPoolValue *CPV =
3196       ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3197     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3198     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3199     SDValue G = DAG.getLoad(
3200         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3201         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3202     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3203     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
3204     return Result;
3205   }
3206 
3207   // If we have T2 ops, we can materialize the address directly via movt/movw
3208   // pair. This is always cheaper.
3209   if (Subtarget->useMovt(DAG.getMachineFunction())) {
3210     ++NumMovwMovt;
3211     // FIXME: Once remat is capable of dealing with instructions with register
3212     // operands, expand this into two nodes.
3213     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3214                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3215   } else {
3216     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
3217     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3218     return DAG.getLoad(
3219         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3220         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3221   }
3222 }
3223 
3224 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3225                                                     SelectionDAG &DAG) const {
3226   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3227          "ROPI/RWPI not currently supported for Darwin");
3228   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3229   SDLoc dl(Op);
3230   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3231 
3232   if (Subtarget->useMovt(DAG.getMachineFunction()))
3233     ++NumMovwMovt;
3234 
3235   // FIXME: Once remat is capable of dealing with instructions with register
3236   // operands, expand this into multiple nodes
3237   unsigned Wrapper =
3238       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3239 
3240   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3241   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3242 
3243   if (Subtarget->isGVIndirectSymbol(GV))
3244     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3245                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3246   return Result;
3247 }
3248 
3249 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3250                                                      SelectionDAG &DAG) const {
3251   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3252   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
3253          "Windows on ARM expects to use movw/movt");
3254   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3255          "ROPI/RWPI not currently supported for Windows");
3256 
3257   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3258   const ARMII::TOF TargetFlags =
3259     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
3260   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3261   SDValue Result;
3262   SDLoc DL(Op);
3263 
3264   ++NumMovwMovt;
3265 
3266   // FIXME: Once remat is capable of dealing with instructions with register
3267   // operands, expand this into two nodes.
3268   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3269                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
3270                                                   TargetFlags));
3271   if (GV->hasDLLImportStorageClass())
3272     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3273                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3274   return Result;
3275 }
3276 
3277 SDValue
3278 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3279   SDLoc dl(Op);
3280   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3281   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3282                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3283                      Op.getOperand(1), Val);
3284 }
3285 
3286 SDValue
3287 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3288   SDLoc dl(Op);
3289   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3290                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3291 }
3292 
3293 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3294                                                       SelectionDAG &DAG) const {
3295   SDLoc dl(Op);
3296   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3297                      Op.getOperand(0));
3298 }
3299 
3300 SDValue
3301 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3302                                           const ARMSubtarget *Subtarget) const {
3303   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3304   SDLoc dl(Op);
3305   switch (IntNo) {
3306   default: return SDValue();    // Don't custom lower most intrinsics.
3307   case Intrinsic::arm_rbit: {
3308     assert(Op.getOperand(1).getValueType() == MVT::i32 &&
3309            "RBIT intrinsic must have i32 type!");
3310     return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
3311   }
3312   case Intrinsic::thread_pointer: {
3313     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3314     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3315   }
3316   case Intrinsic::eh_sjlj_lsda: {
3317     MachineFunction &MF = DAG.getMachineFunction();
3318     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3319     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3320     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3321     SDValue CPAddr;
3322     bool IsPositionIndependent = isPositionIndependent();
3323     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3324     ARMConstantPoolValue *CPV =
3325       ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
3326                                       ARMCP::CPLSDA, PCAdj);
3327     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3328     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3329     SDValue Result = DAG.getLoad(
3330         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3331         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3332 
3333     if (IsPositionIndependent) {
3334       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3335       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3336     }
3337     return Result;
3338   }
3339   case Intrinsic::arm_neon_vmulls:
3340   case Intrinsic::arm_neon_vmullu: {
3341     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3342       ? ARMISD::VMULLs : ARMISD::VMULLu;
3343     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3344                        Op.getOperand(1), Op.getOperand(2));
3345   }
3346   case Intrinsic::arm_neon_vminnm:
3347   case Intrinsic::arm_neon_vmaxnm: {
3348     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3349       ? ISD::FMINNUM : ISD::FMAXNUM;
3350     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3351                        Op.getOperand(1), Op.getOperand(2));
3352   }
3353   case Intrinsic::arm_neon_vminu:
3354   case Intrinsic::arm_neon_vmaxu: {
3355     if (Op.getValueType().isFloatingPoint())
3356       return SDValue();
3357     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3358       ? ISD::UMIN : ISD::UMAX;
3359     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3360                          Op.getOperand(1), Op.getOperand(2));
3361   }
3362   case Intrinsic::arm_neon_vmins:
3363   case Intrinsic::arm_neon_vmaxs: {
3364     // v{min,max}s is overloaded between signed integers and floats.
3365     if (!Op.getValueType().isFloatingPoint()) {
3366       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3367         ? ISD::SMIN : ISD::SMAX;
3368       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3369                          Op.getOperand(1), Op.getOperand(2));
3370     }
3371     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3372       ? ISD::FMINNAN : ISD::FMAXNAN;
3373     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3374                        Op.getOperand(1), Op.getOperand(2));
3375   }
3376   }
3377 }
3378 
3379 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3380                                  const ARMSubtarget *Subtarget) {
3381   // FIXME: handle "fence singlethread" more efficiently.
3382   SDLoc dl(Op);
3383   if (!Subtarget->hasDataBarrier()) {
3384     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3385     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3386     // here.
3387     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3388            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3389     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3390                        DAG.getConstant(0, dl, MVT::i32));
3391   }
3392 
3393   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3394   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3395   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3396   if (Subtarget->isMClass()) {
3397     // Only a full system barrier exists in the M-class architectures.
3398     Domain = ARM_MB::SY;
3399   } else if (Subtarget->preferISHSTBarriers() &&
3400              Ord == AtomicOrdering::Release) {
3401     // Swift happens to implement ISHST barriers in a way that's compatible with
3402     // Release semantics but weaker than ISH so we'd be fools not to use
3403     // it. Beware: other processors probably don't!
3404     Domain = ARM_MB::ISHST;
3405   }
3406 
3407   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3408                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3409                      DAG.getConstant(Domain, dl, MVT::i32));
3410 }
3411 
3412 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
3413                              const ARMSubtarget *Subtarget) {
3414   // ARM pre v5TE and Thumb1 does not have preload instructions.
3415   if (!(Subtarget->isThumb2() ||
3416         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
3417     // Just preserve the chain.
3418     return Op.getOperand(0);
3419 
3420   SDLoc dl(Op);
3421   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
3422   if (!isRead &&
3423       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
3424     // ARMv7 with MP extension has PLDW.
3425     return Op.getOperand(0);
3426 
3427   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3428   if (Subtarget->isThumb()) {
3429     // Invert the bits.
3430     isRead = ~isRead & 1;
3431     isData = ~isData & 1;
3432   }
3433 
3434   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
3435                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
3436                      DAG.getConstant(isData, dl, MVT::i32));
3437 }
3438 
3439 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3440   MachineFunction &MF = DAG.getMachineFunction();
3441   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3442 
3443   // vastart just stores the address of the VarArgsFrameIndex slot into the
3444   // memory location argument.
3445   SDLoc dl(Op);
3446   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3447   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3448   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3449   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3450                       MachinePointerInfo(SV));
3451 }
3452 
3453 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
3454                                                 CCValAssign &NextVA,
3455                                                 SDValue &Root,
3456                                                 SelectionDAG &DAG,
3457                                                 const SDLoc &dl) const {
3458   MachineFunction &MF = DAG.getMachineFunction();
3459   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3460 
3461   const TargetRegisterClass *RC;
3462   if (AFI->isThumb1OnlyFunction())
3463     RC = &ARM::tGPRRegClass;
3464   else
3465     RC = &ARM::GPRRegClass;
3466 
3467   // Transform the arguments stored in physical registers into virtual ones.
3468   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3469   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3470 
3471   SDValue ArgValue2;
3472   if (NextVA.isMemLoc()) {
3473     MachineFrameInfo &MFI = MF.getFrameInfo();
3474     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
3475 
3476     // Create load node to retrieve arguments from the stack.
3477     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3478     ArgValue2 = DAG.getLoad(
3479         MVT::i32, dl, Root, FIN,
3480         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3481   } else {
3482     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3483     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3484   }
3485   if (!Subtarget->isLittle())
3486     std::swap (ArgValue, ArgValue2);
3487   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
3488 }
3489 
3490 // The remaining GPRs hold either the beginning of variable-argument
3491 // data, or the beginning of an aggregate passed by value (usually
3492 // byval).  Either way, we allocate stack slots adjacent to the data
3493 // provided by our caller, and store the unallocated registers there.
3494 // If this is a variadic function, the va_list pointer will begin with
3495 // these values; otherwise, this reassembles a (byval) structure that
3496 // was split between registers and memory.
3497 // Return: The frame index registers were stored into.
3498 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
3499                                       const SDLoc &dl, SDValue &Chain,
3500                                       const Value *OrigArg,
3501                                       unsigned InRegsParamRecordIdx,
3502                                       int ArgOffset, unsigned ArgSize) const {
3503   // Currently, two use-cases possible:
3504   // Case #1. Non-var-args function, and we meet first byval parameter.
3505   //          Setup first unallocated register as first byval register;
3506   //          eat all remained registers
3507   //          (these two actions are performed by HandleByVal method).
3508   //          Then, here, we initialize stack frame with
3509   //          "store-reg" instructions.
3510   // Case #2. Var-args function, that doesn't contain byval parameters.
3511   //          The same: eat all remained unallocated registers,
3512   //          initialize stack frame.
3513 
3514   MachineFunction &MF = DAG.getMachineFunction();
3515   MachineFrameInfo &MFI = MF.getFrameInfo();
3516   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3517   unsigned RBegin, REnd;
3518   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
3519     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
3520   } else {
3521     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3522     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
3523     REnd = ARM::R4;
3524   }
3525 
3526   if (REnd != RBegin)
3527     ArgOffset = -4 * (ARM::R4 - RBegin);
3528 
3529   auto PtrVT = getPointerTy(DAG.getDataLayout());
3530   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
3531   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
3532 
3533   SmallVector<SDValue, 4> MemOps;
3534   const TargetRegisterClass *RC =
3535       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
3536 
3537   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3538     unsigned VReg = MF.addLiveIn(Reg, RC);
3539     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3540     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
3541                                  MachinePointerInfo(OrigArg, 4 * i));
3542     MemOps.push_back(Store);
3543     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3544   }
3545 
3546   if (!MemOps.empty())
3547     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3548   return FrameIndex;
3549 }
3550 
3551 // Setup stack frame, the va_list pointer will start from.
3552 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3553                                              const SDLoc &dl, SDValue &Chain,
3554                                              unsigned ArgOffset,
3555                                              unsigned TotalArgRegsSaveSize,
3556                                              bool ForceMutable) const {
3557   MachineFunction &MF = DAG.getMachineFunction();
3558   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3559 
3560   // Try to store any remaining integer argument regs
3561   // to their spots on the stack so that they may be loaded by dereferencing
3562   // the result of va_next.
3563   // If there is no regs to be stored, just point address after last
3564   // argument passed via stack.
3565   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3566                                   CCInfo.getInRegsParamsCount(),
3567                                   CCInfo.getNextStackOffset(), 4);
3568   AFI->setVarArgsFrameIndex(FrameIndex);
3569 }
3570 
/// Lower the incoming (formal) arguments of the current function.
///
/// Every location assigned by the calling convention is turned into a DAG
/// value appended to \p InVals:
///  - register locations become CopyFromReg nodes (f64/v2f64 values that were
///    split across locations are reassembled via GetF64FormalArgument);
///  - byval arguments become the frame index returned by StoreByValRegs;
///  - all other stack locations become loads from fixed stack objects.
/// The function also records the argument-register save size and the argument
/// stack size on ARMFunctionInfo, and calls VarArgStyleRegisters for variadic
/// functions whose frame info reports a va_start.
///
/// \returns the chain, which may have been replaced through the by-reference
/// Chain argument of StoreByValRegs / VarArgStyleRegisters.
3571 SDValue ARMTargetLowering::LowerFormalArguments(
3572     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3573     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3574     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3575   MachineFunction &MF = DAG.getMachineFunction();
3576   MachineFrameInfo &MFI = MF.getFrameInfo();
3577 
3578   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3579 
3580   // Assign locations to all of the incoming arguments.
3581   SmallVector<CCValAssign, 16> ArgLocs;
3582   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3583                     *DAG.getContext(), Prologue);
3584   CCInfo.AnalyzeFormalArguments(Ins,
3585                                 CCAssignFnForNode(CallConv, /* Return*/ false,
3586                                                   isVarArg));
3587 
  // NOTE(review): ArgValues is not referenced again in this function.
3588   SmallVector<SDValue, 16> ArgValues;
3589   SDValue ArgValue;
  // CurOrigArg / CurArgIdx track the IR argument matching the location being
  // processed; they are advanced together in the main loop below.
3590   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
3591   unsigned CurArgIdx = 0;
3592 
3593   // Initially ArgRegsSaveSize is zero.
3594   // Then we increase this value each time we meet byval parameter.
3595   // We also increase this value in case of varargs function.
3596   AFI->setArgRegsSaveSize(0);
3597 
3598   // Calculate the amount of stack space that we need to allocate to store
3599   // byval and variadic arguments that are passed in registers.
3600   // We need to know this before we allocate the first byval or variadic
3601   // argument, as they will be allocated a stack slot below the CFA (Canonical
3602   // Frame Address, the stack pointer at entry to the function).
  // ArgRegBegin == R4 means "no argument register has to be saved"; the scans
  // below lower it to the first register that must be spilled.
3603   unsigned ArgRegBegin = ARM::R4;
3604   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // Stop as soon as every in-register byval record has been visited.
3605     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3606       break;
3607 
3608     CCValAssign &VA = ArgLocs[i];
3609     unsigned Index = VA.getValNo();
3610     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3611     if (!Flags.isByVal())
3612       continue;
3613 
3614     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3615     unsigned RBegin, REnd;
3616     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3617     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3618 
3619     CCInfo.nextInRegsParam();
3620   }
  // Presumably resets the in-regs record cursor advanced by nextInRegsParam()
  // above, so the main loop can walk the records again — confirm in CCState.
3621   CCInfo.rewindByValRegsInfo();
3622 
  // Ins[] index of the last stack-located argument handled by the main loop
  // (-1 while none has been handled).
3623   int lastInsIndex = -1;
3624   if (isVarArg && MFI.hasVAStart()) {
3625     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3626     if (RegIdx != array_lengthof(GPRArgRegs))
3627       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3628   }
3629 
  // Four bytes are reserved for each register from ArgRegBegin up to (not
  // including) R4.
3630   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3631   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3632   auto PtrVT = getPointerTy(DAG.getDataLayout());
3633 
3634   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3635     CCValAssign &VA = ArgLocs[i];
    // Move CurOrigArg to the IR argument this location originates from; inputs
    // that are not original arguments leave the iterator where it is.
3636     if (Ins[VA.getValNo()].isOrigArg()) {
3637       std::advance(CurOrigArg,
3638                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3639       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3640     }
3641     // Arguments stored in registers.
3642     if (VA.isRegLoc()) {
3643       EVT RegVT = VA.getLocVT();
3644 
3645       if (VA.needsCustom()) {
3646         // f64 and vector types are split up into multiple registers or
3647         // combinations of registers and stack slots.
3648         if (VA.getLocVT() == MVT::v2f64) {
          // Each f64 half of the vector is handled separately; every ++i
          // consumes one more entry of ArgLocs.
3649           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3650                                                    Chain, DAG, dl);
          // NOTE(review): VA is a reference into ArgLocs, so this assignment
          // copies the next location over the entry VA refers to instead of
          // rebinding VA — confirm the overwritten entry is not needed later.
3651           VA = ArgLocs[++i]; // skip ahead to next loc
3652           SDValue ArgValue2;
3653           if (VA.isMemLoc()) {
3654             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
3655             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3656             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
3657                                     MachinePointerInfo::getFixedStack(
3658                                         DAG.getMachineFunction(), FI));
3659           } else {
3660             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3661                                              Chain, DAG, dl);
3662           }
          // Build the v2f64 by inserting the two halves into an undef vector.
3663           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3664           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3665                                  ArgValue, ArgValue1,
3666                                  DAG.getIntPtrConstant(0, dl));
3667           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3668                                  ArgValue, ArgValue2,
3669                                  DAG.getIntPtrConstant(1, dl));
3670         } else
3671           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3672 
3673       } else {
3674         const TargetRegisterClass *RC;
3675 
        // Pick the register class matching the location's value type.
3676         if (RegVT == MVT::f32)
3677           RC = &ARM::SPRRegClass;
3678         else if (RegVT == MVT::f64)
3679           RC = &ARM::DPRRegClass;
3680         else if (RegVT == MVT::v2f64)
3681           RC = &ARM::QPRRegClass;
3682         else if (RegVT == MVT::i32)
3683           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3684                                            : &ARM::GPRRegClass;
3685         else
3686           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3687 
3688         // Transform the arguments in physical registers into virtual ones.
3689         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3690         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3691       }
3692 
3693       // If this is an 8 or 16-bit value, it is really passed promoted
3694       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3695       // truncate to the right size.
3696       switch (VA.getLocInfo()) {
3697       default: llvm_unreachable("Unknown loc info!");
3698       case CCValAssign::Full: break;
3699       case CCValAssign::BCvt:
3700         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3701         break;
3702       case CCValAssign::SExt:
3703         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3704                                DAG.getValueType(VA.getValVT()));
3705         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3706         break;
3707       case CCValAssign::ZExt:
3708         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3709                                DAG.getValueType(VA.getValVT()));
3710         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3711         break;
3712       }
3713 
3714       InVals.push_back(ArgValue);
3715 
3716     } else { // VA.isRegLoc()
3717 
3718       // sanity check
3719       assert(VA.isMemLoc());
3720       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3721 
3722       int index = VA.getValNo();
3723 
3724       // Some Ins[] entries become multiple ArgLoc[] entries.
3725       // Process them only once.
3726       if (index != lastInsIndex)
3727         {
3728           ISD::ArgFlagsTy Flags = Ins[index].Flags;
3729           // FIXME: For now, all byval parameter objects are marked mutable.
3730           // This can be changed with more analysis.
3731           // In case of tail call optimization mark all arguments mutable.
3732           // Since they could be overwritten by lowering of arguments in case of
3733           // a tail call.
3734           if (Flags.isByVal()) {
3735             assert(Ins[index].isOrigArg() &&
3736                    "Byval arguments cannot be implicit");
3737             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3738 
            // The byval argument is represented by the frame index of the
            // object StoreByValRegs creates for it.
3739             int FrameIndex = StoreByValRegs(
3740                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
3741                 VA.getLocMemOffset(), Flags.getByValSize());
3742             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3743             CCInfo.nextInRegsParam();
3744           } else {
3745             unsigned FIOffset = VA.getLocMemOffset();
3746             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3747                                            FIOffset, true);
3748 
3749             // Create load nodes to retrieve arguments from the stack.
3750             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3751             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3752                                          MachinePointerInfo::getFixedStack(
3753                                              DAG.getMachineFunction(), FI)));
3754           }
3755           lastInsIndex = index;
3756         }
3757     }
3758   }
3759 
3760   // varargs
3761   if (isVarArg && MFI.hasVAStart())
3762     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3763                          CCInfo.getNextStackOffset(),
3764                          TotalArgRegsSaveSize);
3765 
3766   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
3767 
3768   return Chain;
3769 }
3770 
3771 /// isFloatingPointZero - Return true if this is +0.0.
3772 static bool isFloatingPointZero(SDValue Op) {
3773   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3774     return CFP->getValueAPF().isPosZero();
3775   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3776     // Maybe this has already been legalized into the constant pool?
3777     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3778       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3779       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3780         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3781           return CFP->getValueAPF().isPosZero();
3782     }
3783   } else if (Op->getOpcode() == ISD::BITCAST &&
3784              Op->getValueType(0) == MVT::f64) {
3785     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
3786     // created by LowerConstantFP().
3787     SDValue BitcastOp = Op->getOperand(0);
3788     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
3789         isNullConstant(BitcastOp->getOperand(0)))
3790       return true;
3791   }
3792   return false;
3793 }
3794 
3795 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
3796 /// the given operands.
3797 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3798                                      SDValue &ARMcc, SelectionDAG &DAG,
3799                                      const SDLoc &dl) const {
3800   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3801     unsigned C = RHSC->getZExtValue();
3802     if (!isLegalICmpImmediate(C)) {
3803       // Constant does not fit, try adjusting it by one?
3804       switch (CC) {
3805       default: break;
3806       case ISD::SETLT:
3807       case ISD::SETGE:
3808         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3809           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3810           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3811         }
3812         break;
3813       case ISD::SETULT:
3814       case ISD::SETUGE:
3815         if (C != 0 && isLegalICmpImmediate(C-1)) {
3816           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3817           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3818         }
3819         break;
3820       case ISD::SETLE:
3821       case ISD::SETGT:
3822         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3823           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3824           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3825         }
3826         break;
3827       case ISD::SETULE:
3828       case ISD::SETUGT:
3829         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3830           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3831           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3832         }
3833         break;
3834       }
3835     }
3836   }
3837 
3838   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3839   ARMISD::NodeType CompareType;
3840   switch (CondCode) {
3841   default:
3842     CompareType = ARMISD::CMP;
3843     break;
3844   case ARMCC::EQ:
3845   case ARMCC::NE:
3846     // Uses only Z Flag
3847     CompareType = ARMISD::CMPZ;
3848     break;
3849   }
3850   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3851   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3852 }
3853 
3854 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3855 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
3856                                      SelectionDAG &DAG, const SDLoc &dl) const {
3857   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3858   SDValue Cmp;
3859   if (!isFloatingPointZero(RHS))
3860     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3861   else
3862     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3863   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3864 }
3865 
3866 /// duplicateCmp - Glue values can have only one use, so this function
3867 /// duplicates a comparison node.
3868 SDValue
3869 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3870   unsigned Opc = Cmp.getOpcode();
3871   SDLoc DL(Cmp);
3872   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3873     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3874 
3875   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3876   Cmp = Cmp.getOperand(0);
3877   Opc = Cmp.getOpcode();
3878   if (Opc == ARMISD::CMPFP)
3879     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3880   else {
3881     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3882     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3883   }
3884   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3885 }
3886 
3887 std::pair<SDValue, SDValue>
3888 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
3889                                  SDValue &ARMcc) const {
3890   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
3891 
3892   SDValue Value, OverflowCmp;
3893   SDValue LHS = Op.getOperand(0);
3894   SDValue RHS = Op.getOperand(1);
3895   SDLoc dl(Op);
3896 
3897   // FIXME: We are currently always generating CMPs because we don't support
3898   // generating CMN through the backend. This is not as good as the natural
3899   // CMP case because it causes a register dependency and cannot be folded
3900   // later.
3901 
3902   switch (Op.getOpcode()) {
3903   default:
3904     llvm_unreachable("Unknown overflow instruction!");
3905   case ISD::SADDO:
3906     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3907     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3908     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3909     break;
3910   case ISD::UADDO:
3911     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3912     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3913     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3914     break;
3915   case ISD::SSUBO:
3916     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3917     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3918     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3919     break;
3920   case ISD::USUBO:
3921     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3922     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3923     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3924     break;
3925   } // switch (...)
3926 
3927   return std::make_pair(Value, OverflowCmp);
3928 }
3929 
3930 
3931 SDValue
3932 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
3933   // Let legalize expand this if it isn't a legal type yet.
3934   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3935     return SDValue();
3936 
3937   SDValue Value, OverflowCmp;
3938   SDValue ARMcc;
3939   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
3940   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3941   SDLoc dl(Op);
3942   // We use 0 and 1 as false and true values.
3943   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3944   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3945   EVT VT = Op.getValueType();
3946 
3947   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
3948                                  ARMcc, CCR, OverflowCmp);
3949 
3950   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3951   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3952 }
3953 
3954 
3955 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3956   SDValue Cond = Op.getOperand(0);
3957   SDValue SelectTrue = Op.getOperand(1);
3958   SDValue SelectFalse = Op.getOperand(2);
3959   SDLoc dl(Op);
3960   unsigned Opc = Cond.getOpcode();
3961 
3962   if (Cond.getResNo() == 1 &&
3963       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3964        Opc == ISD::USUBO)) {
3965     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
3966       return SDValue();
3967 
3968     SDValue Value, OverflowCmp;
3969     SDValue ARMcc;
3970     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
3971     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3972     EVT VT = Op.getValueType();
3973 
3974     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
3975                    OverflowCmp, DAG);
3976   }
3977 
3978   // Convert:
3979   //
3980   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3981   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3982   //
3983   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3984     const ConstantSDNode *CMOVTrue =
3985       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3986     const ConstantSDNode *CMOVFalse =
3987       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3988 
3989     if (CMOVTrue && CMOVFalse) {
3990       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3991       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3992 
3993       SDValue True;
3994       SDValue False;
3995       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3996         True = SelectTrue;
3997         False = SelectFalse;
3998       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3999         True = SelectFalse;
4000         False = SelectTrue;
4001       }
4002 
4003       if (True.getNode() && False.getNode()) {
4004         EVT VT = Op.getValueType();
4005         SDValue ARMcc = Cond.getOperand(2);
4006         SDValue CCR = Cond.getOperand(3);
4007         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
4008         assert(True.getValueType() == VT);
4009         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
4010       }
4011     }
4012   }
4013 
4014   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4015   // undefined bits before doing a full-word comparison with zero.
4016   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4017                      DAG.getConstant(1, dl, Cond.getValueType()));
4018 
4019   return DAG.getSelectCC(dl, Cond,
4020                          DAG.getConstant(0, dl, Cond.getValueType()),
4021                          SelectTrue, SelectFalse, ISD::SETNE);
4022 }
4023 
4024 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4025                                  bool &swpCmpOps, bool &swpVselOps) {
4026   // Start by selecting the GE condition code for opcodes that return true for
4027   // 'equality'
4028   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4029       CC == ISD::SETULE)
4030     CondCode = ARMCC::GE;
4031 
4032   // and GT for opcodes that return false for 'equality'.
4033   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4034            CC == ISD::SETULT)
4035     CondCode = ARMCC::GT;
4036 
4037   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4038   // to swap the compare operands.
4039   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4040       CC == ISD::SETULT)
4041     swpCmpOps = true;
4042 
4043   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4044   // If we have an unordered opcode, we need to swap the operands to the VSEL
4045   // instruction (effectively negating the condition).
4046   //
4047   // This also has the effect of swapping which one of 'less' or 'greater'
4048   // returns true, so we also swap the compare operands. It also switches
4049   // whether we return true for 'equality', so we compensate by picking the
4050   // opposite condition code to our original choice.
4051   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4052       CC == ISD::SETUGT) {
4053     swpCmpOps = !swpCmpOps;
4054     swpVselOps = !swpVselOps;
4055     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4056   }
4057 
4058   // 'ordered' is 'anything but unordered', so use the VS condition code and
4059   // swap the VSEL operands.
4060   if (CC == ISD::SETO) {
4061     CondCode = ARMCC::VS;
4062     swpVselOps = true;
4063   }
4064 
4065   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4066   // code and swap the VSEL operands.
4067   if (CC == ISD::SETUNE) {
4068     CondCode = ARMCC::EQ;
4069     swpVselOps = true;
4070   }
4071 }
4072 
4073 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4074                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4075                                    SDValue Cmp, SelectionDAG &DAG) const {
4076   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
4077     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4078                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4079     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4080                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4081 
4082     SDValue TrueLow = TrueVal.getValue(0);
4083     SDValue TrueHigh = TrueVal.getValue(1);
4084     SDValue FalseLow = FalseVal.getValue(0);
4085     SDValue FalseHigh = FalseVal.getValue(1);
4086 
4087     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4088                               ARMcc, CCR, Cmp);
4089     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4090                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4091 
4092     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4093   } else {
4094     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4095                        Cmp);
4096   }
4097 }
4098 
4099 static bool isGTorGE(ISD::CondCode CC) {
4100   return CC == ISD::SETGT || CC == ISD::SETGE;
4101 }
4102 
4103 static bool isLTorLE(ISD::CondCode CC) {
4104   return CC == ISD::SETLT || CC == ISD::SETLE;
4105 }
4106 
4107 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4108 // All of these conditions (and their <= and >= counterparts) will do:
4109 //          x < k ? k : x
4110 //          x > k ? x : k
4111 //          k < x ? x : k
4112 //          k > x ? k : x
4113 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4114                             const SDValue TrueVal, const SDValue FalseVal,
4115                             const ISD::CondCode CC, const SDValue K) {
4116   return (isGTorGE(CC) &&
4117           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4118          (isLTorLE(CC) &&
4119           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4120 }
4121 
4122 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4123 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4124                             const SDValue TrueVal, const SDValue FalseVal,
4125                             const ISD::CondCode CC, const SDValue K) {
4126   return (isGTorGE(CC) &&
4127           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4128          (isLTorLE(CC) &&
4129           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4130 }
4131 
4132 // Check if two chained conditionals could be converted into SSAT.
4133 //
4134 // SSAT can replace a set of two conditional selectors that bound a number to an
4135 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
4136 //
4137 //     x < -k ? -k : (x > k ? k : x)
4138 //     x < -k ? -k : (x < k ? x : k)
4139 //     x > -k ? (x > k ? k : x) : -k
4140 //     x < k ? (x < -k ? -k : x) : k
4141 //     etc.
4142 //
4143 // It returns true if the conversion can be done, false otherwise.
4144 // Additionally, the variable is returned in parameter V and the constant in K.
4145 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
4146                                     uint64_t &K) {
4147 
4148   SDValue LHS1 = Op.getOperand(0);
4149   SDValue RHS1 = Op.getOperand(1);
4150   SDValue TrueVal1 = Op.getOperand(2);
4151   SDValue FalseVal1 = Op.getOperand(3);
4152   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4153 
4154   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
4155   if (Op2.getOpcode() != ISD::SELECT_CC)
4156     return false;
4157 
4158   SDValue LHS2 = Op2.getOperand(0);
4159   SDValue RHS2 = Op2.getOperand(1);
4160   SDValue TrueVal2 = Op2.getOperand(2);
4161   SDValue FalseVal2 = Op2.getOperand(3);
4162   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
4163 
4164   // Find out which are the constants and which are the variables
4165   // in each conditional
4166   SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
4167                                                         ? &RHS1
4168                                                         : NULL;
4169   SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
4170                                                         ? &RHS2
4171                                                         : NULL;
4172   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
4173   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
4174   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
4175   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
4176 
4177   // We must detect cases where the original operations worked with 16- or
4178   // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
4179   // must work with sign-extended values but the select operations return
4180   // the original non-extended value.
4181   SDValue V2TmpReg = V2Tmp;
4182   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
4183     V2TmpReg = V2Tmp->getOperand(0);
4184 
4185   // Check that the registers and the constants have the correct values
4186   // in both conditionals
4187   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
4188       V2TmpReg != V2)
4189     return false;
4190 
4191   // Figure out which conditional is saturating the lower/upper bound.
4192   const SDValue *LowerCheckOp =
4193       isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4194           ? &Op
4195           : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
4196                                                                        : NULL;
4197   const SDValue *UpperCheckOp =
4198       isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4199           ? &Op
4200           : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
4201                                                                        : NULL;
4202 
4203   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
4204     return false;
4205 
4206   // Check that the constant in the lower-bound check is
4207   // the opposite of the constant in the upper-bound check
4208   // in 1's complement.
4209   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
4210   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
4211   int64_t PosVal = std::max(Val1, Val2);
4212 
4213   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
4214        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
4215       Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
4216 
4217     V = V2;
4218     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
4219     return true;
4220   }
4221 
4222   return false;
4223 }
4224 
4225 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
4226 
4227   EVT VT = Op.getValueType();
4228   SDLoc dl(Op);
4229 
4230   // Try to convert two saturating conditional selects into a single SSAT
4231   SDValue SatValue;
4232   uint64_t SatConstant;
4233   if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
4234       isSaturatingConditional(Op, SatValue, SatConstant))
4235     return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
4236                        DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
4237 
4238   SDValue LHS = Op.getOperand(0);
4239   SDValue RHS = Op.getOperand(1);
4240   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4241   SDValue TrueVal = Op.getOperand(2);
4242   SDValue FalseVal = Op.getOperand(3);
4243 
4244   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4245     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4246                                                     dl);
4247 
4248     // If softenSetCCOperands only returned one value, we should compare it to
4249     // zero.
4250     if (!RHS.getNode()) {
4251       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4252       CC = ISD::SETNE;
4253     }
4254   }
4255 
4256   if (LHS.getValueType() == MVT::i32) {
4257     // Try to generate VSEL on ARMv8.
4258     // The VSEL instruction can't use all the usual ARM condition
4259     // codes: it only has two bits to select the condition code, so it's
4260     // constrained to use only GE, GT, VS and EQ.
4261     //
4262     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
4263     // swap the operands of the previous compare instruction (effectively
4264     // inverting the compare condition, swapping 'less' and 'greater') and
4265     // sometimes need to swap the operands to the VSEL (which inverts the
4266     // condition in the sense of firing whenever the previous condition didn't)
4267     if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4268                                     TrueVal.getValueType() == MVT::f64)) {
4269       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4270       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
4271           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
4272         CC = ISD::getSetCCInverse(CC, true);
4273         std::swap(TrueVal, FalseVal);
4274       }
4275     }
4276 
4277     SDValue ARMcc;
4278     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4279     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4280     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4281   }
4282 
4283   ARMCC::CondCodes CondCode, CondCode2;
4284   FPCCToARMCC(CC, CondCode, CondCode2);
4285 
4286   // Try to generate VMAXNM/VMINNM on ARMv8.
4287   if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4288                                   TrueVal.getValueType() == MVT::f64)) {
4289     bool swpCmpOps = false;
4290     bool swpVselOps = false;
4291     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
4292 
4293     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
4294         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
4295       if (swpCmpOps)
4296         std::swap(LHS, RHS);
4297       if (swpVselOps)
4298         std::swap(TrueVal, FalseVal);
4299     }
4300   }
4301 
4302   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4303   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4304   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4305   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4306   if (CondCode2 != ARMCC::AL) {
4307     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
4308     // FIXME: Needs another CMP because flag can have but one use.
4309     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
4310     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
4311   }
4312   return Result;
4313 }
4314 
4315 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4316 /// to morph to an integer compare sequence.
4317 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4318                            const ARMSubtarget *Subtarget) {
4319   SDNode *N = Op.getNode();
4320   if (!N->hasOneUse())
4321     // Otherwise it requires moving the value from fp to integer registers.
4322     return false;
4323   if (!N->getNumValues())
4324     return false;
4325   EVT VT = Op.getValueType();
4326   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4327     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4328     // vmrs are very slow, e.g. cortex-a8.
4329     return false;
4330 
4331   if (isFloatingPointZero(Op)) {
4332     SeenZero = true;
4333     return true;
4334   }
4335   return ISD::isNormalLoad(N);
4336 }
4337 
4338 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4339   if (isFloatingPointZero(Op))
4340     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4341 
4342   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4343     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4344                        Ld->getPointerInfo(), Ld->getAlignment(),
4345                        Ld->getMemOperand()->getFlags());
4346 
4347   llvm_unreachable("Unknown VFP cmp argument!");
4348 }
4349 
4350 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
4351                            SDValue &RetVal1, SDValue &RetVal2) {
4352   SDLoc dl(Op);
4353 
4354   if (isFloatingPointZero(Op)) {
4355     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
4356     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
4357     return;
4358   }
4359 
4360   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
4361     SDValue Ptr = Ld->getBasePtr();
4362     RetVal1 =
4363         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
4364                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
4365 
4366     EVT PtrType = Ptr.getValueType();
4367     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
4368     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
4369                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
4370     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
4371                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
4372                           Ld->getMemOperand()->getFlags());
4373     return;
4374   }
4375 
4376   llvm_unreachable("Unknown VFP cmp argument!");
4377 }
4378 
4379 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
4380 /// f32 and even f64 comparisons to integer ones.
4381 SDValue
4382 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
4383   SDValue Chain = Op.getOperand(0);
4384   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4385   SDValue LHS = Op.getOperand(2);
4386   SDValue RHS = Op.getOperand(3);
4387   SDValue Dest = Op.getOperand(4);
4388   SDLoc dl(Op);
4389 
4390   bool LHSSeenZero = false;
4391   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
4392   bool RHSSeenZero = false;
4393   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
4394   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
4395     // If unsafe fp math optimization is enabled and there are no other uses of
4396     // the CMP operands, and the condition code is EQ or NE, we can optimize it
4397     // to an integer comparison.
4398     if (CC == ISD::SETOEQ)
4399       CC = ISD::SETEQ;
4400     else if (CC == ISD::SETUNE)
4401       CC = ISD::SETNE;
4402 
4403     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4404     SDValue ARMcc;
4405     if (LHS.getValueType() == MVT::f32) {
4406       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4407                         bitcastf32Toi32(LHS, DAG), Mask);
4408       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4409                         bitcastf32Toi32(RHS, DAG), Mask);
4410       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4411       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4412       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4413                          Chain, Dest, ARMcc, CCR, Cmp);
4414     }
4415 
4416     SDValue LHS1, LHS2;
4417     SDValue RHS1, RHS2;
4418     expandf64Toi32(LHS, DAG, LHS1, LHS2);
4419     expandf64Toi32(RHS, DAG, RHS1, RHS2);
4420     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
4421     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
4422     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4423     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4424     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4425     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
4426     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
4427   }
4428 
4429   return SDValue();
4430 }
4431 
4432 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
4433   SDValue Chain = Op.getOperand(0);
4434   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4435   SDValue LHS = Op.getOperand(2);
4436   SDValue RHS = Op.getOperand(3);
4437   SDValue Dest = Op.getOperand(4);
4438   SDLoc dl(Op);
4439 
4440   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4441     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4442                                                     dl);
4443 
4444     // If softenSetCCOperands only returned one value, we should compare it to
4445     // zero.
4446     if (!RHS.getNode()) {
4447       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4448       CC = ISD::SETNE;
4449     }
4450   }
4451 
4452   if (LHS.getValueType() == MVT::i32) {
4453     SDValue ARMcc;
4454     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4455     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4456     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4457                        Chain, Dest, ARMcc, CCR, Cmp);
4458   }
4459 
4460   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
4461 
4462   if (getTargetMachine().Options.UnsafeFPMath &&
4463       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
4464        CC == ISD::SETNE || CC == ISD::SETUNE)) {
4465     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
4466       return Result;
4467   }
4468 
4469   ARMCC::CondCodes CondCode, CondCode2;
4470   FPCCToARMCC(CC, CondCode, CondCode2);
4471 
4472   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4473   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4474   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4475   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4476   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
4477   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4478   if (CondCode2 != ARMCC::AL) {
4479     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
4480     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
4481     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4482   }
4483   return Res;
4484 }
4485 
4486 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
4487   SDValue Chain = Op.getOperand(0);
4488   SDValue Table = Op.getOperand(1);
4489   SDValue Index = Op.getOperand(2);
4490   SDLoc dl(Op);
4491 
4492   EVT PTy = getPointerTy(DAG.getDataLayout());
4493   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
4494   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
4495   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
4496   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
4497   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
4498   if (Subtarget->isThumb2()) {
4499     // Thumb2 uses a two-level jump. That is, it jumps into the jump table
4500     // which does another jump to the destination. This also makes it easier
4501     // to translate it to TBB / TBH later.
4502     // FIXME: This might not work if the function is extremely large.
4503     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
4504                        Addr, Op.getOperand(2), JTI);
4505   }
4506   if (isPositionIndependent() || Subtarget->isROPI()) {
4507     Addr =
4508         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
4509                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4510     Chain = Addr.getValue(1);
4511     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
4512     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4513   } else {
4514     Addr =
4515         DAG.getLoad(PTy, dl, Chain, Addr,
4516                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4517     Chain = Addr.getValue(1);
4518     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4519   }
4520 }
4521 
4522 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
4523   EVT VT = Op.getValueType();
4524   SDLoc dl(Op);
4525 
4526   if (Op.getValueType().getVectorElementType() == MVT::i32) {
4527     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
4528       return Op;
4529     return DAG.UnrollVectorOp(Op.getNode());
4530   }
4531 
4532   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
4533          "Invalid type for custom lowering!");
4534   if (VT != MVT::v4i16)
4535     return DAG.UnrollVectorOp(Op.getNode());
4536 
4537   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
4538   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
4539 }
4540 
4541 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
4542   EVT VT = Op.getValueType();
4543   if (VT.isVector())
4544     return LowerVectorFP_TO_INT(Op, DAG);
4545   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4546     RTLIB::Libcall LC;
4547     if (Op.getOpcode() == ISD::FP_TO_SINT)
4548       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4549                               Op.getValueType());
4550     else
4551       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4552                               Op.getValueType());
4553     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4554                        /*isSigned*/ false, SDLoc(Op)).first;
4555   }
4556 
4557   return Op;
4558 }
4559 
4560 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4561   EVT VT = Op.getValueType();
4562   SDLoc dl(Op);
4563 
4564   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
4565     if (VT.getVectorElementType() == MVT::f32)
4566       return Op;
4567     return DAG.UnrollVectorOp(Op.getNode());
4568   }
4569 
4570   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
4571          "Invalid type for custom lowering!");
4572   if (VT != MVT::v4f32)
4573     return DAG.UnrollVectorOp(Op.getNode());
4574 
4575   unsigned CastOpc;
4576   unsigned Opc;
4577   switch (Op.getOpcode()) {
4578   default: llvm_unreachable("Invalid opcode!");
4579   case ISD::SINT_TO_FP:
4580     CastOpc = ISD::SIGN_EXTEND;
4581     Opc = ISD::SINT_TO_FP;
4582     break;
4583   case ISD::UINT_TO_FP:
4584     CastOpc = ISD::ZERO_EXTEND;
4585     Opc = ISD::UINT_TO_FP;
4586     break;
4587   }
4588 
4589   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
4590   return DAG.getNode(Opc, dl, VT, Op);
4591 }
4592 
4593 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4594   EVT VT = Op.getValueType();
4595   if (VT.isVector())
4596     return LowerVectorINT_TO_FP(Op, DAG);
4597   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4598     RTLIB::Libcall LC;
4599     if (Op.getOpcode() == ISD::SINT_TO_FP)
4600       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4601                               Op.getValueType());
4602     else
4603       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4604                               Op.getValueType());
4605     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4606                        /*isSigned*/ false, SDLoc(Op)).first;
4607   }
4608 
4609   return Op;
4610 }
4611 
4612 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
4613   // Implement fcopysign with a fabs and a conditional fneg.
4614   SDValue Tmp0 = Op.getOperand(0);
4615   SDValue Tmp1 = Op.getOperand(1);
4616   SDLoc dl(Op);
4617   EVT VT = Op.getValueType();
4618   EVT SrcVT = Tmp1.getValueType();
4619   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
4620     Tmp0.getOpcode() == ARMISD::VMOVDRR;
4621   bool UseNEON = !InGPR && Subtarget->hasNEON();
4622 
4623   if (UseNEON) {
4624     // Use VBSL to copy the sign bit.
4625     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
4626     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
4627                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
4628     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
4629     if (VT == MVT::f64)
4630       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4631                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
4632                          DAG.getConstant(32, dl, MVT::i32));
4633     else /*if (VT == MVT::f32)*/
4634       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
4635     if (SrcVT == MVT::f32) {
4636       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
4637       if (VT == MVT::f64)
4638         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4639                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
4640                            DAG.getConstant(32, dl, MVT::i32));
4641     } else if (VT == MVT::f32)
4642       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
4643                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
4644                          DAG.getConstant(32, dl, MVT::i32));
4645     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
4646     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
4647 
4648     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
4649                                             dl, MVT::i32);
4650     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
4651     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
4652                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
4653 
4654     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
4655                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
4656                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
4657     if (VT == MVT::f32) {
4658       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
4659       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
4660                         DAG.getConstant(0, dl, MVT::i32));
4661     } else {
4662       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
4663     }
4664 
4665     return Res;
4666   }
4667 
4668   // Bitcast operand 1 to i32.
4669   if (SrcVT == MVT::f64)
4670     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4671                        Tmp1).getValue(1);
4672   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
4673 
4674   // Or in the signbit with integer operations.
4675   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
4676   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4677   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
4678   if (VT == MVT::f32) {
4679     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
4680                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
4681     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
4682                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
4683   }
4684 
4685   // f64: Or the high part with signbit and then combine two parts.
4686   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4687                      Tmp0);
4688   SDValue Lo = Tmp0.getValue(0);
4689   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
4690   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
4691   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
4692 }
4693 
4694 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
4695   MachineFunction &MF = DAG.getMachineFunction();
4696   MachineFrameInfo &MFI = MF.getFrameInfo();
4697   MFI.setReturnAddressIsTaken(true);
4698 
4699   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
4700     return SDValue();
4701 
4702   EVT VT = Op.getValueType();
4703   SDLoc dl(Op);
4704   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4705   if (Depth) {
4706     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4707     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
4708     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
4709                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
4710                        MachinePointerInfo());
4711   }
4712 
4713   // Return LR, which contains the return address. Mark it an implicit live-in.
4714   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4715   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
4716 }
4717 
4718 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
4719   const ARMBaseRegisterInfo &ARI =
4720     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
4721   MachineFunction &MF = DAG.getMachineFunction();
4722   MachineFrameInfo &MFI = MF.getFrameInfo();
4723   MFI.setFrameAddressIsTaken(true);
4724 
4725   EVT VT = Op.getValueType();
4726   SDLoc dl(Op);  // FIXME probably not meaningful
4727   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4728   unsigned FrameReg = ARI.getFrameRegister(MF);
4729   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
4730   while (Depth--)
4731     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
4732                             MachinePointerInfo());
4733   return FrameAddr;
4734 }
4735 
4736 // FIXME? Maybe this could be a TableGen attribute on some registers and
4737 // this table could be generated automatically from RegInfo.
4738 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
4739                                               SelectionDAG &DAG) const {
4740   unsigned Reg = StringSwitch<unsigned>(RegName)
4741                        .Case("sp", ARM::SP)
4742                        .Default(0);
4743   if (Reg)
4744     return Reg;
4745   report_fatal_error(Twine("Invalid register name \""
4746                               + StringRef(RegName)  + "\"."));
4747 }
4748 
4749 // Result is 64 bit value so split into two 32 bit values and return as a
4750 // pair of values.
4751 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
4752                                 SelectionDAG &DAG) {
4753   SDLoc DL(N);
4754 
4755   // This function is only supposed to be called for i64 type destination.
4756   assert(N->getValueType(0) == MVT::i64
4757           && "ExpandREAD_REGISTER called for non-i64 type result.");
4758 
4759   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
4760                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
4761                              N->getOperand(0),
4762                              N->getOperand(1));
4763 
4764   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
4765                     Read.getValue(1)));
4766   Results.push_back(Read.getOperand(0));
4767 }
4768 
4769 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
4770 /// When \p DstVT, the destination type of \p BC, is on the vector
4771 /// register bank and the source of bitcast, \p Op, operates on the same bank,
4772 /// it might be possible to combine them, such that everything stays on the
4773 /// vector register bank.
4774 /// \p return The node that would replace \p BT, if the combine
4775 /// is possible.
4776 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
4777                                                 SelectionDAG &DAG) {
4778   SDValue Op = BC->getOperand(0);
4779   EVT DstVT = BC->getValueType(0);
4780 
4781   // The only vector instruction that can produce a scalar (remember,
4782   // since the bitcast was about to be turned into VMOVDRR, the source
4783   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
4784   // Moreover, we can do this combine only if there is one use.
4785   // Finally, if the destination type is not a vector, there is not
4786   // much point on forcing everything on the vector bank.
4787   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4788       !Op.hasOneUse())
4789     return SDValue();
4790 
4791   // If the index is not constant, we will introduce an additional
4792   // multiply that will stick.
4793   // Give up in that case.
4794   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
4795   if (!Index)
4796     return SDValue();
4797   unsigned DstNumElt = DstVT.getVectorNumElements();
4798 
4799   // Compute the new index.
4800   const APInt &APIntIndex = Index->getAPIntValue();
4801   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
4802   NewIndex *= APIntIndex;
4803   // Check if the new constant index fits into i32.
4804   if (NewIndex.getBitWidth() > 32)
4805     return SDValue();
4806 
4807   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
4808   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
4809   SDLoc dl(Op);
4810   SDValue ExtractSrc = Op.getOperand(0);
4811   EVT VecVT = EVT::getVectorVT(
4812       *DAG.getContext(), DstVT.getScalarType(),
4813       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
4814   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
4815   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
4816                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
4817 }
4818 
4819 /// ExpandBITCAST - If the target supports VFP, this function is called to
4820 /// expand a bit convert where either the source or destination type is i64 to
4821 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
4822 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4823 /// vectors), since the legalizer won't know what to do with that.
4824 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4825   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4826   SDLoc dl(N);
4827   SDValue Op = N->getOperand(0);
4828 
4829   // This function is only supposed to be called for i64 types, either as the
4830   // source or destination of the bit convert.
4831   EVT SrcVT = Op.getValueType();
4832   EVT DstVT = N->getValueType(0);
4833   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4834          "ExpandBITCAST called for non-i64 type");
4835 
4836   // Turn i64->f64 into VMOVDRR.
4837   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4838     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
4839     // if we can combine the bitcast with its source.
4840     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
4841       return Val;
4842 
4843     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4844                              DAG.getConstant(0, dl, MVT::i32));
4845     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4846                              DAG.getConstant(1, dl, MVT::i32));
4847     return DAG.getNode(ISD::BITCAST, dl, DstVT,
4848                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
4849   }
4850 
4851   // Turn f64->i64 into VMOVRRD.
4852   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
4853     SDValue Cvt;
4854     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
4855         SrcVT.getVectorNumElements() > 1)
4856       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4857                         DAG.getVTList(MVT::i32, MVT::i32),
4858                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
4859     else
4860       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4861                         DAG.getVTList(MVT::i32, MVT::i32), Op);
4862     // Merge the pieces into a single i64 value.
4863     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
4864   }
4865 
4866   return SDValue();
4867 }
4868 
4869 /// getZeroVector - Returns a vector of specified type with all zero elements.
4870 /// Zero vectors are used to represent vector negation and in those cases
4871 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
4872 /// not support i64 elements, so sometimes the zero vectors will need to be
4873 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
4874 /// zero vector.
4875 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4876   assert(VT.isVector() && "Expected a vector type");
4877   // The canonical modified immediate encoding of a zero vector is....0!
4878   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
4879   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
4880   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
4881   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4882 }
4883 
4884 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
4885 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
4886 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
4887                                                 SelectionDAG &DAG) const {
4888   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4889   EVT VT = Op.getValueType();
4890   unsigned VTBits = VT.getSizeInBits();
4891   SDLoc dl(Op);
4892   SDValue ShOpLo = Op.getOperand(0);
4893   SDValue ShOpHi = Op.getOperand(1);
4894   SDValue ShAmt  = Op.getOperand(2);
4895   SDValue ARMcc;
4896   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4897 
4898   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4899 
4900   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4901                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4902   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4903   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4904                                    DAG.getConstant(VTBits, dl, MVT::i32));
4905   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4906   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4907   SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4908 
4909   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4910   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4911                           ISD::SETGE, ARMcc, DAG, dl);
4912   SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4913   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
4914                            CCR, Cmp);
4915 
4916   SDValue Ops[2] = { Lo, Hi };
4917   return DAG.getMergeValues(Ops, dl);
4918 }
4919 
4920 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4921 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
4922 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
4923                                                SelectionDAG &DAG) const {
4924   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4925   EVT VT = Op.getValueType();
4926   unsigned VTBits = VT.getSizeInBits();
4927   SDLoc dl(Op);
4928   SDValue ShOpLo = Op.getOperand(0);
4929   SDValue ShOpHi = Op.getOperand(1);
4930   SDValue ShAmt  = Op.getOperand(2);
4931   SDValue ARMcc;
4932 
4933   assert(Op.getOpcode() == ISD::SHL_PARTS);
4934   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4935                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4936   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4937   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4938                                    DAG.getConstant(VTBits, dl, MVT::i32));
4939   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4940   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4941 
4942   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4943   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4944   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4945                           ISD::SETGE, ARMcc, DAG, dl);
4946   SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4947   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
4948                            CCR, Cmp);
4949 
4950   SDValue Ops[2] = { Lo, Hi };
4951   return DAG.getMergeValues(Ops, dl);
4952 }
4953 
4954 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4955                                             SelectionDAG &DAG) const {
4956   // The rounding mode is in bits 23:22 of the FPSCR.
4957   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
4958   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
4959   // so that the shift + and get folded into a bitfield extract.
4960   SDLoc dl(Op);
4961   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
4962                               DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
4963                                               MVT::i32));
4964   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
4965                                   DAG.getConstant(1U << 22, dl, MVT::i32));
4966   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4967                               DAG.getConstant(22, dl, MVT::i32));
4968   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4969                      DAG.getConstant(3, dl, MVT::i32));
4970 }
4971 
4972 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
4973                          const ARMSubtarget *ST) {
4974   SDLoc dl(N);
4975   EVT VT = N->getValueType(0);
4976   if (VT.isVector()) {
4977     assert(ST->hasNEON());
4978 
4979     // Compute the least significant set bit: LSB = X & -X
4980     SDValue X = N->getOperand(0);
4981     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
4982     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
4983 
4984     EVT ElemTy = VT.getVectorElementType();
4985 
4986     if (ElemTy == MVT::i8) {
4987       // Compute with: cttz(x) = ctpop(lsb - 1)
4988       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4989                                 DAG.getTargetConstant(1, dl, ElemTy));
4990       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4991       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
4992     }
4993 
4994     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
4995         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
4996       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
4997       unsigned NumBits = ElemTy.getSizeInBits();
4998       SDValue WidthMinus1 =
4999           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5000                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
5001       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
5002       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
5003     }
5004 
5005     // Compute with: cttz(x) = ctpop(lsb - 1)
5006 
5007     // Since we can only compute the number of bits in a byte with vcnt.8, we
5008     // have to gather the result with pairwise addition (vpaddl) for i16, i32,
5009     // and i64.
5010 
5011     // Compute LSB - 1.
5012     SDValue Bits;
5013     if (ElemTy == MVT::i64) {
5014       // Load constant 0xffff'ffff'ffff'ffff to register.
5015       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5016                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
5017       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
5018     } else {
5019       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5020                                 DAG.getTargetConstant(1, dl, ElemTy));
5021       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5022     }
5023 
5024     // Count #bits with vcnt.8.
5025     EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5026     SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
5027     SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
5028 
5029     // Gather the #bits with vpaddl (pairwise add.)
5030     EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5031     SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
5032         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5033         Cnt8);
5034     if (ElemTy == MVT::i16)
5035       return Cnt16;
5036 
5037     EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
5038     SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
5039         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5040         Cnt16);
5041     if (ElemTy == MVT::i32)
5042       return Cnt32;
5043 
5044     assert(ElemTy == MVT::i64);
5045     SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5046         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5047         Cnt32);
5048     return Cnt64;
5049   }
5050 
5051   if (!ST->hasV6T2Ops())
5052     return SDValue();
5053 
5054   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
5055   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
5056 }
5057 
5058 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
5059 /// for each 16-bit element from operand, repeated.  The basic idea is to
5060 /// leverage vcnt to get the 8-bit counts, gather and add the results.
5061 ///
5062 /// Trace for v4i16:
5063 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
5064 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
5065 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
5066 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
5067 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
5068 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
5069 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
5070 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
5071 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
5072   EVT VT = N->getValueType(0);
5073   SDLoc DL(N);
5074 
5075   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5076   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
5077   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
5078   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
5079   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
5080   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
5081 }
5082 
5083 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
5084 /// bit-count for each 16-bit element from the operand.  We need slightly
5085 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
5086 /// 64/128-bit registers.
5087 ///
5088 /// Trace for v4i16:
5089 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
5090 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
5091 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
5092 /// v4i16:Extracted = [k0    k1    k2    k3    ]
5093 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
5094   EVT VT = N->getValueType(0);
5095   SDLoc DL(N);
5096 
5097   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
5098   if (VT.is64BitVector()) {
5099     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
5100     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
5101                        DAG.getIntPtrConstant(0, DL));
5102   } else {
5103     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
5104                                     BitCounts, DAG.getIntPtrConstant(0, DL));
5105     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
5106   }
5107 }
5108 
5109 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
5110 /// bit-count for each 32-bit element from the operand.  The idea here is
5111 /// to split the vector into 16-bit elements, leverage the 16-bit count
5112 /// routine, and then combine the results.
5113 ///
5114 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
5115 /// input    = [v0    v1    ] (vi: 32-bit elements)
5116 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
5117 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
5118 /// vrev: N0 = [k1 k0 k3 k2 ]
5119 ///            [k0 k1 k2 k3 ]
5120 ///       N1 =+[k1 k0 k3 k2 ]
5121 ///            [k0 k2 k1 k3 ]
5122 ///       N2 =+[k1 k3 k0 k2 ]
5123 ///            [k0    k2    k1    k3    ]
5124 /// Extended =+[k1    k3    k0    k2    ]
5125 ///            [k0    k2    ]
5126 /// Extracted=+[k1    k3    ]
5127 ///
5128 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
5129   EVT VT = N->getValueType(0);
5130   SDLoc DL(N);
5131 
5132   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5133 
5134   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
5135   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
5136   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
5137   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
5138   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
5139 
5140   if (VT.is64BitVector()) {
5141     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
5142     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
5143                        DAG.getIntPtrConstant(0, DL));
5144   } else {
5145     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
5146                                     DAG.getIntPtrConstant(0, DL));
5147     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
5148   }
5149 }
5150 
5151 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5152                           const ARMSubtarget *ST) {
5153   EVT VT = N->getValueType(0);
5154 
5155   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5156   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
5157           VT == MVT::v4i16 || VT == MVT::v8i16) &&
5158          "Unexpected type for custom ctpop lowering");
5159 
5160   if (VT.getVectorElementType() == MVT::i32)
5161     return lowerCTPOP32BitElements(N, DAG);
5162   else
5163     return lowerCTPOP16BitElements(N, DAG);
5164 }
5165 
5166 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5167                           const ARMSubtarget *ST) {
5168   EVT VT = N->getValueType(0);
5169   SDLoc dl(N);
5170 
5171   if (!VT.isVector())
5172     return SDValue();
5173 
5174   // Lower vector shifts on NEON to use VSHL.
5175   assert(ST->hasNEON() && "unexpected vector shift");
5176 
5177   // Left shifts translate directly to the vshiftu intrinsic.
5178   if (N->getOpcode() == ISD::SHL)
5179     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5180                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
5181                                        MVT::i32),
5182                        N->getOperand(0), N->getOperand(1));
5183 
5184   assert((N->getOpcode() == ISD::SRA ||
5185           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
5186 
5187   // NEON uses the same intrinsics for both left and right shifts.  For
5188   // right shifts, the shift amounts are negative, so negate the vector of
5189   // shift amounts.
5190   EVT ShiftVT = N->getOperand(1).getValueType();
5191   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
5192                                      getZeroVector(ShiftVT, DAG, dl),
5193                                      N->getOperand(1));
5194   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
5195                              Intrinsic::arm_neon_vshifts :
5196                              Intrinsic::arm_neon_vshiftu);
5197   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5198                      DAG.getConstant(vshiftInt, dl, MVT::i32),
5199                      N->getOperand(0), NegatedCount);
5200 }
5201 
5202 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5203                                 const ARMSubtarget *ST) {
5204   EVT VT = N->getValueType(0);
5205   SDLoc dl(N);
5206 
5207   // We can get here for a node like i32 = ISD::SHL i32, i64
5208   if (VT != MVT::i64)
5209     return SDValue();
5210 
5211   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
5212          "Unknown shift to lower!");
5213 
5214   // We only lower SRA, SRL of 1 here, all others use generic lowering.
5215   if (!isOneConstant(N->getOperand(1)))
5216     return SDValue();
5217 
5218   // If we are in thumb mode, we don't have RRX.
5219   if (ST->isThumb1Only()) return SDValue();
5220 
5221   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
5222   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5223                            DAG.getConstant(0, dl, MVT::i32));
5224   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5225                            DAG.getConstant(1, dl, MVT::i32));
5226 
5227   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
5228   // captures the result into a carry flag.
5229   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
5230   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
5231 
5232   // The low part is an ARMISD::RRX operand, which shifts the carry in.
5233   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
5234 
5235   // Merge the pieces into a single i64 value.
5236  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5237 }
5238 
5239 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5240   SDValue TmpOp0, TmpOp1;
5241   bool Invert = false;
5242   bool Swap = false;
5243   unsigned Opc = 0;
5244 
5245   SDValue Op0 = Op.getOperand(0);
5246   SDValue Op1 = Op.getOperand(1);
5247   SDValue CC = Op.getOperand(2);
5248   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
5249   EVT VT = Op.getValueType();
5250   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5251   SDLoc dl(Op);
5252 
5253   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
5254       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
5255     // Special-case integer 64-bit equality comparisons. They aren't legal,
5256     // but they can be lowered with a few vector instructions.
5257     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
5258     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
5259     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
5260     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
5261     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
5262                               DAG.getCondCode(ISD::SETEQ));
5263     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
5264     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
5265     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
5266     if (SetCCOpcode == ISD::SETNE)
5267       Merged = DAG.getNOT(dl, Merged, CmpVT);
5268     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
5269     return Merged;
5270   }
5271 
5272   if (CmpVT.getVectorElementType() == MVT::i64)
5273     // 64-bit comparisons are not legal in general.
5274     return SDValue();
5275 
5276   if (Op1.getValueType().isFloatingPoint()) {
5277     switch (SetCCOpcode) {
5278     default: llvm_unreachable("Illegal FP comparison");
5279     case ISD::SETUNE:
5280     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
5281     case ISD::SETOEQ:
5282     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
5283     case ISD::SETOLT:
5284     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
5285     case ISD::SETOGT:
5286     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
5287     case ISD::SETOLE:
5288     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
5289     case ISD::SETOGE:
5290     case ISD::SETGE: Opc = ARMISD::VCGE; break;
5291     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
5292     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
5293     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
5294     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
5295     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
5296     case ISD::SETONE:
5297       // Expand this to (OLT | OGT).
5298       TmpOp0 = Op0;
5299       TmpOp1 = Op1;
5300       Opc = ISD::OR;
5301       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5302       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
5303       break;
5304     case ISD::SETUO:
5305       Invert = true;
5306       LLVM_FALLTHROUGH;
5307     case ISD::SETO:
5308       // Expand this to (OLT | OGE).
5309       TmpOp0 = Op0;
5310       TmpOp1 = Op1;
5311       Opc = ISD::OR;
5312       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5313       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
5314       break;
5315     }
5316   } else {
5317     // Integer comparisons.
5318     switch (SetCCOpcode) {
5319     default: llvm_unreachable("Illegal integer comparison");
5320     case ISD::SETNE:  Invert = true;
5321     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
5322     case ISD::SETLT:  Swap = true;
5323     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
5324     case ISD::SETLE:  Swap = true;
5325     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
5326     case ISD::SETULT: Swap = true;
5327     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
5328     case ISD::SETULE: Swap = true;
5329     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
5330     }
5331 
5332     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
5333     if (Opc == ARMISD::VCEQ) {
5334 
5335       SDValue AndOp;
5336       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5337         AndOp = Op0;
5338       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
5339         AndOp = Op1;
5340 
5341       // Ignore bitconvert.
5342       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
5343         AndOp = AndOp.getOperand(0);
5344 
5345       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
5346         Opc = ARMISD::VTST;
5347         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
5348         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
5349         Invert = !Invert;
5350       }
5351     }
5352   }
5353 
5354   if (Swap)
5355     std::swap(Op0, Op1);
5356 
5357   // If one of the operands is a constant vector zero, attempt to fold the
5358   // comparison to a specialized compare-against-zero form.
5359   SDValue SingleOp;
5360   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5361     SingleOp = Op0;
5362   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
5363     if (Opc == ARMISD::VCGE)
5364       Opc = ARMISD::VCLEZ;
5365     else if (Opc == ARMISD::VCGT)
5366       Opc = ARMISD::VCLTZ;
5367     SingleOp = Op1;
5368   }
5369 
5370   SDValue Result;
5371   if (SingleOp.getNode()) {
5372     switch (Opc) {
5373     case ARMISD::VCEQ:
5374       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
5375     case ARMISD::VCGE:
5376       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
5377     case ARMISD::VCLEZ:
5378       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
5379     case ARMISD::VCGT:
5380       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
5381     case ARMISD::VCLTZ:
5382       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
5383     default:
5384       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5385     }
5386   } else {
5387      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5388   }
5389 
5390   Result = DAG.getSExtOrTrunc(Result, dl, VT);
5391 
5392   if (Invert)
5393     Result = DAG.getNOT(dl, Result, VT);
5394 
5395   return Result;
5396 }
5397 
5398 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
5399   SDValue LHS = Op.getOperand(0);
5400   SDValue RHS = Op.getOperand(1);
5401   SDValue Carry = Op.getOperand(2);
5402   SDValue Cond = Op.getOperand(3);
5403   SDLoc DL(Op);
5404 
5405   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
5406 
5407   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
5408   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
5409   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
5410 
5411   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
5412   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
5413   SDValue ARMcc = DAG.getConstant(
5414       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
5415   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5416   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
5417                                    Cmp.getValue(1), SDValue());
5418   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
5419                      CCR, Chain.getValue(1));
5420 }
5421 
5422 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
5423 /// valid vector constant for a NEON instruction with a "modified immediate"
5424 /// operand (e.g., VMOV).  If so, return the encoded value.
5425 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
5426                                  unsigned SplatBitSize, SelectionDAG &DAG,
5427                                  const SDLoc &dl, EVT &VT, bool is128Bits,
5428                                  NEONModImmType type) {
5429   unsigned OpCmode, Imm;
5430 
5431   // SplatBitSize is set to the smallest size that splats the vector, so a
5432   // zero vector will always have SplatBitSize == 8.  However, NEON modified
5433   // immediate instructions others than VMOV do not support the 8-bit encoding
5434   // of a zero vector, and the default encoding of zero is supposed to be the
5435   // 32-bit version.
5436   if (SplatBits == 0)
5437     SplatBitSize = 32;
5438 
5439   switch (SplatBitSize) {
5440   case 8:
5441     if (type != VMOVModImm)
5442       return SDValue();
5443     // Any 1-byte value is OK.  Op=0, Cmode=1110.
5444     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
5445     OpCmode = 0xe;
5446     Imm = SplatBits;
5447     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
5448     break;
5449 
5450   case 16:
5451     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
5452     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
5453     if ((SplatBits & ~0xff) == 0) {
5454       // Value = 0x00nn: Op=x, Cmode=100x.
5455       OpCmode = 0x8;
5456       Imm = SplatBits;
5457       break;
5458     }
5459     if ((SplatBits & ~0xff00) == 0) {
5460       // Value = 0xnn00: Op=x, Cmode=101x.
5461       OpCmode = 0xa;
5462       Imm = SplatBits >> 8;
5463       break;
5464     }
5465     return SDValue();
5466 
5467   case 32:
5468     // NEON's 32-bit VMOV supports splat values where:
5469     // * only one byte is nonzero, or
5470     // * the least significant byte is 0xff and the second byte is nonzero, or
5471     // * the least significant 2 bytes are 0xff and the third is nonzero.
5472     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
5473     if ((SplatBits & ~0xff) == 0) {
5474       // Value = 0x000000nn: Op=x, Cmode=000x.
5475       OpCmode = 0;
5476       Imm = SplatBits;
5477       break;
5478     }
5479     if ((SplatBits & ~0xff00) == 0) {
5480       // Value = 0x0000nn00: Op=x, Cmode=001x.
5481       OpCmode = 0x2;
5482       Imm = SplatBits >> 8;
5483       break;
5484     }
5485     if ((SplatBits & ~0xff0000) == 0) {
5486       // Value = 0x00nn0000: Op=x, Cmode=010x.
5487       OpCmode = 0x4;
5488       Imm = SplatBits >> 16;
5489       break;
5490     }
5491     if ((SplatBits & ~0xff000000) == 0) {
5492       // Value = 0xnn000000: Op=x, Cmode=011x.
5493       OpCmode = 0x6;
5494       Imm = SplatBits >> 24;
5495       break;
5496     }
5497 
5498     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
5499     if (type == OtherModImm) return SDValue();
5500 
5501     if ((SplatBits & ~0xffff) == 0 &&
5502         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
5503       // Value = 0x0000nnff: Op=x, Cmode=1100.
5504       OpCmode = 0xc;
5505       Imm = SplatBits >> 8;
5506       break;
5507     }
5508 
5509     if ((SplatBits & ~0xffffff) == 0 &&
5510         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
5511       // Value = 0x00nnffff: Op=x, Cmode=1101.
5512       OpCmode = 0xd;
5513       Imm = SplatBits >> 16;
5514       break;
5515     }
5516 
5517     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
5518     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
5519     // VMOV.I32.  A (very) minor optimization would be to replicate the value
5520     // and fall through here to test for a valid 64-bit splat.  But, then the
5521     // caller would also need to check and handle the change in size.
5522     return SDValue();
5523 
5524   case 64: {
5525     if (type != VMOVModImm)
5526       return SDValue();
5527     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
5528     uint64_t BitMask = 0xff;
5529     uint64_t Val = 0;
5530     unsigned ImmMask = 1;
5531     Imm = 0;
5532     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
5533       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
5534         Val |= BitMask;
5535         Imm |= ImmMask;
5536       } else if ((SplatBits & BitMask) != 0) {
5537         return SDValue();
5538       }
5539       BitMask <<= 8;
5540       ImmMask <<= 1;
5541     }
5542 
5543     if (DAG.getDataLayout().isBigEndian())
5544       // swap higher and lower 32 bit word
5545       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
5546 
5547     // Op=1, Cmode=1110.
5548     OpCmode = 0x1e;
5549     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
5550     break;
5551   }
5552 
5553   default:
5554     llvm_unreachable("unexpected size for isNEONModifiedImm");
5555   }
5556 
5557   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
5558   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
5559 }
5560 
5561 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
5562                                            const ARMSubtarget *ST) const {
5563   if (!ST->hasVFP3())
5564     return SDValue();
5565 
5566   bool IsDouble = Op.getValueType() == MVT::f64;
5567   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
5568 
5569   // Use the default (constant pool) lowering for double constants when we have
5570   // an SP-only FPU
5571   if (IsDouble && Subtarget->isFPOnlySP())
5572     return SDValue();
5573 
5574   // Try splatting with a VMOV.f32...
5575   const APFloat &FPVal = CFP->getValueAPF();
5576   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
5577 
5578   if (ImmVal != -1) {
5579     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
5580       // We have code in place to select a valid ConstantFP already, no need to
5581       // do any mangling.
5582       return Op;
5583     }
5584 
5585     // It's a float and we are trying to use NEON operations where
5586     // possible. Lower it to a splat followed by an extract.
5587     SDLoc DL(Op);
5588     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
5589     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
5590                                       NewVal);
5591     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
5592                        DAG.getConstant(0, DL, MVT::i32));
5593   }
5594 
5595   // The rest of our options are NEON only, make sure that's allowed before
5596   // proceeding..
5597   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
5598     return SDValue();
5599 
5600   EVT VMovVT;
5601   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
5602 
5603   // It wouldn't really be worth bothering for doubles except for one very
5604   // important value, which does happen to match: 0.0. So make sure we don't do
5605   // anything stupid.
5606   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
5607     return SDValue();
5608 
5609   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
5610   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
5611                                      VMovVT, false, VMOVModImm);
5612   if (NewVal != SDValue()) {
5613     SDLoc DL(Op);
5614     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
5615                                       NewVal);
5616     if (IsDouble)
5617       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5618 
5619     // It's a float: cast and extract a vector element.
5620     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5621                                        VecConstant);
5622     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5623                        DAG.getConstant(0, DL, MVT::i32));
5624   }
5625 
5626   // Finally, try a VMVN.i32
5627   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
5628                              false, VMVNModImm);
5629   if (NewVal != SDValue()) {
5630     SDLoc DL(Op);
5631     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
5632 
5633     if (IsDouble)
5634       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5635 
5636     // It's a float: cast and extract a vector element.
5637     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5638                                        VecConstant);
5639     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5640                        DAG.getConstant(0, DL, MVT::i32));
5641   }
5642 
5643   return SDValue();
5644 }
5645 
5646 // check if an VEXT instruction can handle the shuffle mask when the
5647 // vector sources of the shuffle are the same.
5648 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5649   unsigned NumElts = VT.getVectorNumElements();
5650 
5651   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5652   if (M[0] < 0)
5653     return false;
5654 
5655   Imm = M[0];
5656 
5657   // If this is a VEXT shuffle, the immediate value is the index of the first
5658   // element.  The other shuffle indices must be the successive elements after
5659   // the first one.
5660   unsigned ExpectedElt = Imm;
5661   for (unsigned i = 1; i < NumElts; ++i) {
5662     // Increment the expected index.  If it wraps around, just follow it
5663     // back to index zero and keep going.
5664     ++ExpectedElt;
5665     if (ExpectedElt == NumElts)
5666       ExpectedElt = 0;
5667 
5668     if (M[i] < 0) continue; // ignore UNDEF indices
5669     if (ExpectedElt != static_cast<unsigned>(M[i]))
5670       return false;
5671   }
5672 
5673   return true;
5674 }
5675 
5676 
5677 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
5678                        bool &ReverseVEXT, unsigned &Imm) {
5679   unsigned NumElts = VT.getVectorNumElements();
5680   ReverseVEXT = false;
5681 
5682   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5683   if (M[0] < 0)
5684     return false;
5685 
5686   Imm = M[0];
5687 
5688   // If this is a VEXT shuffle, the immediate value is the index of the first
5689   // element.  The other shuffle indices must be the successive elements after
5690   // the first one.
5691   unsigned ExpectedElt = Imm;
5692   for (unsigned i = 1; i < NumElts; ++i) {
5693     // Increment the expected index.  If it wraps around, it may still be
5694     // a VEXT but the source vectors must be swapped.
5695     ExpectedElt += 1;
5696     if (ExpectedElt == NumElts * 2) {
5697       ExpectedElt = 0;
5698       ReverseVEXT = true;
5699     }
5700 
5701     if (M[i] < 0) continue; // ignore UNDEF indices
5702     if (ExpectedElt != static_cast<unsigned>(M[i]))
5703       return false;
5704   }
5705 
5706   // Adjust the index value if the source operands will be swapped.
5707   if (ReverseVEXT)
5708     Imm -= NumElts;
5709 
5710   return true;
5711 }
5712 
5713 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
5714 /// instruction with the specified blocksize.  (The order of the elements
5715 /// within each block of the vector is reversed.)
5716 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5717   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
5718          "Only possible block sizes for VREV are: 16, 32, 64");
5719 
5720   unsigned EltSz = VT.getScalarSizeInBits();
5721   if (EltSz == 64)
5722     return false;
5723 
5724   unsigned NumElts = VT.getVectorNumElements();
5725   unsigned BlockElts = M[0] + 1;
5726   // If the first shuffle index is UNDEF, be optimistic.
5727   if (M[0] < 0)
5728     BlockElts = BlockSize / EltSz;
5729 
5730   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5731     return false;
5732 
5733   for (unsigned i = 0; i < NumElts; ++i) {
5734     if (M[i] < 0) continue; // ignore UNDEF indices
5735     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5736       return false;
5737   }
5738 
5739   return true;
5740 }
5741 
5742 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5743   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5744   // range, then 0 is placed into the resulting vector. So pretty much any mask
5745   // of 8 elements can work here.
5746   return VT == MVT::v8i8 && M.size() == 8;
5747 }
5748 
5749 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5750 // checking that pairs of elements in the shuffle mask represent the same index
5751 // in each vector, incrementing the expected index by 2 at each step.
5752 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5753 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5754 //  v2={e,f,g,h}
5755 // WhichResult gives the offset for each element in the mask based on which
5756 // of the two results it belongs to.
5757 //
5758 // The transpose can be represented either as:
5759 // result1 = shufflevector v1, v2, result1_shuffle_mask
5760 // result2 = shufflevector v1, v2, result2_shuffle_mask
5761 // where v1/v2 and the shuffle masks have the same number of elements
5762 // (here WhichResult (see below) indicates which result is being checked)
5763 //
5764 // or as:
5765 // results = shufflevector v1, v2, shuffle_mask
5766 // where both results are returned in one vector and the shuffle mask has twice
5767 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
5768 // want to check the low half and high half of the shuffle mask as if it were
5769 // the other case
5770 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5771   unsigned EltSz = VT.getScalarSizeInBits();
5772   if (EltSz == 64)
5773     return false;
5774 
5775   unsigned NumElts = VT.getVectorNumElements();
5776   if (M.size() != NumElts && M.size() != NumElts*2)
5777     return false;
5778 
5779   // If the mask is twice as long as the input vector then we need to check the
5780   // upper and lower parts of the mask with a matching value for WhichResult
5781   // FIXME: A mask with only even values will be rejected in case the first
5782   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
5783   // M[0] is used to determine WhichResult
5784   for (unsigned i = 0; i < M.size(); i += NumElts) {
5785     if (M.size() == NumElts * 2)
5786       WhichResult = i / NumElts;
5787     else
5788       WhichResult = M[i] == 0 ? 0 : 1;
5789     for (unsigned j = 0; j < NumElts; j += 2) {
5790       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5791           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
5792         return false;
5793     }
5794   }
5795 
5796   if (M.size() == NumElts*2)
5797     WhichResult = 0;
5798 
5799   return true;
5800 }
5801 
5802 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
5803 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5804 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
5805 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5806   unsigned EltSz = VT.getScalarSizeInBits();
5807   if (EltSz == 64)
5808     return false;
5809 
5810   unsigned NumElts = VT.getVectorNumElements();
5811   if (M.size() != NumElts && M.size() != NumElts*2)
5812     return false;
5813 
5814   for (unsigned i = 0; i < M.size(); i += NumElts) {
5815     if (M.size() == NumElts * 2)
5816       WhichResult = i / NumElts;
5817     else
5818       WhichResult = M[i] == 0 ? 0 : 1;
5819     for (unsigned j = 0; j < NumElts; j += 2) {
5820       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5821           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
5822         return false;
5823     }
5824   }
5825 
5826   if (M.size() == NumElts*2)
5827     WhichResult = 0;
5828 
5829   return true;
5830 }
5831 
5832 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
5833 // that the mask elements are either all even and in steps of size 2 or all odd
5834 // and in steps of size 2.
5835 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
5836 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
5837 //  v2={e,f,g,h}
5838 // Requires similar checks to that of isVTRNMask with
5839 // respect the how results are returned.
5840 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5841   unsigned EltSz = VT.getScalarSizeInBits();
5842   if (EltSz == 64)
5843     return false;
5844 
5845   unsigned NumElts = VT.getVectorNumElements();
5846   if (M.size() != NumElts && M.size() != NumElts*2)
5847     return false;
5848 
5849   for (unsigned i = 0; i < M.size(); i += NumElts) {
5850     WhichResult = M[i] == 0 ? 0 : 1;
5851     for (unsigned j = 0; j < NumElts; ++j) {
5852       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
5853         return false;
5854     }
5855   }
5856 
5857   if (M.size() == NumElts*2)
5858     WhichResult = 0;
5859 
5860   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5861   if (VT.is64BitVector() && EltSz == 32)
5862     return false;
5863 
5864   return true;
5865 }
5866 
5867 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
5868 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5869 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
5870 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5871   unsigned EltSz = VT.getScalarSizeInBits();
5872   if (EltSz == 64)
5873     return false;
5874 
5875   unsigned NumElts = VT.getVectorNumElements();
5876   if (M.size() != NumElts && M.size() != NumElts*2)
5877     return false;
5878 
5879   unsigned Half = NumElts / 2;
5880   for (unsigned i = 0; i < M.size(); i += NumElts) {
5881     WhichResult = M[i] == 0 ? 0 : 1;
5882     for (unsigned j = 0; j < NumElts; j += Half) {
5883       unsigned Idx = WhichResult;
5884       for (unsigned k = 0; k < Half; ++k) {
5885         int MIdx = M[i + j + k];
5886         if (MIdx >= 0 && (unsigned) MIdx != Idx)
5887           return false;
5888         Idx += 2;
5889       }
5890     }
5891   }
5892 
5893   if (M.size() == NumElts*2)
5894     WhichResult = 0;
5895 
5896   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5897   if (VT.is64BitVector() && EltSz == 32)
5898     return false;
5899 
5900   return true;
5901 }
5902 
5903 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
5904 // that pairs of elements of the shufflemask represent the same index in each
5905 // vector incrementing sequentially through the vectors.
5906 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
5907 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
5908 //  v2={e,f,g,h}
5909 // Requires similar checks to that of isVTRNMask with respect the how results
5910 // are returned.
5911 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5912   unsigned EltSz = VT.getScalarSizeInBits();
5913   if (EltSz == 64)
5914     return false;
5915 
5916   unsigned NumElts = VT.getVectorNumElements();
5917   if (M.size() != NumElts && M.size() != NumElts*2)
5918     return false;
5919 
5920   for (unsigned i = 0; i < M.size(); i += NumElts) {
5921     WhichResult = M[i] == 0 ? 0 : 1;
5922     unsigned Idx = WhichResult * NumElts / 2;
5923     for (unsigned j = 0; j < NumElts; j += 2) {
5924       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5925           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
5926         return false;
5927       Idx += 1;
5928     }
5929   }
5930 
5931   if (M.size() == NumElts*2)
5932     WhichResult = 0;
5933 
5934   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5935   if (VT.is64BitVector() && EltSz == 32)
5936     return false;
5937 
5938   return true;
5939 }
5940 
5941 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
5942 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5943 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5944 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5945   unsigned EltSz = VT.getScalarSizeInBits();
5946   if (EltSz == 64)
5947     return false;
5948 
5949   unsigned NumElts = VT.getVectorNumElements();
5950   if (M.size() != NumElts && M.size() != NumElts*2)
5951     return false;
5952 
5953   for (unsigned i = 0; i < M.size(); i += NumElts) {
5954     WhichResult = M[i] == 0 ? 0 : 1;
5955     unsigned Idx = WhichResult * NumElts / 2;
5956     for (unsigned j = 0; j < NumElts; j += 2) {
5957       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5958           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
5959         return false;
5960       Idx += 1;
5961     }
5962   }
5963 
5964   if (M.size() == NumElts*2)
5965     WhichResult = 0;
5966 
5967   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5968   if (VT.is64BitVector() && EltSz == 32)
5969     return false;
5970 
5971   return true;
5972 }
5973 
5974 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
5975 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
5976 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
5977                                            unsigned &WhichResult,
5978                                            bool &isV_UNDEF) {
5979   isV_UNDEF = false;
5980   if (isVTRNMask(ShuffleMask, VT, WhichResult))
5981     return ARMISD::VTRN;
5982   if (isVUZPMask(ShuffleMask, VT, WhichResult))
5983     return ARMISD::VUZP;
5984   if (isVZIPMask(ShuffleMask, VT, WhichResult))
5985     return ARMISD::VZIP;
5986 
5987   isV_UNDEF = true;
5988   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5989     return ARMISD::VTRN;
5990   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5991     return ARMISD::VUZP;
5992   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5993     return ARMISD::VZIP;
5994 
5995   return 0;
5996 }
5997 
5998 /// \return true if this is a reverse operation on an vector.
5999 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
6000   unsigned NumElts = VT.getVectorNumElements();
6001   // Make sure the mask has the right size.
6002   if (NumElts != M.size())
6003       return false;
6004 
6005   // Look for <15, ..., 3, -1, 1, 0>.
6006   for (unsigned i = 0; i != NumElts; ++i)
6007     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
6008       return false;
6009 
6010   return true;
6011 }
6012 
6013 // If N is an integer constant that can be moved into a register in one
6014 // instruction, return an SDValue of such a constant (will become a MOV
6015 // instruction).  Otherwise return null.
6016 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
6017                                      const ARMSubtarget *ST, const SDLoc &dl) {
6018   uint64_t Val;
6019   if (!isa<ConstantSDNode>(N))
6020     return SDValue();
6021   Val = cast<ConstantSDNode>(N)->getZExtValue();
6022 
6023   if (ST->isThumb1Only()) {
6024     if (Val <= 255 || ~Val <= 255)
6025       return DAG.getConstant(Val, dl, MVT::i32);
6026   } else {
6027     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
6028       return DAG.getConstant(Val, dl, MVT::i32);
6029   }
6030   return SDValue();
6031 }
6032 
6033 // If this is a case we can't handle, return null and let the default
6034 // expansion code take care of it.
6035 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
6036                                              const ARMSubtarget *ST) const {
6037   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
6038   SDLoc dl(Op);
6039   EVT VT = Op.getValueType();
6040 
6041   APInt SplatBits, SplatUndef;
6042   unsigned SplatBitSize;
6043   bool HasAnyUndefs;
6044   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
6045     if (SplatBitSize <= 64) {
6046       // Check if an immediate VMOV works.
6047       EVT VmovVT;
6048       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
6049                                       SplatUndef.getZExtValue(), SplatBitSize,
6050                                       DAG, dl, VmovVT, VT.is128BitVector(),
6051                                       VMOVModImm);
6052       if (Val.getNode()) {
6053         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
6054         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6055       }
6056 
6057       // Try an immediate VMVN.
6058       uint64_t NegatedImm = (~SplatBits).getZExtValue();
6059       Val = isNEONModifiedImm(NegatedImm,
6060                                       SplatUndef.getZExtValue(), SplatBitSize,
6061                                       DAG, dl, VmovVT, VT.is128BitVector(),
6062                                       VMVNModImm);
6063       if (Val.getNode()) {
6064         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
6065         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6066       }
6067 
6068       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
6069       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
6070         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
6071         if (ImmVal != -1) {
6072           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
6073           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
6074         }
6075       }
6076     }
6077   }
6078 
6079   // Scan through the operands to see if only one value is used.
6080   //
6081   // As an optimisation, even if more than one value is used it may be more
6082   // profitable to splat with one value then change some lanes.
6083   //
6084   // Heuristically we decide to do this if the vector has a "dominant" value,
6085   // defined as splatted to more than half of the lanes.
6086   unsigned NumElts = VT.getVectorNumElements();
6087   bool isOnlyLowElement = true;
6088   bool usesOnlyOneValue = true;
6089   bool hasDominantValue = false;
6090   bool isConstant = true;
6091 
6092   // Map of the number of times a particular SDValue appears in the
6093   // element list.
6094   DenseMap<SDValue, unsigned> ValueCounts;
6095   SDValue Value;
6096   for (unsigned i = 0; i < NumElts; ++i) {
6097     SDValue V = Op.getOperand(i);
6098     if (V.isUndef())
6099       continue;
6100     if (i > 0)
6101       isOnlyLowElement = false;
6102     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
6103       isConstant = false;
6104 
6105     ValueCounts.insert(std::make_pair(V, 0));
6106     unsigned &Count = ValueCounts[V];
6107 
6108     // Is this value dominant? (takes up more than half of the lanes)
6109     if (++Count > (NumElts / 2)) {
6110       hasDominantValue = true;
6111       Value = V;
6112     }
6113   }
6114   if (ValueCounts.size() != 1)
6115     usesOnlyOneValue = false;
6116   if (!Value.getNode() && ValueCounts.size() > 0)
6117     Value = ValueCounts.begin()->first;
6118 
6119   if (ValueCounts.size() == 0)
6120     return DAG.getUNDEF(VT);
6121 
6122   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
6123   // Keep going if we are hitting this case.
6124   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
6125     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
6126 
6127   unsigned EltSize = VT.getScalarSizeInBits();
6128 
6129   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
6130   // i32 and try again.
6131   if (hasDominantValue && EltSize <= 32) {
6132     if (!isConstant) {
6133       SDValue N;
6134 
6135       // If we are VDUPing a value that comes directly from a vector, that will
6136       // cause an unnecessary move to and from a GPR, where instead we could
6137       // just use VDUPLANE. We can only do this if the lane being extracted
6138       // is at a constant index, as the VDUP from lane instructions only have
6139       // constant-index forms.
6140       ConstantSDNode *constIndex;
6141       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6142           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
6143         // We need to create a new undef vector to use for the VDUPLANE if the
6144         // size of the vector from which we get the value is different than the
6145         // size of the vector that we need to create. We will insert the element
6146         // such that the register coalescer will remove unnecessary copies.
6147         if (VT != Value->getOperand(0).getValueType()) {
6148           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
6149                              VT.getVectorNumElements();
6150           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6151                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
6152                         Value, DAG.getConstant(index, dl, MVT::i32)),
6153                            DAG.getConstant(index, dl, MVT::i32));
6154         } else
6155           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6156                         Value->getOperand(0), Value->getOperand(1));
6157       } else
6158         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
6159 
6160       if (!usesOnlyOneValue) {
6161         // The dominant value was splatted as 'N', but we now have to insert
6162         // all differing elements.
6163         for (unsigned I = 0; I < NumElts; ++I) {
6164           if (Op.getOperand(I) == Value)
6165             continue;
6166           SmallVector<SDValue, 3> Ops;
6167           Ops.push_back(N);
6168           Ops.push_back(Op.getOperand(I));
6169           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
6170           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
6171         }
6172       }
6173       return N;
6174     }
6175     if (VT.getVectorElementType().isFloatingPoint()) {
6176       SmallVector<SDValue, 8> Ops;
6177       for (unsigned i = 0; i < NumElts; ++i)
6178         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6179                                   Op.getOperand(i)));
6180       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
6181       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
6182       Val = LowerBUILD_VECTOR(Val, DAG, ST);
6183       if (Val.getNode())
6184         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6185     }
6186     if (usesOnlyOneValue) {
6187       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
6188       if (isConstant && Val.getNode())
6189         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
6190     }
6191   }
6192 
6193   // If all elements are constants and the case above didn't get hit, fall back
6194   // to the default expansion, which will generate a load from the constant
6195   // pool.
6196   if (isConstant)
6197     return SDValue();
6198 
6199   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
6200   if (NumElts >= 4) {
6201     SDValue shuffle = ReconstructShuffle(Op, DAG);
6202     if (shuffle != SDValue())
6203       return shuffle;
6204   }
6205 
6206   // Vectors with 32- or 64-bit elements can be built by directly assigning
6207   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
6208   // will be legalized.
6209   if (EltSize >= 32) {
6210     // Do the expansion with floating-point types, since that is what the VFP
6211     // registers are defined to use, and since i64 is not legal.
6212     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6213     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6214     SmallVector<SDValue, 8> Ops;
6215     for (unsigned i = 0; i < NumElts; ++i)
6216       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
6217     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6218     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6219   }
6220 
6221   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
6222   // know the default expansion would otherwise fall back on something even
6223   // worse. For a vector with one or two non-undef values, that's
6224   // scalar_to_vector for the elements followed by a shuffle (provided the
6225   // shuffle is valid for the target) and materialization element by element
6226   // on the stack followed by a load for everything else.
6227   if (!isConstant && !usesOnlyOneValue) {
6228     SDValue Vec = DAG.getUNDEF(VT);
6229     for (unsigned i = 0 ; i < NumElts; ++i) {
6230       SDValue V = Op.getOperand(i);
6231       if (V.isUndef())
6232         continue;
6233       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
6234       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
6235     }
6236     return Vec;
6237   }
6238 
6239   return SDValue();
6240 }
6241 
6242 // Gather data to see if the operation can be modelled as a
6243 // shuffle in combination with VEXTs.
6244 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
6245                                               SelectionDAG &DAG) const {
6246   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
6247   SDLoc dl(Op);
6248   EVT VT = Op.getValueType();
6249   unsigned NumElts = VT.getVectorNumElements();
6250 
6251   struct ShuffleSourceInfo {
6252     SDValue Vec;
6253     unsigned MinElt;
6254     unsigned MaxElt;
6255 
6256     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
6257     // be compatible with the shuffle we intend to construct. As a result
6258     // ShuffleVec will be some sliding window into the original Vec.
6259     SDValue ShuffleVec;
6260 
6261     // Code should guarantee that element i in Vec starts at element "WindowBase
6262     // + i * WindowScale in ShuffleVec".
6263     int WindowBase;
6264     int WindowScale;
6265 
6266     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
6267     ShuffleSourceInfo(SDValue Vec)
6268         : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
6269           WindowScale(1) {}
6270   };
6271 
6272   // First gather all vectors used as an immediate source for this BUILD_VECTOR
6273   // node.
6274   SmallVector<ShuffleSourceInfo, 2> Sources;
6275   for (unsigned i = 0; i < NumElts; ++i) {
6276     SDValue V = Op.getOperand(i);
6277     if (V.isUndef())
6278       continue;
6279     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
6280       // A shuffle can only come from building a vector from various
6281       // elements of other vectors.
6282       return SDValue();
6283     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
6284       // Furthermore, shuffles require a constant mask, whereas extractelts
6285       // accept variable indices.
6286       return SDValue();
6287     }
6288 
6289     // Add this element source to the list if it's not already there.
6290     SDValue SourceVec = V.getOperand(0);
6291     auto Source = find(Sources, SourceVec);
6292     if (Source == Sources.end())
6293       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
6294 
6295     // Update the minimum and maximum lane number seen.
6296     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
6297     Source->MinElt = std::min(Source->MinElt, EltNo);
6298     Source->MaxElt = std::max(Source->MaxElt, EltNo);
6299   }
6300 
6301   // Currently only do something sane when at most two source vectors
6302   // are involved.
6303   if (Sources.size() > 2)
6304     return SDValue();
6305 
6306   // Find out the smallest element size among result and two sources, and use
6307   // it as element size to build the shuffle_vector.
6308   EVT SmallestEltTy = VT.getVectorElementType();
6309   for (auto &Source : Sources) {
6310     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
6311     if (SrcEltTy.bitsLT(SmallestEltTy))
6312       SmallestEltTy = SrcEltTy;
6313   }
6314   unsigned ResMultiplier =
6315       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
6316   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
6317   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
6318 
6319   // If the source vector is too wide or too narrow, we may nevertheless be able
6320   // to construct a compatible shuffle either by concatenating it with UNDEF or
6321   // extracting a suitable range of elements.
6322   for (auto &Src : Sources) {
6323     EVT SrcVT = Src.ShuffleVec.getValueType();
6324 
6325     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
6326       continue;
6327 
6328     // This stage of the search produces a source with the same element type as
6329     // the original, but with a total width matching the BUILD_VECTOR output.
6330     EVT EltVT = SrcVT.getVectorElementType();
6331     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
6332     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
6333 
6334     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
6335       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
6336         return SDValue();
6337       // We can pad out the smaller vector for free, so if it's part of a
6338       // shuffle...
6339       Src.ShuffleVec =
6340           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
6341                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
6342       continue;
6343     }
6344 
6345     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
6346       return SDValue();
6347 
6348     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
6349       // Span too large for a VEXT to cope
6350       return SDValue();
6351     }
6352 
6353     if (Src.MinElt >= NumSrcElts) {
6354       // The extraction can just take the second half
6355       Src.ShuffleVec =
6356           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6357                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6358       Src.WindowBase = -NumSrcElts;
6359     } else if (Src.MaxElt < NumSrcElts) {
6360       // The extraction can just take the first half
6361       Src.ShuffleVec =
6362           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6363                       DAG.getConstant(0, dl, MVT::i32));
6364     } else {
6365       // An actual VEXT is needed
6366       SDValue VEXTSrc1 =
6367           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6368                       DAG.getConstant(0, dl, MVT::i32));
6369       SDValue VEXTSrc2 =
6370           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6371                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6372 
6373       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
6374                                    VEXTSrc2,
6375                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
6376       Src.WindowBase = -Src.MinElt;
6377     }
6378   }
6379 
6380   // Another possible incompatibility occurs from the vector element types. We
6381   // can fix this by bitcasting the source vectors to the same type we intend
6382   // for the shuffle.
6383   for (auto &Src : Sources) {
6384     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
6385     if (SrcEltTy == SmallestEltTy)
6386       continue;
6387     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
6388     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
6389     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
6390     Src.WindowBase *= Src.WindowScale;
6391   }
6392 
6393   // Final sanity check before we try to actually produce a shuffle.
6394   DEBUG(
6395     for (auto Src : Sources)
6396       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
6397   );
6398 
6399   // The stars all align, our next step is to produce the mask for the shuffle.
6400   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
6401   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
6402   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
6403     SDValue Entry = Op.getOperand(i);
6404     if (Entry.isUndef())
6405       continue;
6406 
6407     auto Src = find(Sources, Entry.getOperand(0));
6408     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
6409 
6410     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
6411     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
6412     // segment.
6413     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
6414     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
6415                                VT.getScalarSizeInBits());
6416     int LanesDefined = BitsDefined / BitsPerShuffleLane;
6417 
6418     // This source is expected to fill ResMultiplier lanes of the final shuffle,
6419     // starting at the appropriate offset.
6420     int *LaneMask = &Mask[i * ResMultiplier];
6421 
6422     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
6423     ExtractBase += NumElts * (Src - Sources.begin());
6424     for (int j = 0; j < LanesDefined; ++j)
6425       LaneMask[j] = ExtractBase + j;
6426   }
6427 
6428   // Final check before we try to produce nonsense...
6429   if (!isShuffleMaskLegal(Mask, ShuffleVT))
6430     return SDValue();
6431 
6432   // We can't handle more than two sources. This should have already
6433   // been checked before this point.
6434   assert(Sources.size() <= 2 && "Too many sources!");
6435 
6436   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
6437   for (unsigned i = 0; i < Sources.size(); ++i)
6438     ShuffleOps[i] = Sources[i].ShuffleVec;
6439 
6440   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
6441                                          ShuffleOps[1], Mask);
6442   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
6443 }
6444 
6445 /// isShuffleMaskLegal - Targets can use this to indicate that they only
6446 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6447 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6448 /// are assumed to be legal.
6449 bool
6450 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
6451                                       EVT VT) const {
6452   if (VT.getVectorNumElements() == 4 &&
6453       (VT.is128BitVector() || VT.is64BitVector())) {
6454     unsigned PFIndexes[4];
6455     for (unsigned i = 0; i != 4; ++i) {
6456       if (M[i] < 0)
6457         PFIndexes[i] = 8;
6458       else
6459         PFIndexes[i] = M[i];
6460     }
6461 
6462     // Compute the index in the perfect shuffle table.
6463     unsigned PFTableIndex =
6464       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6465     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6466     unsigned Cost = (PFEntry >> 30);
6467 
6468     if (Cost <= 4)
6469       return true;
6470   }
6471 
6472   bool ReverseVEXT, isV_UNDEF;
6473   unsigned Imm, WhichResult;
6474 
6475   unsigned EltSize = VT.getScalarSizeInBits();
6476   return (EltSize >= 32 ||
6477           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
6478           isVREVMask(M, VT, 64) ||
6479           isVREVMask(M, VT, 32) ||
6480           isVREVMask(M, VT, 16) ||
6481           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
6482           isVTBLMask(M, VT) ||
6483           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
6484           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
6485 }
6486 
6487 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
6488 /// the specified operations to build the shuffle.
6489 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
6490                                       SDValue RHS, SelectionDAG &DAG,
6491                                       const SDLoc &dl) {
6492   unsigned OpNum = (PFEntry >> 26) & 0x0F;
6493   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
6494   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
6495 
6496   enum {
6497     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
6498     OP_VREV,
6499     OP_VDUP0,
6500     OP_VDUP1,
6501     OP_VDUP2,
6502     OP_VDUP3,
6503     OP_VEXT1,
6504     OP_VEXT2,
6505     OP_VEXT3,
6506     OP_VUZPL, // VUZP, left result
6507     OP_VUZPR, // VUZP, right result
6508     OP_VZIPL, // VZIP, left result
6509     OP_VZIPR, // VZIP, right result
6510     OP_VTRNL, // VTRN, left result
6511     OP_VTRNR  // VTRN, right result
6512   };
6513 
6514   if (OpNum == OP_COPY) {
6515     if (LHSID == (1*9+2)*9+3) return LHS;
6516     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
6517     return RHS;
6518   }
6519 
6520   SDValue OpLHS, OpRHS;
6521   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
6522   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
6523   EVT VT = OpLHS.getValueType();
6524 
6525   switch (OpNum) {
6526   default: llvm_unreachable("Unknown shuffle opcode!");
6527   case OP_VREV:
6528     // VREV divides the vector in half and swaps within the half.
6529     if (VT.getVectorElementType() == MVT::i32 ||
6530         VT.getVectorElementType() == MVT::f32)
6531       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
6532     // vrev <4 x i16> -> VREV32
6533     if (VT.getVectorElementType() == MVT::i16)
6534       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
6535     // vrev <4 x i8> -> VREV16
6536     assert(VT.getVectorElementType() == MVT::i8);
6537     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
6538   case OP_VDUP0:
6539   case OP_VDUP1:
6540   case OP_VDUP2:
6541   case OP_VDUP3:
6542     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6543                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
6544   case OP_VEXT1:
6545   case OP_VEXT2:
6546   case OP_VEXT3:
6547     return DAG.getNode(ARMISD::VEXT, dl, VT,
6548                        OpLHS, OpRHS,
6549                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
6550   case OP_VUZPL:
6551   case OP_VUZPR:
6552     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
6553                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
6554   case OP_VZIPL:
6555   case OP_VZIPR:
6556     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
6557                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
6558   case OP_VTRNL:
6559   case OP_VTRNR:
6560     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
6561                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
6562   }
6563 }
6564 
6565 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
6566                                        ArrayRef<int> ShuffleMask,
6567                                        SelectionDAG &DAG) {
6568   // Check to see if we can use the VTBL instruction.
6569   SDValue V1 = Op.getOperand(0);
6570   SDValue V2 = Op.getOperand(1);
6571   SDLoc DL(Op);
6572 
6573   SmallVector<SDValue, 8> VTBLMask;
6574   for (ArrayRef<int>::iterator
6575          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
6576     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
6577 
6578   if (V2.getNode()->isUndef())
6579     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
6580                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6581 
6582   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
6583                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6584 }
6585 
6586 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
6587                                                       SelectionDAG &DAG) {
6588   SDLoc DL(Op);
6589   SDValue OpLHS = Op.getOperand(0);
6590   EVT VT = OpLHS.getValueType();
6591 
6592   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
6593          "Expect an v8i16/v16i8 type");
6594   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
6595   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
6596   // extract the first 8 bytes into the top double word and the last 8 bytes
6597   // into the bottom double word. The v8i16 case is similar.
6598   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
6599   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
6600                      DAG.getConstant(ExtractNum, DL, MVT::i32));
6601 }
6602 
/// LowerVECTOR_SHUFFLE - Custom lowering for VECTOR_SHUFFLE nodes.
///
/// The matchers below are tried strictly in order and the first one that
/// applies wins:
///   1. (elements <= 32 bits) splat              -> VDUP / VDUPLANE
///   2. (elements <= 32 bits) isVEXTMask         -> VEXT
///   3. (elements <= 32 bits) isVREVMask         -> VREV64 / VREV32 / VREV16
///   4. (elements <= 32 bits) single-source VEXT -> VEXT V1, V1
///   5. (elements <= 32 bits) two-result shuffle, directly or looking
///      through a CONCAT_VECTORS of the two sources
///   6. four-element vectors                     -> perfect-shuffle table
///   7. (elements >= 32 bits)                    -> ARMISD::BUILD_VECTOR of
///      extracted elements
///   8. v8i16 / v16i8 full reversal              -> VREV64 + VEXT
///   9. v8i8                                     -> VTBL1 / VTBL2
/// An empty SDValue is returned when nothing matches.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection.  This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same size so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  unsigned EltSize = VT.getScalarSizeInBits();
  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        // Equivalent means: every operand after the first is undef.
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // General splat: duplicate the chosen lane of V1.
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT;
    unsigned Imm;
    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      // The matcher reports via ReverseVEXT when the operands must be
      // exchanged for the VEXT to produce the requested lanes.
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Reversal within 64-, 32- or 16-bit blocks; widest block tried first.
    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // With an undef second operand, a VEXT of V1 with itself can be used.
    if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult;
    bool isV_UNDEF;
    if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
            ShuffleMask, VT, WhichResult, isV_UNDEF)) {
      // isV_UNDEF: the mask only references V1, so feed V1 to both inputs.
      if (isV_UNDEF)
        V2 = V1;
      return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
          .getValue(WhichResult);
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(all_of(ShuffleMask, [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

      // Note the mask is matched against the *sub-vector* type here: the
      // full-width mask is interpreted as a two-result shuffle of the halves.
      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        // Both results of the two-result op are used: concatenated, they
        // form the full-width shuffle result.
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    // Each mask entry becomes one base-9 digit; undef (negative) lanes are
    // encoded as 8.
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    // The cost is stored in the bits above bit 29 of the table entry.
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        // Mask values below NumElts select from V1, the rest from V2. The
        // "& (NumElts-1)" reduces the mask value to a lane number inside the
        // chosen operand (this relies on NumElts being a power of two).
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    // Cast back from the floating-point vector type to the requested type.
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // Whole-vector reversal of v8i16 / v16i8.
  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  // Last resort for v8i8: a table lookup.
  if (VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  // No custom lowering applies.
  return SDValue();
}
6775 
6776 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6777   // INSERT_VECTOR_ELT is legal only for immediate indexes.
6778   SDValue Lane = Op.getOperand(2);
6779   if (!isa<ConstantSDNode>(Lane))
6780     return SDValue();
6781 
6782   return Op;
6783 }
6784 
6785 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6786   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
6787   SDValue Lane = Op.getOperand(1);
6788   if (!isa<ConstantSDNode>(Lane))
6789     return SDValue();
6790 
6791   SDValue Vec = Op.getOperand(0);
6792   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
6793     SDLoc dl(Op);
6794     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
6795   }
6796 
6797   return Op;
6798 }
6799 
6800 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6801   // The only time a CONCAT_VECTORS operation can have legal types is when
6802   // two 64-bit vectors are concatenated to a 128-bit vector.
6803   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
6804          "unexpected CONCAT_VECTORS");
6805   SDLoc dl(Op);
6806   SDValue Val = DAG.getUNDEF(MVT::v2f64);
6807   SDValue Op0 = Op.getOperand(0);
6808   SDValue Op1 = Op.getOperand(1);
6809   if (!Op0.isUndef())
6810     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6811                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
6812                       DAG.getIntPtrConstant(0, dl));
6813   if (!Op1.isUndef())
6814     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6815                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
6816                       DAG.getIntPtrConstant(1, dl));
6817   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
6818 }
6819 
6820 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
6821 /// element has been zero/sign-extended, depending on the isSigned parameter,
6822 /// from an integer type half its size.
6823 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
6824                                    bool isSigned) {
6825   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
6826   EVT VT = N->getValueType(0);
6827   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
6828     SDNode *BVN = N->getOperand(0).getNode();
6829     if (BVN->getValueType(0) != MVT::v4i32 ||
6830         BVN->getOpcode() != ISD::BUILD_VECTOR)
6831       return false;
6832     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6833     unsigned HiElt = 1 - LoElt;
6834     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
6835     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
6836     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
6837     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
6838     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
6839       return false;
6840     if (isSigned) {
6841       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
6842           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
6843         return true;
6844     } else {
6845       if (Hi0->isNullValue() && Hi1->isNullValue())
6846         return true;
6847     }
6848     return false;
6849   }
6850 
6851   if (N->getOpcode() != ISD::BUILD_VECTOR)
6852     return false;
6853 
6854   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
6855     SDNode *Elt = N->getOperand(i).getNode();
6856     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
6857       unsigned EltSize = VT.getScalarSizeInBits();
6858       unsigned HalfSize = EltSize / 2;
6859       if (isSigned) {
6860         if (!isIntN(HalfSize, C->getSExtValue()))
6861           return false;
6862       } else {
6863         if (!isUIntN(HalfSize, C->getZExtValue()))
6864           return false;
6865       }
6866       continue;
6867     }
6868     return false;
6869   }
6870 
6871   return true;
6872 }
6873 
6874 /// isSignExtended - Check if a node is a vector value that is sign-extended
6875 /// or a constant BUILD_VECTOR with sign-extended elements.
6876 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
6877   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
6878     return true;
6879   if (isExtendedBUILD_VECTOR(N, DAG, true))
6880     return true;
6881   return false;
6882 }
6883 
6884 /// isZeroExtended - Check if a node is a vector value that is zero-extended
6885 /// or a constant BUILD_VECTOR with zero-extended elements.
6886 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
6887   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
6888     return true;
6889   if (isExtendedBUILD_VECTOR(N, DAG, false))
6890     return true;
6891   return false;
6892 }
6893 
6894 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
6895   if (OrigVT.getSizeInBits() >= 64)
6896     return OrigVT;
6897 
6898   assert(OrigVT.isSimple() && "Expecting a simple value type");
6899 
6900   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
6901   switch (OrigSimpleTy) {
6902   default: llvm_unreachable("Unexpected Vector Type");
6903   case MVT::v2i8:
6904   case MVT::v2i16:
6905      return MVT::v2i32;
6906   case MVT::v4i8:
6907     return  MVT::v4i16;
6908   }
6909 }
6910 
6911 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
6912 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
6913 /// We insert the required extension here to get the vector to fill a D register.
6914 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
6915                                             const EVT &OrigTy,
6916                                             const EVT &ExtTy,
6917                                             unsigned ExtOpcode) {
6918   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
6919   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
6920   // 64-bits we need to insert a new extension so that it will be 64-bits.
6921   assert(ExtTy.is128BitVector() && "Unexpected extension size");
6922   if (OrigTy.getSizeInBits() >= 64)
6923     return N;
6924 
6925   // Must extend size to at least 64 bits to be used as an operand for VMULL.
6926   EVT NewVT = getExtensionTo64Bits(OrigTy);
6927 
6928   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
6929 }
6930 
6931 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
6932 /// does not do any sign/zero extension. If the original vector is less
6933 /// than 64 bits, an appropriate extension will be added after the load to
6934 /// reach a total size of 64 bits. We have to add the extension separately
6935 /// because ARM does not have a sign/zero extending load for vectors.
6936 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
6937   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
6938 
6939   // The load already has the right type.
6940   if (ExtendedTy == LD->getMemoryVT())
6941     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
6942                        LD->getBasePtr(), LD->getPointerInfo(),
6943                        LD->getAlignment(), LD->getMemOperand()->getFlags());
6944 
6945   // We need to create a zextload/sextload. We cannot just create a load
6946   // followed by a zext/zext node because LowerMUL is also run during normal
6947   // operation legalization where we can't create illegal types.
6948   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
6949                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
6950                         LD->getMemoryVT(), LD->getAlignment(),
6951                         LD->getMemOperand()->getFlags());
6952 }
6953 
6954 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
6955 /// extending load, or BUILD_VECTOR with extended elements, return the
6956 /// unextended value. The unextended vector should be 64 bits so that it can
6957 /// be used as an operand to a VMULL instruction. If the original vector size
6958 /// before extension is less than 64 bits we add a an extension to resize
6959 /// the vector to 64 bits.
6960 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
6961   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
6962     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
6963                                         N->getOperand(0)->getValueType(0),
6964                                         N->getValueType(0),
6965                                         N->getOpcode());
6966 
6967   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
6968     return SkipLoadExtensionForVMULL(LD, DAG);
6969 
6970   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
6971   // have been legalized as a BITCAST from v4i32.
6972   if (N->getOpcode() == ISD::BITCAST) {
6973     SDNode *BVN = N->getOperand(0).getNode();
6974     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
6975            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
6976     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6977     return DAG.getBuildVector(
6978         MVT::v2i32, SDLoc(N),
6979         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
6980   }
6981   // Construct a new BUILD_VECTOR with elements truncated to half the size.
6982   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
6983   EVT VT = N->getValueType(0);
6984   unsigned EltSize = VT.getScalarSizeInBits() / 2;
6985   unsigned NumElts = VT.getVectorNumElements();
6986   MVT TruncVT = MVT::getIntegerVT(EltSize);
6987   SmallVector<SDValue, 8> Ops;
6988   SDLoc dl(N);
6989   for (unsigned i = 0; i != NumElts; ++i) {
6990     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
6991     const APInt &CInt = C->getAPIntValue();
6992     // Element types smaller than 32 bits are not legal, so use i32 elements.
6993     // The values are implicitly truncated so sext vs. zext doesn't matter.
6994     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
6995   }
6996   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
6997 }
6998 
6999 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
7000   unsigned Opcode = N->getOpcode();
7001   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7002     SDNode *N0 = N->getOperand(0).getNode();
7003     SDNode *N1 = N->getOperand(1).getNode();
7004     return N0->hasOneUse() && N1->hasOneUse() &&
7005       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
7006   }
7007   return false;
7008 }
7009 
7010 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
7011   unsigned Opcode = N->getOpcode();
7012   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7013     SDNode *N0 = N->getOperand(0).getNode();
7014     SDNode *N1 = N->getOperand(1).getNode();
7015     return N0->hasOneUse() && N1->hasOneUse() &&
7016       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
7017   }
7018   return false;
7019 }
7020 
7021 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
7022   // Multiplications are only custom-lowered for 128-bit vectors so that
7023   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
7024   EVT VT = Op.getValueType();
7025   assert(VT.is128BitVector() && VT.isInteger() &&
7026          "unexpected type for custom-lowering ISD::MUL");
7027   SDNode *N0 = Op.getOperand(0).getNode();
7028   SDNode *N1 = Op.getOperand(1).getNode();
7029   unsigned NewOpc = 0;
7030   bool isMLA = false;
7031   bool isN0SExt = isSignExtended(N0, DAG);
7032   bool isN1SExt = isSignExtended(N1, DAG);
7033   if (isN0SExt && isN1SExt)
7034     NewOpc = ARMISD::VMULLs;
7035   else {
7036     bool isN0ZExt = isZeroExtended(N0, DAG);
7037     bool isN1ZExt = isZeroExtended(N1, DAG);
7038     if (isN0ZExt && isN1ZExt)
7039       NewOpc = ARMISD::VMULLu;
7040     else if (isN1SExt || isN1ZExt) {
7041       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
7042       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
7043       if (isN1SExt && isAddSubSExt(N0, DAG)) {
7044         NewOpc = ARMISD::VMULLs;
7045         isMLA = true;
7046       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
7047         NewOpc = ARMISD::VMULLu;
7048         isMLA = true;
7049       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
7050         std::swap(N0, N1);
7051         NewOpc = ARMISD::VMULLu;
7052         isMLA = true;
7053       }
7054     }
7055 
7056     if (!NewOpc) {
7057       if (VT == MVT::v2i64)
7058         // Fall through to expand this.  It is not legal.
7059         return SDValue();
7060       else
7061         // Other vector multiplications are legal.
7062         return Op;
7063     }
7064   }
7065 
7066   // Legalize to a VMULL instruction.
7067   SDLoc DL(Op);
7068   SDValue Op0;
7069   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
7070   if (!isMLA) {
7071     Op0 = SkipExtensionForVMULL(N0, DAG);
7072     assert(Op0.getValueType().is64BitVector() &&
7073            Op1.getValueType().is64BitVector() &&
7074            "unexpected types for extended operands to VMULL");
7075     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
7076   }
7077 
7078   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
7079   // isel lowering to take advantage of no-stall back to back vmul + vmla.
7080   //   vmull q0, d4, d6
7081   //   vmlal q0, d5, d6
7082   // is faster than
7083   //   vaddl q0, d4, d5
7084   //   vmovl q1, d6
7085   //   vmul  q0, q0, q1
7086   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
7087   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
7088   EVT Op1VT = Op1.getValueType();
7089   return DAG.getNode(N0->getOpcode(), DL, VT,
7090                      DAG.getNode(NewOpc, DL, VT,
7091                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
7092                      DAG.getNode(NewOpc, DL, VT,
7093                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
7094 }
7095 
7096 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
7097                               SelectionDAG &DAG) {
7098   // TODO: Should this propagate fast-math-flags?
7099 
7100   // Convert to float
7101   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
7102   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
7103   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
7104   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
7105   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
7106   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
7107   // Get reciprocal estimate.
7108   // float4 recip = vrecpeq_f32(yf);
7109   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7110                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7111                    Y);
7112   // Because char has a smaller range than uchar, we can actually get away
7113   // without any newton steps.  This requires that we use a weird bias
7114   // of 0xb000, however (again, this has been exhaustively tested).
7115   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
7116   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
7117   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
7118   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
7119   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
7120   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
7121   // Convert back to short.
7122   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
7123   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
7124   return X;
7125 }
7126 
7127 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
7128                                SelectionDAG &DAG) {
7129   // TODO: Should this propagate fast-math-flags?
7130 
7131   SDValue N2;
7132   // Convert to float.
7133   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
7134   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
7135   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
7136   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
7137   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7138   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7139 
7140   // Use reciprocal estimate and one refinement step.
7141   // float4 recip = vrecpeq_f32(yf);
7142   // recip *= vrecpsq_f32(yf, recip);
7143   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7144                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7145                    N1);
7146   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7147                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7148                    N1, N2);
7149   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7150   // Because short has a smaller range than ushort, we can actually get away
7151   // with only a single newton step.  This requires that we use a weird bias
7152   // of 89, however (again, this has been exhaustively tested).
7153   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
7154   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7155   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7156   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
7157   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7158   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7159   // Convert back to integer and return.
7160   // return vmovn_s32(vcvt_s32_f32(result));
7161   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7162   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7163   return N0;
7164 }
7165 
7166 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
7167   EVT VT = Op.getValueType();
7168   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7169          "unexpected type for custom-lowering ISD::SDIV");
7170 
7171   SDLoc dl(Op);
7172   SDValue N0 = Op.getOperand(0);
7173   SDValue N1 = Op.getOperand(1);
7174   SDValue N2, N3;
7175 
7176   if (VT == MVT::v8i8) {
7177     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
7178     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
7179 
7180     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7181                      DAG.getIntPtrConstant(4, dl));
7182     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7183                      DAG.getIntPtrConstant(4, dl));
7184     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7185                      DAG.getIntPtrConstant(0, dl));
7186     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7187                      DAG.getIntPtrConstant(0, dl));
7188 
7189     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
7190     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
7191 
7192     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7193     N0 = LowerCONCAT_VECTORS(N0, DAG);
7194 
7195     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
7196     return N0;
7197   }
7198   return LowerSDIV_v4i16(N0, N1, dl, DAG);
7199 }
7200 
7201 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
7202   // TODO: Should this propagate fast-math-flags?
7203   EVT VT = Op.getValueType();
7204   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7205          "unexpected type for custom-lowering ISD::UDIV");
7206 
7207   SDLoc dl(Op);
7208   SDValue N0 = Op.getOperand(0);
7209   SDValue N1 = Op.getOperand(1);
7210   SDValue N2, N3;
7211 
7212   if (VT == MVT::v8i8) {
7213     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
7214     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
7215 
7216     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7217                      DAG.getIntPtrConstant(4, dl));
7218     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7219                      DAG.getIntPtrConstant(4, dl));
7220     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7221                      DAG.getIntPtrConstant(0, dl));
7222     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7223                      DAG.getIntPtrConstant(0, dl));
7224 
7225     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
7226     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
7227 
7228     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7229     N0 = LowerCONCAT_VECTORS(N0, DAG);
7230 
7231     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
7232                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
7233                                      MVT::i32),
7234                      N0);
7235     return N0;
7236   }
7237 
7238   // v4i16 sdiv ... Convert to float.
7239   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
7240   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
7241   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
7242   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
7243   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7244   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7245 
7246   // Use reciprocal estimate and two refinement steps.
7247   // float4 recip = vrecpeq_f32(yf);
7248   // recip *= vrecpsq_f32(yf, recip);
7249   // recip *= vrecpsq_f32(yf, recip);
7250   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7251                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7252                    BN1);
7253   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7254                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7255                    BN1, N2);
7256   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7257   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7258                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7259                    BN1, N2);
7260   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7261   // Simply multiplying by the reciprocal estimate can leave us a few ulps
7262   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
7263   // and that it will never cause us to return an answer too large).
7264   // float4 result = as_float4(as_int4(xf*recip) + 2);
7265   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7266   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7267   N1 = DAG.getConstant(2, dl, MVT::v4i32);
7268   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7269   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7270   // Convert back to integer and return.
7271   // return vmovn_u32(vcvt_s32_f32(result));
7272   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7273   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7274   return N0;
7275 }
7276 
7277 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
7278   EVT VT = Op.getNode()->getValueType(0);
7279   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
7280 
7281   unsigned Opc;
7282   bool ExtraOp = false;
7283   switch (Op.getOpcode()) {
7284   default: llvm_unreachable("Invalid code");
7285   case ISD::ADDC: Opc = ARMISD::ADDC; break;
7286   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
7287   case ISD::SUBC: Opc = ARMISD::SUBC; break;
7288   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
7289   }
7290 
7291   if (!ExtraOp)
7292     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7293                        Op.getOperand(1));
7294   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7295                      Op.getOperand(1), Op.getOperand(2));
7296 }
7297 
7298 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
7299   assert(Subtarget->isTargetDarwin());
7300 
7301   // For iOS, we want to call an alternative entry point: __sincos_stret,
7302   // return values are passed via sret.
7303   SDLoc dl(Op);
7304   SDValue Arg = Op.getOperand(0);
7305   EVT ArgVT = Arg.getValueType();
7306   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
7307   auto PtrVT = getPointerTy(DAG.getDataLayout());
7308 
7309   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7310   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7311 
7312   // Pair of floats / doubles used to pass the result.
7313   Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
7314   auto &DL = DAG.getDataLayout();
7315 
7316   ArgListTy Args;
7317   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
7318   SDValue SRet;
7319   if (ShouldUseSRet) {
7320     // Create stack object for sret.
7321     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
7322     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
7323     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
7324     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
7325 
7326     ArgListEntry Entry;
7327     Entry.Node = SRet;
7328     Entry.Ty = RetTy->getPointerTo();
7329     Entry.isSExt = false;
7330     Entry.isZExt = false;
7331     Entry.isSRet = true;
7332     Args.push_back(Entry);
7333     RetTy = Type::getVoidTy(*DAG.getContext());
7334   }
7335 
7336   ArgListEntry Entry;
7337   Entry.Node = Arg;
7338   Entry.Ty = ArgTy;
7339   Entry.isSExt = false;
7340   Entry.isZExt = false;
7341   Args.push_back(Entry);
7342 
7343   const char *LibcallName =
7344       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
7345   RTLIB::Libcall LC =
7346       (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
7347   CallingConv::ID CC = getLibcallCallingConv(LC);
7348   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
7349 
7350   TargetLowering::CallLoweringInfo CLI(DAG);
7351   CLI.setDebugLoc(dl)
7352       .setChain(DAG.getEntryNode())
7353       .setCallee(CC, RetTy, Callee, std::move(Args))
7354       .setDiscardResult(ShouldUseSRet);
7355   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7356 
7357   if (!ShouldUseSRet)
7358     return CallResult.first;
7359 
7360   SDValue LoadSin =
7361       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
7362 
7363   // Address of cos field.
7364   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
7365                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
7366   SDValue LoadCos =
7367       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
7368 
7369   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
7370   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
7371                      LoadSin.getValue(0), LoadCos.getValue(0));
7372 }
7373 
7374 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
7375                                                   bool Signed,
7376                                                   SDValue &Chain) const {
7377   EVT VT = Op.getValueType();
7378   assert((VT == MVT::i32 || VT == MVT::i64) &&
7379          "unexpected type for custom lowering DIV");
7380   SDLoc dl(Op);
7381 
7382   const auto &DL = DAG.getDataLayout();
7383   const auto &TLI = DAG.getTargetLoweringInfo();
7384 
7385   const char *Name = nullptr;
7386   if (Signed)
7387     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
7388   else
7389     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
7390 
7391   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
7392 
7393   ARMTargetLowering::ArgListTy Args;
7394 
7395   for (auto AI : {1, 0}) {
7396     ArgListEntry Arg;
7397     Arg.Node = Op.getOperand(AI);
7398     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
7399     Args.push_back(Arg);
7400   }
7401 
7402   CallLoweringInfo CLI(DAG);
7403   CLI.setDebugLoc(dl)
7404     .setChain(Chain)
7405     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
7406                ES, std::move(Args));
7407 
7408   return LowerCallTo(CLI).first;
7409 }
7410 
7411 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
7412                                             bool Signed) const {
7413   assert(Op.getValueType() == MVT::i32 &&
7414          "unexpected type for custom lowering DIV");
7415   SDLoc dl(Op);
7416 
7417   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
7418                                DAG.getEntryNode(), Op.getOperand(1));
7419 
7420   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7421 }
7422 
7423 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
7424   SDLoc DL(N);
7425   SDValue Op = N->getOperand(1);
7426   if (N->getValueType(0) == MVT::i32)
7427     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
7428   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7429                            DAG.getConstant(0, DL, MVT::i32));
7430   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7431                            DAG.getConstant(1, DL, MVT::i32));
7432   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
7433                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
7434 }
7435 
7436 void ARMTargetLowering::ExpandDIV_Windows(
7437     SDValue Op, SelectionDAG &DAG, bool Signed,
7438     SmallVectorImpl<SDValue> &Results) const {
7439   const auto &DL = DAG.getDataLayout();
7440   const auto &TLI = DAG.getTargetLoweringInfo();
7441 
7442   assert(Op.getValueType() == MVT::i64 &&
7443          "unexpected type for custom lowering DIV");
7444   SDLoc dl(Op);
7445 
7446   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
7447 
7448   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7449 
7450   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
7451   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
7452                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
7453   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
7454 
7455   Results.push_back(Lower);
7456   Results.push_back(Upper);
7457 }
7458 
7459 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
7460   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
7461     // Acquire/Release load/store is not legal for targets without a dmb or
7462     // equivalent available.
7463     return SDValue();
7464 
7465   // Monotonic load/store is legal for all targets.
7466   return Op;
7467 }
7468 
7469 static void ReplaceREADCYCLECOUNTER(SDNode *N,
7470                                     SmallVectorImpl<SDValue> &Results,
7471                                     SelectionDAG &DAG,
7472                                     const ARMSubtarget *Subtarget) {
7473   SDLoc DL(N);
7474   // Under Power Management extensions, the cycle-count is:
7475   //    mrc p15, #0, <Rt>, c9, c13, #0
7476   SDValue Ops[] = { N->getOperand(0), // Chain
7477                     DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
7478                     DAG.getConstant(15, DL, MVT::i32),
7479                     DAG.getConstant(0, DL, MVT::i32),
7480                     DAG.getConstant(9, DL, MVT::i32),
7481                     DAG.getConstant(13, DL, MVT::i32),
7482                     DAG.getConstant(0, DL, MVT::i32)
7483   };
7484 
7485   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
7486                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
7487   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
7488                                 DAG.getConstant(0, DL, MVT::i32)));
7489   Results.push_back(Cycles32.getValue(1));
7490 }
7491 
7492 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
7493   SDLoc dl(V.getNode());
7494   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
7495   SDValue VHi = DAG.getAnyExtOrTrunc(
7496       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
7497       dl, MVT::i32);
7498   SDValue RegClass =
7499       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
7500   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
7501   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
7502   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
7503   return SDValue(
7504       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
7505 }
7506 
7507 static void ReplaceCMP_SWAP_64Results(SDNode *N,
7508                                        SmallVectorImpl<SDValue> & Results,
7509                                        SelectionDAG &DAG) {
7510   assert(N->getValueType(0) == MVT::i64 &&
7511          "AtomicCmpSwap on types less than 64 should be legal");
7512   SDValue Ops[] = {N->getOperand(1),
7513                    createGPRPairNode(DAG, N->getOperand(2)),
7514                    createGPRPairNode(DAG, N->getOperand(3)),
7515                    N->getOperand(0)};
7516   SDNode *CmpSwap = DAG.getMachineNode(
7517       ARM::CMP_SWAP_64, SDLoc(N),
7518       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
7519 
7520   MachineFunction &MF = DAG.getMachineFunction();
7521   MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
7522   MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
7523   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
7524 
7525   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
7526                                                SDValue(CmpSwap, 0)));
7527   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
7528                                                SDValue(CmpSwap, 0)));
7529   Results.push_back(SDValue(CmpSwap, 2));
7530 }
7531 
7532 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
7533                           SelectionDAG &DAG) {
7534   const auto &TLI = DAG.getTargetLoweringInfo();
7535 
7536   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
7537          "Custom lowering is MSVCRT specific!");
7538 
7539   SDLoc dl(Op);
7540   SDValue Val = Op.getOperand(0);
7541   MVT Ty = Val->getSimpleValueType(0);
7542   SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
7543   SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
7544                                          TLI.getPointerTy(DAG.getDataLayout()));
7545 
7546   TargetLowering::ArgListTy Args;
7547   TargetLowering::ArgListEntry Entry;
7548 
7549   Entry.Node = Val;
7550   Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
7551   Entry.isZExt = true;
7552   Args.push_back(Entry);
7553 
7554   Entry.Node = Exponent;
7555   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
7556   Entry.isZExt = true;
7557   Args.push_back(Entry);
7558 
7559   Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
7560 
7561   // In the in-chain to the call is the entry node  If we are emitting a
7562   // tailcall, the chain will be mutated if the node has a non-entry input
7563   // chain.
7564   SDValue InChain = DAG.getEntryNode();
7565   SDValue TCChain = InChain;
7566 
7567   const auto *F = DAG.getMachineFunction().getFunction();
7568   bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
7569               F->getReturnType() == LCRTy;
7570   if (IsTC)
7571     InChain = TCChain;
7572 
7573   TargetLowering::CallLoweringInfo CLI(DAG);
7574   CLI.setDebugLoc(dl)
7575       .setChain(InChain)
7576       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
7577       .setTailCall(IsTC);
7578   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
7579 
7580   // Return the chain (the DAG root) if it is a tail call
7581   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
7582 }
7583 
7584 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7585   switch (Op.getOpcode()) {
7586   default: llvm_unreachable("Don't know how to custom lower this!");
7587   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
7588   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
7589   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
7590   case ISD::GlobalAddress:
7591     switch (Subtarget->getTargetTriple().getObjectFormat()) {
7592     default: llvm_unreachable("unknown object format");
7593     case Triple::COFF:
7594       return LowerGlobalAddressWindows(Op, DAG);
7595     case Triple::ELF:
7596       return LowerGlobalAddressELF(Op, DAG);
7597     case Triple::MachO:
7598       return LowerGlobalAddressDarwin(Op, DAG);
7599     }
7600   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
7601   case ISD::SELECT:        return LowerSELECT(Op, DAG);
7602   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
7603   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
7604   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
7605   case ISD::VASTART:       return LowerVASTART(Op, DAG);
7606   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
7607   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
7608   case ISD::SINT_TO_FP:
7609   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
7610   case ISD::FP_TO_SINT:
7611   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
7612   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
7613   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
7614   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
7615   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
7616   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
7617   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
7618   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
7619                                                                Subtarget);
7620   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
7621   case ISD::SHL:
7622   case ISD::SRL:
7623   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
7624   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
7625   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
7626   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
7627   case ISD::SRL_PARTS:
7628   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
7629   case ISD::CTTZ:
7630   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
7631   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
7632   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
7633   case ISD::SETCCE:        return LowerSETCCE(Op, DAG);
7634   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
7635   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
7636   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
7637   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
7638   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7639   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
7640   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
7641   case ISD::MUL:           return LowerMUL(Op, DAG);
7642   case ISD::SDIV:
7643     if (Subtarget->isTargetWindows())
7644       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
7645     return LowerSDIV(Op, DAG);
7646   case ISD::UDIV:
7647     if (Subtarget->isTargetWindows())
7648       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
7649     return LowerUDIV(Op, DAG);
7650   case ISD::ADDC:
7651   case ISD::ADDE:
7652   case ISD::SUBC:
7653   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
7654   case ISD::SADDO:
7655   case ISD::UADDO:
7656   case ISD::SSUBO:
7657   case ISD::USUBO:
7658     return LowerXALUO(Op, DAG);
7659   case ISD::ATOMIC_LOAD:
7660   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
7661   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
7662   case ISD::SDIVREM:
7663   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
7664   case ISD::DYNAMIC_STACKALLOC:
7665     if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
7666       return LowerDYNAMIC_STACKALLOC(Op, DAG);
7667     llvm_unreachable("Don't know how to custom lower this!");
7668   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
7669   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
7670   case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
7671   case ARMISD::WIN__DBZCHK: return SDValue();
7672   }
7673 }
7674 
7675 /// ReplaceNodeResults - Replace the results of node with an illegal result
7676 /// type with new values built out of custom code.
7677 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
7678                                            SmallVectorImpl<SDValue> &Results,
7679                                            SelectionDAG &DAG) const {
7680   SDValue Res;
7681   switch (N->getOpcode()) {
7682   default:
7683     llvm_unreachable("Don't know how to custom expand this!");
7684   case ISD::READ_REGISTER:
7685     ExpandREAD_REGISTER(N, Results, DAG);
7686     break;
7687   case ISD::BITCAST:
7688     Res = ExpandBITCAST(N, DAG);
7689     break;
7690   case ISD::SRL:
7691   case ISD::SRA:
7692     Res = Expand64BitShift(N, DAG, Subtarget);
7693     break;
7694   case ISD::SREM:
7695   case ISD::UREM:
7696     Res = LowerREM(N, DAG);
7697     break;
7698   case ISD::SDIVREM:
7699   case ISD::UDIVREM:
7700     Res = LowerDivRem(SDValue(N, 0), DAG);
7701     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
7702     Results.push_back(Res.getValue(0));
7703     Results.push_back(Res.getValue(1));
7704     return;
7705   case ISD::READCYCLECOUNTER:
7706     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
7707     return;
7708   case ISD::UDIV:
7709   case ISD::SDIV:
7710     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
7711     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
7712                              Results);
7713   case ISD::ATOMIC_CMP_SWAP:
7714     ReplaceCMP_SWAP_64Results(N, Results, DAG);
7715     return;
7716   }
7717   if (Res.getNode())
7718     Results.push_back(Res);
7719 }
7720 
7721 //===----------------------------------------------------------------------===//
7722 //                           ARM Scheduler Hooks
7723 //===----------------------------------------------------------------------===//
7724 
7725 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
7726 /// registers the function context.
7727 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
7728                                                MachineBasicBlock *MBB,
7729                                                MachineBasicBlock *DispatchBB,
7730                                                int FI) const {
7731   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
7732          "ROPI/RWPI not currently supported with SjLj");
7733   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7734   DebugLoc dl = MI.getDebugLoc();
7735   MachineFunction *MF = MBB->getParent();
7736   MachineRegisterInfo *MRI = &MF->getRegInfo();
7737   MachineConstantPool *MCP = MF->getConstantPool();
7738   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
7739   const Function *F = MF->getFunction();
7740 
7741   bool isThumb = Subtarget->isThumb();
7742   bool isThumb2 = Subtarget->isThumb2();
7743 
7744   unsigned PCLabelId = AFI->createPICLabelUId();
7745   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
7746   ARMConstantPoolValue *CPV =
7747     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
7748   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
7749 
7750   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
7751                                            : &ARM::GPRRegClass;
7752 
7753   // Grab constant pool and fixed stack memory operands.
7754   MachineMemOperand *CPMMO =
7755       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
7756                                MachineMemOperand::MOLoad, 4, 4);
7757 
7758   MachineMemOperand *FIMMOSt =
7759       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
7760                                MachineMemOperand::MOStore, 4, 4);
7761 
7762   // Load the address of the dispatch MBB into the jump buffer.
7763   if (isThumb2) {
7764     // Incoming value: jbuf
7765     //   ldr.n  r5, LCPI1_1
7766     //   orr    r5, r5, #1
7767     //   add    r5, pc
7768     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
7769     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7770     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
7771                    .addConstantPoolIndex(CPI)
7772                    .addMemOperand(CPMMO));
7773     // Set the low bit because of thumb mode.
7774     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7775     AddDefaultCC(
7776       AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
7777                      .addReg(NewVReg1, RegState::Kill)
7778                      .addImm(0x01)));
7779     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7780     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
7781       .addReg(NewVReg2, RegState::Kill)
7782       .addImm(PCLabelId);
7783     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
7784                    .addReg(NewVReg3, RegState::Kill)
7785                    .addFrameIndex(FI)
7786                    .addImm(36)  // &jbuf[1] :: pc
7787                    .addMemOperand(FIMMOSt));
7788   } else if (isThumb) {
7789     // Incoming value: jbuf
7790     //   ldr.n  r1, LCPI1_4
7791     //   add    r1, pc
7792     //   mov    r2, #1
7793     //   orrs   r1, r2
7794     //   add    r2, $jbuf, #+4 ; &jbuf[1]
7795     //   str    r1, [r2]
7796     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7797     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
7798                    .addConstantPoolIndex(CPI)
7799                    .addMemOperand(CPMMO));
7800     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7801     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
7802       .addReg(NewVReg1, RegState::Kill)
7803       .addImm(PCLabelId);
7804     // Set the low bit because of thumb mode.
7805     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7806     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
7807                    .addReg(ARM::CPSR, RegState::Define)
7808                    .addImm(1));
7809     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7810     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
7811                    .addReg(ARM::CPSR, RegState::Define)
7812                    .addReg(NewVReg2, RegState::Kill)
7813                    .addReg(NewVReg3, RegState::Kill));
7814     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7815     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
7816             .addFrameIndex(FI)
7817             .addImm(36); // &jbuf[1] :: pc
7818     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
7819                    .addReg(NewVReg4, RegState::Kill)
7820                    .addReg(NewVReg5, RegState::Kill)
7821                    .addImm(0)
7822                    .addMemOperand(FIMMOSt));
7823   } else {
7824     // Incoming value: jbuf
7825     //   ldr  r1, LCPI1_1
7826     //   add  r1, pc, r1
7827     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
7828     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7829     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
7830                    .addConstantPoolIndex(CPI)
7831                    .addImm(0)
7832                    .addMemOperand(CPMMO));
7833     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7834     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
7835                    .addReg(NewVReg1, RegState::Kill)
7836                    .addImm(PCLabelId));
7837     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
7838                    .addReg(NewVReg2, RegState::Kill)
7839                    .addFrameIndex(FI)
7840                    .addImm(36)  // &jbuf[1] :: pc
7841                    .addMemOperand(FIMMOSt));
7842   }
7843 }
7844 
7845 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
7846                                               MachineBasicBlock *MBB) const {
7847   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7848   DebugLoc dl = MI.getDebugLoc();
7849   MachineFunction *MF = MBB->getParent();
7850   MachineRegisterInfo *MRI = &MF->getRegInfo();
7851   MachineFrameInfo &MFI = MF->getFrameInfo();
7852   int FI = MFI.getFunctionContextIndex();
7853 
7854   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
7855                                                         : &ARM::GPRnopcRegClass;
7856 
7857   // Get a mapping of the call site numbers to all of the landing pads they're
7858   // associated with.
7859   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
7860   unsigned MaxCSNum = 0;
7861   MachineModuleInfo &MMI = MF->getMMI();
7862   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
7863        ++BB) {
7864     if (!BB->isEHPad()) continue;
7865 
7866     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
7867     // pad.
7868     for (MachineBasicBlock::iterator
7869            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
7870       if (!II->isEHLabel()) continue;
7871 
7872       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
7873       if (!MMI.hasCallSiteLandingPad(Sym)) continue;
7874 
7875       SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
7876       for (SmallVectorImpl<unsigned>::iterator
7877              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
7878            CSI != CSE; ++CSI) {
7879         CallSiteNumToLPad[*CSI].push_back(&*BB);
7880         MaxCSNum = std::max(MaxCSNum, *CSI);
7881       }
7882       break;
7883     }
7884   }
7885 
7886   // Get an ordered list of the machine basic blocks for the jump table.
7887   std::vector<MachineBasicBlock*> LPadList;
7888   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
7889   LPadList.reserve(CallSiteNumToLPad.size());
7890   for (unsigned I = 1; I <= MaxCSNum; ++I) {
7891     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
7892     for (SmallVectorImpl<MachineBasicBlock*>::iterator
7893            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
7894       LPadList.push_back(*II);
7895       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
7896     }
7897   }
7898 
7899   assert(!LPadList.empty() &&
7900          "No landing pad destinations for the dispatch jump table!");
7901 
7902   // Create the jump table and associated information.
7903   MachineJumpTableInfo *JTI =
7904     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
7905   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
7906 
7907   // Create the MBBs for the dispatch code.
7908 
7909   // Shove the dispatch's address into the return slot in the function context.
7910   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
7911   DispatchBB->setIsEHPad();
7912 
7913   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7914   unsigned trap_opcode;
7915   if (Subtarget->isThumb())
7916     trap_opcode = ARM::tTRAP;
7917   else
7918     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
7919 
7920   BuildMI(TrapBB, dl, TII->get(trap_opcode));
7921   DispatchBB->addSuccessor(TrapBB);
7922 
7923   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
7924   DispatchBB->addSuccessor(DispContBB);
7925 
7926   // Insert and MBBs.
7927   MF->insert(MF->end(), DispatchBB);
7928   MF->insert(MF->end(), DispContBB);
7929   MF->insert(MF->end(), TrapBB);
7930 
7931   // Insert code into the entry block that creates and registers the function
7932   // context.
7933   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
7934 
7935   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
7936       MachinePointerInfo::getFixedStack(*MF, FI),
7937       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
7938 
7939   MachineInstrBuilder MIB;
7940   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
7941 
7942   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
7943   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
7944 
7945   // Add a register mask with no preserved registers.  This results in all
7946   // registers being marked as clobbered. This can't work if the dispatch block
7947   // is in a Thumb1 function and is linked with ARM code which uses the FP
7948   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
7949   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
7950 
7951   bool IsPositionIndependent = isPositionIndependent();
7952   unsigned NumLPads = LPadList.size();
7953   if (Subtarget->isThumb2()) {
7954     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7955     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
7956                    .addFrameIndex(FI)
7957                    .addImm(4)
7958                    .addMemOperand(FIMMOLd));
7959 
7960     if (NumLPads < 256) {
7961       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
7962                      .addReg(NewVReg1)
7963                      .addImm(LPadList.size()));
7964     } else {
7965       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7966       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
7967                      .addImm(NumLPads & 0xFFFF));
7968 
7969       unsigned VReg2 = VReg1;
7970       if ((NumLPads & 0xFFFF0000) != 0) {
7971         VReg2 = MRI->createVirtualRegister(TRC);
7972         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
7973                        .addReg(VReg1)
7974                        .addImm(NumLPads >> 16));
7975       }
7976 
7977       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
7978                      .addReg(NewVReg1)
7979                      .addReg(VReg2));
7980     }
7981 
7982     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
7983       .addMBB(TrapBB)
7984       .addImm(ARMCC::HI)
7985       .addReg(ARM::CPSR);
7986 
7987     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7988     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
7989                    .addJumpTableIndex(MJTI));
7990 
7991     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7992     AddDefaultCC(
7993       AddDefaultPred(
7994         BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
7995         .addReg(NewVReg3, RegState::Kill)
7996         .addReg(NewVReg1)
7997         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
7998 
7999     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
8000       .addReg(NewVReg4, RegState::Kill)
8001       .addReg(NewVReg1)
8002       .addJumpTableIndex(MJTI);
8003   } else if (Subtarget->isThumb()) {
8004     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
8005     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
8006                    .addFrameIndex(FI)
8007                    .addImm(1)
8008                    .addMemOperand(FIMMOLd));
8009 
8010     if (NumLPads < 256) {
8011       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
8012                      .addReg(NewVReg1)
8013                      .addImm(NumLPads));
8014     } else {
8015       MachineConstantPool *ConstantPool = MF->getConstantPool();
8016       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8017       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
8018 
8019       // MachineConstantPool wants an explicit alignment.
8020       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8021       if (Align == 0)
8022         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8023       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8024 
8025       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8026       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
8027                      .addReg(VReg1, RegState::Define)
8028                      .addConstantPoolIndex(Idx));
8029       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
8030                      .addReg(NewVReg1)
8031                      .addReg(VReg1));
8032     }
8033 
8034     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
8035       .addMBB(TrapBB)
8036       .addImm(ARMCC::HI)
8037       .addReg(ARM::CPSR);
8038 
8039     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
8040     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
8041                    .addReg(ARM::CPSR, RegState::Define)
8042                    .addReg(NewVReg1)
8043                    .addImm(2));
8044 
8045     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
8046     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
8047                    .addJumpTableIndex(MJTI));
8048 
8049     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
8050     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
8051                    .addReg(ARM::CPSR, RegState::Define)
8052                    .addReg(NewVReg2, RegState::Kill)
8053                    .addReg(NewVReg3));
8054 
8055     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
8056         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
8057 
8058     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
8059     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
8060                    .addReg(NewVReg4, RegState::Kill)
8061                    .addImm(0)
8062                    .addMemOperand(JTMMOLd));
8063 
8064     unsigned NewVReg6 = NewVReg5;
8065     if (IsPositionIndependent) {
8066       NewVReg6 = MRI->createVirtualRegister(TRC);
8067       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
8068                      .addReg(ARM::CPSR, RegState::Define)
8069                      .addReg(NewVReg5, RegState::Kill)
8070                      .addReg(NewVReg3));
8071     }
8072 
8073     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
8074       .addReg(NewVReg6, RegState::Kill)
8075       .addJumpTableIndex(MJTI);
8076   } else {
8077     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
8078     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
8079                    .addFrameIndex(FI)
8080                    .addImm(4)
8081                    .addMemOperand(FIMMOLd));
8082 
8083     if (NumLPads < 256) {
8084       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
8085                      .addReg(NewVReg1)
8086                      .addImm(NumLPads));
8087     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
8088       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8089       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
8090                      .addImm(NumLPads & 0xFFFF));
8091 
8092       unsigned VReg2 = VReg1;
8093       if ((NumLPads & 0xFFFF0000) != 0) {
8094         VReg2 = MRI->createVirtualRegister(TRC);
8095         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
8096                        .addReg(VReg1)
8097                        .addImm(NumLPads >> 16));
8098       }
8099 
8100       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8101                      .addReg(NewVReg1)
8102                      .addReg(VReg2));
8103     } else {
8104       MachineConstantPool *ConstantPool = MF->getConstantPool();
8105       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8106       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
8107 
8108       // MachineConstantPool wants an explicit alignment.
8109       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8110       if (Align == 0)
8111         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8112       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8113 
8114       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8115       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
8116                      .addReg(VReg1, RegState::Define)
8117                      .addConstantPoolIndex(Idx)
8118                      .addImm(0));
8119       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8120                      .addReg(NewVReg1)
8121                      .addReg(VReg1, RegState::Kill));
8122     }
8123 
8124     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
8125       .addMBB(TrapBB)
8126       .addImm(ARMCC::HI)
8127       .addReg(ARM::CPSR);
8128 
8129     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
8130     AddDefaultCC(
8131       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
8132                      .addReg(NewVReg1)
8133                      .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
8134     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
8135     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
8136                    .addJumpTableIndex(MJTI));
8137 
8138     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
8139         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
8140     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
8141     AddDefaultPred(
8142       BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
8143       .addReg(NewVReg3, RegState::Kill)
8144       .addReg(NewVReg4)
8145       .addImm(0)
8146       .addMemOperand(JTMMOLd));
8147 
8148     if (IsPositionIndependent) {
8149       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
8150         .addReg(NewVReg5, RegState::Kill)
8151         .addReg(NewVReg4)
8152         .addJumpTableIndex(MJTI);
8153     } else {
8154       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
8155         .addReg(NewVReg5, RegState::Kill)
8156         .addJumpTableIndex(MJTI);
8157     }
8158   }
8159 
8160   // Add the jump table entries as successors to the MBB.
8161   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
8162   for (std::vector<MachineBasicBlock*>::iterator
8163          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
8164     MachineBasicBlock *CurMBB = *I;
8165     if (SeenMBBs.insert(CurMBB).second)
8166       DispContBB->addSuccessor(CurMBB);
8167   }
8168 
8169   // N.B. the order the invoke BBs are processed in doesn't matter here.
8170   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
8171   SmallVector<MachineBasicBlock*, 64> MBBLPads;
8172   for (MachineBasicBlock *BB : InvokeBBs) {
8173 
8174     // Remove the landing pad successor from the invoke block and replace it
8175     // with the new dispatch block.
8176     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
8177                                                   BB->succ_end());
8178     while (!Successors.empty()) {
8179       MachineBasicBlock *SMBB = Successors.pop_back_val();
8180       if (SMBB->isEHPad()) {
8181         BB->removeSuccessor(SMBB);
8182         MBBLPads.push_back(SMBB);
8183       }
8184     }
8185 
8186     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
8187     BB->normalizeSuccProbs();
8188 
8189     // Find the invoke call and mark all of the callee-saved registers as
8190     // 'implicit defined' so that they're spilled. This prevents code from
8191     // moving instructions to before the EH block, where they will never be
8192     // executed.
8193     for (MachineBasicBlock::reverse_iterator
8194            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
8195       if (!II->isCall()) continue;
8196 
8197       DenseMap<unsigned, bool> DefRegs;
8198       for (MachineInstr::mop_iterator
8199              OI = II->operands_begin(), OE = II->operands_end();
8200            OI != OE; ++OI) {
8201         if (!OI->isReg()) continue;
8202         DefRegs[OI->getReg()] = true;
8203       }
8204 
8205       MachineInstrBuilder MIB(*MF, &*II);
8206 
8207       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
8208         unsigned Reg = SavedRegs[i];
8209         if (Subtarget->isThumb2() &&
8210             !ARM::tGPRRegClass.contains(Reg) &&
8211             !ARM::hGPRRegClass.contains(Reg))
8212           continue;
8213         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
8214           continue;
8215         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
8216           continue;
8217         if (!DefRegs[Reg])
8218           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
8219       }
8220 
8221       break;
8222     }
8223   }
8224 
8225   // Mark all former landing pads as non-landing pads. The dispatch is the only
8226   // landing pad now.
8227   for (SmallVectorImpl<MachineBasicBlock*>::iterator
8228          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
8229     (*I)->setIsEHPad(false);
8230 
8231   // The instruction is gone now.
8232   MI.eraseFromParent();
8233 }
8234 
8235 static
8236 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
8237   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
8238        E = MBB->succ_end(); I != E; ++I)
8239     if (*I != Succ)
8240       return *I;
8241   llvm_unreachable("Expecting a BB with two successors!");
8242 }
8243 
8244 /// Return the load opcode for a given load size. If load size >= 8,
8245 /// neon opcode will be returned.
8246 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
8247   if (LdSize >= 8)
8248     return LdSize == 16 ? ARM::VLD1q32wb_fixed
8249                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
8250   if (IsThumb1)
8251     return LdSize == 4 ? ARM::tLDRi
8252                        : LdSize == 2 ? ARM::tLDRHi
8253                                      : LdSize == 1 ? ARM::tLDRBi : 0;
8254   if (IsThumb2)
8255     return LdSize == 4 ? ARM::t2LDR_POST
8256                        : LdSize == 2 ? ARM::t2LDRH_POST
8257                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
8258   return LdSize == 4 ? ARM::LDR_POST_IMM
8259                      : LdSize == 2 ? ARM::LDRH_POST
8260                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
8261 }
8262 
8263 /// Return the store opcode for a given store size. If store size >= 8,
8264 /// neon opcode will be returned.
8265 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
8266   if (StSize >= 8)
8267     return StSize == 16 ? ARM::VST1q32wb_fixed
8268                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
8269   if (IsThumb1)
8270     return StSize == 4 ? ARM::tSTRi
8271                        : StSize == 2 ? ARM::tSTRHi
8272                                      : StSize == 1 ? ARM::tSTRBi : 0;
8273   if (IsThumb2)
8274     return StSize == 4 ? ARM::t2STR_POST
8275                        : StSize == 2 ? ARM::t2STRH_POST
8276                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
8277   return StSize == 4 ? ARM::STR_POST_IMM
8278                      : StSize == 2 ? ARM::STRH_POST
8279                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
8280 }
8281 
8282 /// Emit a post-increment load operation with given size. The instructions
8283 /// will be added to BB at Pos.
8284 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8285                        const TargetInstrInfo *TII, const DebugLoc &dl,
8286                        unsigned LdSize, unsigned Data, unsigned AddrIn,
8287                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8288   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
8289   assert(LdOpc != 0 && "Should have a load opcode");
8290   if (LdSize >= 8) {
8291     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8292                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8293                        .addImm(0));
8294   } else if (IsThumb1) {
8295     // load + update AddrIn
8296     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8297                        .addReg(AddrIn).addImm(0));
8298     MachineInstrBuilder MIB =
8299         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
8300     MIB = AddDefaultT1CC(MIB);
8301     MIB.addReg(AddrIn).addImm(LdSize);
8302     AddDefaultPred(MIB);
8303   } else if (IsThumb2) {
8304     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8305                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8306                        .addImm(LdSize));
8307   } else { // arm
8308     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8309                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8310                        .addReg(0).addImm(LdSize));
8311   }
8312 }
8313 
8314 /// Emit a post-increment store operation with given size. The instructions
8315 /// will be added to BB at Pos.
8316 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8317                        const TargetInstrInfo *TII, const DebugLoc &dl,
8318                        unsigned StSize, unsigned Data, unsigned AddrIn,
8319                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8320   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
8321   assert(StOpc != 0 && "Should have a store opcode");
8322   if (StSize >= 8) {
8323     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8324                        .addReg(AddrIn).addImm(0).addReg(Data));
8325   } else if (IsThumb1) {
8326     // store + update AddrIn
8327     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
8328                        .addReg(AddrIn).addImm(0));
8329     MachineInstrBuilder MIB =
8330         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
8331     MIB = AddDefaultT1CC(MIB);
8332     MIB.addReg(AddrIn).addImm(StSize);
8333     AddDefaultPred(MIB);
8334   } else if (IsThumb2) {
8335     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8336                        .addReg(Data).addReg(AddrIn).addImm(StSize));
8337   } else { // arm
8338     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8339                        .addReg(Data).addReg(AddrIn).addReg(0)
8340                        .addImm(StSize));
8341   }
8342 }
8343 
8344 MachineBasicBlock *
8345 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
8346                                    MachineBasicBlock *BB) const {
8347   // This pseudo instruction has 3 operands: dst, src, size
8348   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
8349   // Otherwise, we will generate unrolled scalar copies.
8350   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8351   const BasicBlock *LLVM_BB = BB->getBasicBlock();
8352   MachineFunction::iterator It = ++BB->getIterator();
8353 
8354   unsigned dest = MI.getOperand(0).getReg();
8355   unsigned src = MI.getOperand(1).getReg();
8356   unsigned SizeVal = MI.getOperand(2).getImm();
8357   unsigned Align = MI.getOperand(3).getImm();
8358   DebugLoc dl = MI.getDebugLoc();
8359 
8360   MachineFunction *MF = BB->getParent();
8361   MachineRegisterInfo &MRI = MF->getRegInfo();
8362   unsigned UnitSize = 0;
8363   const TargetRegisterClass *TRC = nullptr;
8364   const TargetRegisterClass *VecTRC = nullptr;
8365 
8366   bool IsThumb1 = Subtarget->isThumb1Only();
8367   bool IsThumb2 = Subtarget->isThumb2();
8368   bool IsThumb = Subtarget->isThumb();
8369 
8370   if (Align & 1) {
8371     UnitSize = 1;
8372   } else if (Align & 2) {
8373     UnitSize = 2;
8374   } else {
8375     // Check whether we can use NEON instructions.
8376     if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
8377         Subtarget->hasNEON()) {
8378       if ((Align % 16 == 0) && SizeVal >= 16)
8379         UnitSize = 16;
8380       else if ((Align % 8 == 0) && SizeVal >= 8)
8381         UnitSize = 8;
8382     }
8383     // Can't use NEON instructions.
8384     if (UnitSize == 0)
8385       UnitSize = 4;
8386   }
8387 
8388   // Select the correct opcode and register class for unit size load/store
8389   bool IsNeon = UnitSize >= 8;
8390   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
8391   if (IsNeon)
8392     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
8393                             : UnitSize == 8 ? &ARM::DPRRegClass
8394                                             : nullptr;
8395 
8396   unsigned BytesLeft = SizeVal % UnitSize;
8397   unsigned LoopSize = SizeVal - BytesLeft;
8398 
8399   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
8400     // Use LDR and STR to copy.
8401     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
8402     // [destOut] = STR_POST(scratch, destIn, UnitSize)
8403     unsigned srcIn = src;
8404     unsigned destIn = dest;
8405     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
8406       unsigned srcOut = MRI.createVirtualRegister(TRC);
8407       unsigned destOut = MRI.createVirtualRegister(TRC);
8408       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8409       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
8410                  IsThumb1, IsThumb2);
8411       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
8412                  IsThumb1, IsThumb2);
8413       srcIn = srcOut;
8414       destIn = destOut;
8415     }
8416 
8417     // Handle the leftover bytes with LDRB and STRB.
8418     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
8419     // [destOut] = STRB_POST(scratch, destIn, 1)
8420     for (unsigned i = 0; i < BytesLeft; i++) {
8421       unsigned srcOut = MRI.createVirtualRegister(TRC);
8422       unsigned destOut = MRI.createVirtualRegister(TRC);
8423       unsigned scratch = MRI.createVirtualRegister(TRC);
8424       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
8425                  IsThumb1, IsThumb2);
8426       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
8427                  IsThumb1, IsThumb2);
8428       srcIn = srcOut;
8429       destIn = destOut;
8430     }
8431     MI.eraseFromParent(); // The instruction is gone now.
8432     return BB;
8433   }
8434 
8435   // Expand the pseudo op to a loop.
8436   // thisMBB:
8437   //   ...
8438   //   movw varEnd, # --> with thumb2
8439   //   movt varEnd, #
8440   //   ldrcp varEnd, idx --> without thumb2
8441   //   fallthrough --> loopMBB
8442   // loopMBB:
8443   //   PHI varPhi, varEnd, varLoop
8444   //   PHI srcPhi, src, srcLoop
8445   //   PHI destPhi, dst, destLoop
8446   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
8447   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8448   //   subs varLoop, varPhi, #UnitSize
8449   //   bne loopMBB
8450   //   fallthrough --> exitMBB
8451   // exitMBB:
8452   //   epilogue to handle left-over bytes
8453   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8454   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8455   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8456   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8457   MF->insert(It, loopMBB);
8458   MF->insert(It, exitMBB);
8459 
8460   // Transfer the remainder of BB and its successor edges to exitMBB.
8461   exitMBB->splice(exitMBB->begin(), BB,
8462                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
8463   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
8464 
8465   // Load an immediate to varEnd.
8466   unsigned varEnd = MRI.createVirtualRegister(TRC);
8467   if (Subtarget->useMovt(*MF)) {
8468     unsigned Vtmp = varEnd;
8469     if ((LoopSize & 0xFFFF0000) != 0)
8470       Vtmp = MRI.createVirtualRegister(TRC);
8471     AddDefaultPred(BuildMI(BB, dl,
8472                            TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
8473                            Vtmp).addImm(LoopSize & 0xFFFF));
8474 
8475     if ((LoopSize & 0xFFFF0000) != 0)
8476       AddDefaultPred(BuildMI(BB, dl,
8477                              TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
8478                              varEnd)
8479                          .addReg(Vtmp)
8480                          .addImm(LoopSize >> 16));
8481   } else {
8482     MachineConstantPool *ConstantPool = MF->getConstantPool();
8483     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8484     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
8485 
8486     // MachineConstantPool wants an explicit alignment.
8487     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8488     if (Align == 0)
8489       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8490     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8491 
8492     if (IsThumb)
8493       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
8494           varEnd, RegState::Define).addConstantPoolIndex(Idx));
8495     else
8496       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
8497           varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
8498   }
8499   BB->addSuccessor(loopMBB);
8500 
8501   // Generate the loop body:
8502   //   varPhi = PHI(varLoop, varEnd)
8503   //   srcPhi = PHI(srcLoop, src)
8504   //   destPhi = PHI(destLoop, dst)
8505   MachineBasicBlock *entryBB = BB;
8506   BB = loopMBB;
8507   unsigned varLoop = MRI.createVirtualRegister(TRC);
8508   unsigned varPhi = MRI.createVirtualRegister(TRC);
8509   unsigned srcLoop = MRI.createVirtualRegister(TRC);
8510   unsigned srcPhi = MRI.createVirtualRegister(TRC);
8511   unsigned destLoop = MRI.createVirtualRegister(TRC);
8512   unsigned destPhi = MRI.createVirtualRegister(TRC);
8513 
8514   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
8515     .addReg(varLoop).addMBB(loopMBB)
8516     .addReg(varEnd).addMBB(entryBB);
8517   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
8518     .addReg(srcLoop).addMBB(loopMBB)
8519     .addReg(src).addMBB(entryBB);
8520   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
8521     .addReg(destLoop).addMBB(loopMBB)
8522     .addReg(dest).addMBB(entryBB);
8523 
8524   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8526   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8527   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
8528              IsThumb1, IsThumb2);
8529   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
8530              IsThumb1, IsThumb2);
8531 
8532   // Decrement loop variable by UnitSize.
8533   if (IsThumb1) {
8534     MachineInstrBuilder MIB =
8535         BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
8536     MIB = AddDefaultT1CC(MIB);
8537     MIB.addReg(varPhi).addImm(UnitSize);
8538     AddDefaultPred(MIB);
8539   } else {
8540     MachineInstrBuilder MIB =
8541         BuildMI(*BB, BB->end(), dl,
8542                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
8543     AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
8544     MIB->getOperand(5).setReg(ARM::CPSR);
8545     MIB->getOperand(5).setIsDef(true);
8546   }
8547   BuildMI(*BB, BB->end(), dl,
8548           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
8549       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
8550 
8551   // loopMBB can loop back to loopMBB or fall through to exitMBB.
8552   BB->addSuccessor(loopMBB);
8553   BB->addSuccessor(exitMBB);
8554 
8555   // Add epilogue to handle BytesLeft.
8556   BB = exitMBB;
8557   auto StartOfExit = exitMBB->begin();
8558 
8559   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8560   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8561   unsigned srcIn = srcLoop;
8562   unsigned destIn = destLoop;
8563   for (unsigned i = 0; i < BytesLeft; i++) {
8564     unsigned srcOut = MRI.createVirtualRegister(TRC);
8565     unsigned destOut = MRI.createVirtualRegister(TRC);
8566     unsigned scratch = MRI.createVirtualRegister(TRC);
8567     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
8568                IsThumb1, IsThumb2);
8569     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
8570                IsThumb1, IsThumb2);
8571     srcIn = srcOut;
8572     destIn = destOut;
8573   }
8574 
8575   MI.eraseFromParent(); // The instruction is gone now.
8576   return BB;
8577 }
8578 
8579 MachineBasicBlock *
8580 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
8581                                        MachineBasicBlock *MBB) const {
8582   const TargetMachine &TM = getTargetMachine();
8583   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
8584   DebugLoc DL = MI.getDebugLoc();
8585 
8586   assert(Subtarget->isTargetWindows() &&
8587          "__chkstk is only supported on Windows");
8588   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
8589 
8590   // __chkstk takes the number of words to allocate on the stack in R4, and
8591   // returns the stack adjustment in number of bytes in R4.  This will not
8592   // clober any other registers (other than the obvious lr).
8593   //
8594   // Although, technically, IP should be considered a register which may be
8595   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
8596   // thumb-2 environment, so there is no interworking required.  As a result, we
8597   // do not expect a veneer to be emitted by the linker, clobbering IP.
8598   //
8599   // Each module receives its own copy of __chkstk, so no import thunk is
8600   // required, again, ensuring that IP is not clobbered.
8601   //
8602   // Finally, although some linkers may theoretically provide a trampoline for
8603   // out of range calls (which is quite common due to a 32M range limitation of
8604   // branches for Thumb), we can generate the long-call version via
8605   // -mcmodel=large, alleviating the need for the trampoline which may clobber
8606   // IP.
8607 
8608   switch (TM.getCodeModel()) {
8609   case CodeModel::Small:
8610   case CodeModel::Medium:
8611   case CodeModel::Default:
8612   case CodeModel::Kernel:
8613     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
8614       .addImm((unsigned)ARMCC::AL).addReg(0)
8615       .addExternalSymbol("__chkstk")
8616       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8617       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8618       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
8619     break;
8620   case CodeModel::Large:
8621   case CodeModel::JITDefault: {
8622     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
8623     unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
8624 
8625     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
8626       .addExternalSymbol("__chkstk");
8627     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
8628       .addImm((unsigned)ARMCC::AL).addReg(0)
8629       .addReg(Reg, RegState::Kill)
8630       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8631       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8632       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
8633     break;
8634   }
8635   }
8636 
8637   AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
8638                                       ARM::SP)
8639                          .addReg(ARM::SP, RegState::Kill)
8640                          .addReg(ARM::R4, RegState::Kill)
8641                          .setMIFlags(MachineInstr::FrameSetup)));
8642 
8643   MI.eraseFromParent();
8644   return MBB;
8645 }
8646 
8647 MachineBasicBlock *
8648 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
8649                                        MachineBasicBlock *MBB) const {
8650   DebugLoc DL = MI.getDebugLoc();
8651   MachineFunction *MF = MBB->getParent();
8652   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8653 
8654   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
8655   MF->insert(++MBB->getIterator(), ContBB);
8656   ContBB->splice(ContBB->begin(), MBB,
8657                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
8658   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
8659   MBB->addSuccessor(ContBB);
8660 
8661   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
8662   BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
8663   MF->push_back(TrapBB);
8664   MBB->addSuccessor(TrapBB);
8665 
8666   AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
8667                      .addReg(MI.getOperand(0).getReg())
8668                      .addImm(0));
8669   BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
8670       .addMBB(TrapBB)
8671       .addImm(ARMCC::EQ)
8672       .addReg(ARM::CPSR);
8673 
8674   MI.eraseFromParent();
8675   return ContBB;
8676 }
8677 
8678 MachineBasicBlock *
8679 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
8680                                                MachineBasicBlock *BB) const {
8681   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8682   DebugLoc dl = MI.getDebugLoc();
8683   bool isThumb2 = Subtarget->isThumb2();
8684   switch (MI.getOpcode()) {
8685   default: {
8686     MI.dump();
8687     llvm_unreachable("Unexpected instr type to insert");
8688   }
8689 
8690   // Thumb1 post-indexed loads are really just single-register LDMs.
8691   case ARM::tLDR_postidx: {
8692     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
8693       .addOperand(MI.getOperand(1)) // Rn_wb
8694       .addOperand(MI.getOperand(2)) // Rn
8695       .addOperand(MI.getOperand(3)) // PredImm
8696       .addOperand(MI.getOperand(4)) // PredReg
8697       .addOperand(MI.getOperand(0)); // Rt
8698     MI.eraseFromParent();
8699     return BB;
8700   }
8701 
8702   // The Thumb2 pre-indexed stores have the same MI operands, they just
8703   // define them differently in the .td files from the isel patterns, so
8704   // they need pseudos.
8705   case ARM::t2STR_preidx:
8706     MI.setDesc(TII->get(ARM::t2STR_PRE));
8707     return BB;
8708   case ARM::t2STRB_preidx:
8709     MI.setDesc(TII->get(ARM::t2STRB_PRE));
8710     return BB;
8711   case ARM::t2STRH_preidx:
8712     MI.setDesc(TII->get(ARM::t2STRH_PRE));
8713     return BB;
8714 
8715   case ARM::STRi_preidx:
8716   case ARM::STRBi_preidx: {
8717     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
8718                                                          : ARM::STRB_PRE_IMM;
8719     // Decode the offset.
8720     unsigned Offset = MI.getOperand(4).getImm();
8721     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
8722     Offset = ARM_AM::getAM2Offset(Offset);
8723     if (isSub)
8724       Offset = -Offset;
8725 
8726     MachineMemOperand *MMO = *MI.memoperands_begin();
8727     BuildMI(*BB, MI, dl, TII->get(NewOpc))
8728         .addOperand(MI.getOperand(0)) // Rn_wb
8729         .addOperand(MI.getOperand(1)) // Rt
8730         .addOperand(MI.getOperand(2)) // Rn
8731         .addImm(Offset)               // offset (skip GPR==zero_reg)
8732         .addOperand(MI.getOperand(5)) // pred
8733         .addOperand(MI.getOperand(6))
8734         .addMemOperand(MMO);
8735     MI.eraseFromParent();
8736     return BB;
8737   }
8738   case ARM::STRr_preidx:
8739   case ARM::STRBr_preidx:
8740   case ARM::STRH_preidx: {
8741     unsigned NewOpc;
8742     switch (MI.getOpcode()) {
8743     default: llvm_unreachable("unexpected opcode!");
8744     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
8745     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
8746     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
8747     }
8748     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
8749     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
8750       MIB.addOperand(MI.getOperand(i));
8751     MI.eraseFromParent();
8752     return BB;
8753   }
8754 
8755   case ARM::tMOVCCr_pseudo: {
8756     // To "insert" a SELECT_CC instruction, we actually have to insert the
8757     // diamond control-flow pattern.  The incoming instruction knows the
8758     // destination vreg to set, the condition code register to branch on, the
8759     // true/false values to select between, and a branch opcode to use.
8760     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8761     MachineFunction::iterator It = ++BB->getIterator();
8762 
8763     //  thisMBB:
8764     //  ...
8765     //   TrueVal = ...
8766     //   cmpTY ccX, r1, r2
8767     //   bCC copy1MBB
8768     //   fallthrough --> copy0MBB
8769     MachineBasicBlock *thisMBB  = BB;
8770     MachineFunction *F = BB->getParent();
8771     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8772     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
8773     F->insert(It, copy0MBB);
8774     F->insert(It, sinkMBB);
8775 
8776     // Transfer the remainder of BB and its successor edges to sinkMBB.
8777     sinkMBB->splice(sinkMBB->begin(), BB,
8778                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
8779     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
8780 
8781     BB->addSuccessor(copy0MBB);
8782     BB->addSuccessor(sinkMBB);
8783 
8784     BuildMI(BB, dl, TII->get(ARM::tBcc))
8785         .addMBB(sinkMBB)
8786         .addImm(MI.getOperand(3).getImm())
8787         .addReg(MI.getOperand(4).getReg());
8788 
8789     //  copy0MBB:
8790     //   %FalseValue = ...
8791     //   # fallthrough to sinkMBB
8792     BB = copy0MBB;
8793 
8794     // Update machine-CFG edges
8795     BB->addSuccessor(sinkMBB);
8796 
8797     //  sinkMBB:
8798     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8799     //  ...
8800     BB = sinkMBB;
8801     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
8802         .addReg(MI.getOperand(1).getReg())
8803         .addMBB(copy0MBB)
8804         .addReg(MI.getOperand(2).getReg())
8805         .addMBB(thisMBB);
8806 
8807     MI.eraseFromParent(); // The pseudo instruction is gone now.
8808     return BB;
8809   }
8810 
8811   case ARM::BCCi64:
8812   case ARM::BCCZi64: {
8813     // If there is an unconditional branch to the other successor, remove it.
8814     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
8815 
8816     // Compare both parts that make up the double comparison separately for
8817     // equality.
8818     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
8819 
8820     unsigned LHS1 = MI.getOperand(1).getReg();
8821     unsigned LHS2 = MI.getOperand(2).getReg();
8822     if (RHSisZero) {
8823       AddDefaultPred(BuildMI(BB, dl,
8824                              TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8825                      .addReg(LHS1).addImm(0));
8826       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8827         .addReg(LHS2).addImm(0)
8828         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8829     } else {
8830       unsigned RHS1 = MI.getOperand(3).getReg();
8831       unsigned RHS2 = MI.getOperand(4).getReg();
8832       AddDefaultPred(BuildMI(BB, dl,
8833                              TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8834                      .addReg(LHS1).addReg(RHS1));
8835       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8836         .addReg(LHS2).addReg(RHS2)
8837         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8838     }
8839 
8840     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
8841     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
8842     if (MI.getOperand(0).getImm() == ARMCC::NE)
8843       std::swap(destMBB, exitMBB);
8844 
8845     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
8846       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
8847     if (isThumb2)
8848       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
8849     else
8850       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
8851 
8852     MI.eraseFromParent(); // The pseudo instruction is gone now.
8853     return BB;
8854   }
8855 
8856   case ARM::Int_eh_sjlj_setjmp:
8857   case ARM::Int_eh_sjlj_setjmp_nofp:
8858   case ARM::tInt_eh_sjlj_setjmp:
8859   case ARM::t2Int_eh_sjlj_setjmp:
8860   case ARM::t2Int_eh_sjlj_setjmp_nofp:
8861     return BB;
8862 
8863   case ARM::Int_eh_sjlj_setup_dispatch:
8864     EmitSjLjDispatchBlock(MI, BB);
8865     return BB;
8866 
8867   case ARM::ABS:
8868   case ARM::t2ABS: {
8869     // To insert an ABS instruction, we have to insert the
8870     // diamond control-flow pattern.  The incoming instruction knows the
8871     // source vreg to test against 0, the destination vreg to set,
8872     // the condition code register to branch on, the
8873     // true/false values to select between, and a branch opcode to use.
8874     // It transforms
8875     //     V1 = ABS V0
8876     // into
8877     //     V2 = MOVS V0
8878     //     BCC                      (branch to SinkBB if V0 >= 0)
8879     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
8880     //     SinkBB: V1 = PHI(V2, V3)
8881     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8882     MachineFunction::iterator BBI = ++BB->getIterator();
8883     MachineFunction *Fn = BB->getParent();
8884     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
8885     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
8886     Fn->insert(BBI, RSBBB);
8887     Fn->insert(BBI, SinkBB);
8888 
8889     unsigned int ABSSrcReg = MI.getOperand(1).getReg();
8890     unsigned int ABSDstReg = MI.getOperand(0).getReg();
8891     bool ABSSrcKIll = MI.getOperand(1).isKill();
8892     bool isThumb2 = Subtarget->isThumb2();
8893     MachineRegisterInfo &MRI = Fn->getRegInfo();
8894     // In Thumb mode S must not be specified if source register is the SP or
8895     // PC and if destination register is the SP, so restrict register class
8896     unsigned NewRsbDstReg =
8897       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
8898 
8899     // Transfer the remainder of BB and its successor edges to sinkMBB.
8900     SinkBB->splice(SinkBB->begin(), BB,
8901                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
8902     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
8903 
8904     BB->addSuccessor(RSBBB);
8905     BB->addSuccessor(SinkBB);
8906 
8907     // fall through to SinkMBB
8908     RSBBB->addSuccessor(SinkBB);
8909 
8910     // insert a cmp at the end of BB
8911     AddDefaultPred(BuildMI(BB, dl,
8912                            TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8913                    .addReg(ABSSrcReg).addImm(0));
8914 
8915     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
8916     BuildMI(BB, dl,
8917       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
8918       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
8919 
8920     // insert rsbri in RSBBB
8921     // Note: BCC and rsbri will be converted into predicated rsbmi
8922     // by if-conversion pass
8923     BuildMI(*RSBBB, RSBBB->begin(), dl,
8924       TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
8925       .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
8926       .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
8927 
8928     // insert PHI in SinkBB,
8929     // reuse ABSDstReg to not change uses of ABS instruction
8930     BuildMI(*SinkBB, SinkBB->begin(), dl,
8931       TII->get(ARM::PHI), ABSDstReg)
8932       .addReg(NewRsbDstReg).addMBB(RSBBB)
8933       .addReg(ABSSrcReg).addMBB(BB);
8934 
8935     // remove ABS instruction
8936     MI.eraseFromParent();
8937 
8938     // return last added BB
8939     return SinkBB;
8940   }
8941   case ARM::COPY_STRUCT_BYVAL_I32:
8942     ++NumLoopByVals;
8943     return EmitStructByval(MI, BB);
8944   case ARM::WIN__CHKSTK:
8945     return EmitLowered__chkstk(MI, BB);
8946   case ARM::WIN__DBZCHK:
8947     return EmitLowered__dbzchk(MI, BB);
8948   }
8949 }
8950 
8951 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers
8952 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
8953 /// instead of as a custom inserter because we need the use list from the SDNode.
8954 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
8955                                     MachineInstr &MI, const SDNode *Node) {
8956   bool isThumb1 = Subtarget->isThumb1Only();
8957 
8958   DebugLoc DL = MI.getDebugLoc();
8959   MachineFunction *MF = MI.getParent()->getParent();
8960   MachineRegisterInfo &MRI = MF->getRegInfo();
8961   MachineInstrBuilder MIB(*MF, MI);
8962 
8963   // If the new dst/src is unused mark it as dead.
8964   if (!Node->hasAnyUseOfValue(0)) {
8965     MI.getOperand(0).setIsDead(true);
8966   }
8967   if (!Node->hasAnyUseOfValue(1)) {
8968     MI.getOperand(1).setIsDead(true);
8969   }
8970 
8971   // The MEMCPY both defines and kills the scratch registers.
8972   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
8973     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
8974                                                          : &ARM::GPRRegClass);
8975     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
8976   }
8977 }
8978 
8979 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8980                                                       SDNode *Node) const {
8981   if (MI.getOpcode() == ARM::MEMCPY) {
8982     attachMEMCPYScratchRegs(Subtarget, MI, Node);
8983     return;
8984   }
8985 
8986   const MCInstrDesc *MCID = &MI.getDesc();
8987   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
8988   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
8989   // operand is still set to noreg. If needed, set the optional operand's
8990   // register to CPSR, and remove the redundant implicit def.
8991   //
8992   // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
8993 
8994   // Rename pseudo opcodes.
8995   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
8996   if (NewOpc) {
8997     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
8998     MCID = &TII->get(NewOpc);
8999 
9000     assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
9001            "converted opcode should be the same except for cc_out");
9002 
9003     MI.setDesc(*MCID);
9004 
9005     // Add the optional cc_out operand
9006     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
9007   }
9008   unsigned ccOutIdx = MCID->getNumOperands() - 1;
9009 
9010   // Any ARM instruction that sets the 's' bit should specify an optional
9011   // "cc_out" operand in the last operand position.
9012   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
9013     assert(!NewOpc && "Optional cc_out operand required");
9014     return;
9015   }
9016   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
9017   // since we already have an optional CPSR def.
9018   bool definesCPSR = false;
9019   bool deadCPSR = false;
9020   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
9021        ++i) {
9022     const MachineOperand &MO = MI.getOperand(i);
9023     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
9024       definesCPSR = true;
9025       if (MO.isDead())
9026         deadCPSR = true;
9027       MI.RemoveOperand(i);
9028       break;
9029     }
9030   }
9031   if (!definesCPSR) {
9032     assert(!NewOpc && "Optional cc_out operand required");
9033     return;
9034   }
9035   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
9036   if (deadCPSR) {
9037     assert(!MI.getOperand(ccOutIdx).getReg() &&
9038            "expect uninitialized optional cc_out operand");
9039     return;
9040   }
9041 
9042   // If this instruction was defined with an optional CPSR def and its dag node
9043   // had a live implicit CPSR def, then activate the optional CPSR def.
9044   MachineOperand &MO = MI.getOperand(ccOutIdx);
9045   MO.setReg(ARM::CPSR);
9046   MO.setIsDef(true);
9047 }
9048 
9049 //===----------------------------------------------------------------------===//
9050 //                           ARM Optimization Hooks
9051 //===----------------------------------------------------------------------===//
9052 
9053 // Helper function that checks if N is a null or all ones constant.
9054 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
9055   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
9056 }
9057 
9058 // Return true if N is conditionally 0 or all ones.
9059 // Detects these expressions where cc is an i1 value:
9060 //
9061 //   (select cc 0, y)   [AllOnes=0]
9062 //   (select cc y, 0)   [AllOnes=0]
9063 //   (zext cc)          [AllOnes=0]
9064 //   (sext cc)          [AllOnes=0/1]
9065 //   (select cc -1, y)  [AllOnes=1]
9066 //   (select cc y, -1)  [AllOnes=1]
9067 //
9068 // Invert is set when N is the null/all ones constant when CC is false.
9069 // OtherOp is set to the alternative value of N.
9070 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
9071                                        SDValue &CC, bool &Invert,
9072                                        SDValue &OtherOp,
9073                                        SelectionDAG &DAG) {
9074   switch (N->getOpcode()) {
9075   default: return false;
9076   case ISD::SELECT: {
9077     CC = N->getOperand(0);
9078     SDValue N1 = N->getOperand(1);
9079     SDValue N2 = N->getOperand(2);
9080     if (isZeroOrAllOnes(N1, AllOnes)) {
9081       Invert = false;
9082       OtherOp = N2;
9083       return true;
9084     }
9085     if (isZeroOrAllOnes(N2, AllOnes)) {
9086       Invert = true;
9087       OtherOp = N1;
9088       return true;
9089     }
9090     return false;
9091   }
9092   case ISD::ZERO_EXTEND:
9093     // (zext cc) can never be the all ones value.
9094     if (AllOnes)
9095       return false;
9096     LLVM_FALLTHROUGH;
9097   case ISD::SIGN_EXTEND: {
9098     SDLoc dl(N);
9099     EVT VT = N->getValueType(0);
9100     CC = N->getOperand(0);
9101     if (CC.getValueType() != MVT::i1)
9102       return false;
9103     Invert = !AllOnes;
9104     if (AllOnes)
9105       // When looking for an AllOnes constant, N is an sext, and the 'other'
9106       // value is 0.
9107       OtherOp = DAG.getConstant(0, dl, VT);
9108     else if (N->getOpcode() == ISD::ZERO_EXTEND)
9109       // When looking for a 0 constant, N can be zext or sext.
9110       OtherOp = DAG.getConstant(1, dl, VT);
9111     else
9112       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
9113                                 VT);
9114     return true;
9115   }
9116   }
9117 }
9118 
9119 // Combine a constant select operand into its use:
9120 //
9121 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
9122 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
9123 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
9124 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
9125 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
9126 //
9127 // The transform is rejected if the select doesn't have a constant operand that
9128 // is null, or all ones when AllOnes is set.
9129 //
9130 // Also recognize sext/zext from i1:
9131 //
9132 //   (add (zext cc), x) -> (select cc (add x, 1), x)
9133 //   (add (sext cc), x) -> (select cc (add x, -1), x)
9134 //
9135 // These transformations eventually create predicated instructions.
9136 //
9137 // @param N       The node to transform.
9138 // @param Slct    The N operand that is a select.
9139 // @param OtherOp The other N operand (x above).
9140 // @param DCI     Context.
9141 // @param AllOnes Require the select constant to be all ones instead of null.
9142 // @returns The new node, or SDValue() on failure.
9143 static
9144 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
9145                             TargetLowering::DAGCombinerInfo &DCI,
9146                             bool AllOnes = false) {
9147   SelectionDAG &DAG = DCI.DAG;
9148   EVT VT = N->getValueType(0);
9149   SDValue NonConstantVal;
9150   SDValue CCOp;
9151   bool SwapSelectOps;
9152   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
9153                                   NonConstantVal, DAG))
9154     return SDValue();
9155 
9156   // Slct is now know to be the desired identity constant when CC is true.
9157   SDValue TrueVal = OtherOp;
9158   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
9159                                  OtherOp, NonConstantVal);
9160   // Unless SwapSelectOps says CC should be false.
9161   if (SwapSelectOps)
9162     std::swap(TrueVal, FalseVal);
9163 
9164   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
9165                      CCOp, TrueVal, FalseVal);
9166 }
9167 
9168 // Attempt combineSelectAndUse on each operand of a commutative operator N.
9169 static
9170 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
9171                                        TargetLowering::DAGCombinerInfo &DCI) {
9172   SDValue N0 = N->getOperand(0);
9173   SDValue N1 = N->getOperand(1);
9174   if (N0.getNode()->hasOneUse())
9175     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
9176       return Result;
9177   if (N1.getNode()->hasOneUse())
9178     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
9179       return Result;
9180   return SDValue();
9181 }
9182 
9183 // AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
9184 // (only after legalization).
9185 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9186                                  TargetLowering::DAGCombinerInfo &DCI,
9187                                  const ARMSubtarget *Subtarget) {
9188 
9189   // Only perform optimization if after legalize, and if NEON is available. We
9190   // also expected both operands to be BUILD_VECTORs.
9191   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
9192       || N0.getOpcode() != ISD::BUILD_VECTOR
9193       || N1.getOpcode() != ISD::BUILD_VECTOR)
9194     return SDValue();
9195 
9196   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
9197   EVT VT = N->getValueType(0);
9198   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
9199     return SDValue();
9200 
9201   // Check that the vector operands are of the right form.
9202   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
9203   // operands, where N is the size of the formed vector.
9204   // Each EXTRACT_VECTOR should have the same input vector and odd or even
9205   // index such that we have a pair wise add pattern.
9206 
9207   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9208   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9209     return SDValue();
9210   SDValue Vec = N0->getOperand(0)->getOperand(0);
9211   SDNode *V = Vec.getNode();
9212   unsigned nextIndex = 0;
9213 
9214   // For each operands to the ADD which are BUILD_VECTORs,
9215   // check to see if each of their operands are an EXTRACT_VECTOR with
9216   // the same vector and appropriate index.
9217   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9218     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9219         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9220 
9221       SDValue ExtVec0 = N0->getOperand(i);
9222       SDValue ExtVec1 = N1->getOperand(i);
9223 
9224       // First operand is the vector, verify its the same.
9225       if (V != ExtVec0->getOperand(0).getNode() ||
9226           V != ExtVec1->getOperand(0).getNode())
9227         return SDValue();
9228 
9229       // Second is the constant, verify its correct.
9230       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9231       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9232 
9233       // For the constant, we want to see all the even or all the odd.
9234       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9235           || C1->getZExtValue() != nextIndex+1)
9236         return SDValue();
9237 
9238       // Increment index.
9239       nextIndex+=2;
9240     } else
9241       return SDValue();
9242   }
9243 
9244   // Create VPADDL node.
9245   SelectionDAG &DAG = DCI.DAG;
9246   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9247 
9248   SDLoc dl(N);
9249 
9250   // Build operand list.
9251   SmallVector<SDValue, 8> Ops;
9252   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
9253                                 TLI.getPointerTy(DAG.getDataLayout())));
9254 
9255   // Input is the vector.
9256   Ops.push_back(Vec);
9257 
9258   // Get widened type and narrowed type.
9259   MVT widenType;
9260   unsigned numElem = VT.getVectorNumElements();
9261 
9262   EVT inputLaneType = Vec.getValueType().getVectorElementType();
9263   switch (inputLaneType.getSimpleVT().SimpleTy) {
9264     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
9265     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
9266     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
9267     default:
9268       llvm_unreachable("Invalid vector element type for padd optimization.");
9269   }
9270 
9271   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
9272   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
9273   return DAG.getNode(ExtOp, dl, VT, tmp);
9274 }
9275 
9276 static SDValue findMUL_LOHI(SDValue V) {
9277   if (V->getOpcode() == ISD::UMUL_LOHI ||
9278       V->getOpcode() == ISD::SMUL_LOHI)
9279     return V;
9280   return SDValue();
9281 }
9282 
9283 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
9284                                      TargetLowering::DAGCombinerInfo &DCI,
9285                                      const ARMSubtarget *Subtarget) {
9286 
9287   // Look for multiply add opportunities.
9288   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
9289   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
9290   // a glue link from the first add to the second add.
9291   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
9292   // a S/UMLAL instruction.
9293   //                  UMUL_LOHI
9294   //                 / :lo    \ :hi
9295   //                /          \          [no multiline comment]
9296   //    loAdd ->  ADDE         |
9297   //                 \ :glue  /
9298   //                  \      /
9299   //                    ADDC   <- hiAdd
9300   //
9301   assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
9302   SDValue AddcOp0 = AddcNode->getOperand(0);
9303   SDValue AddcOp1 = AddcNode->getOperand(1);
9304 
9305   // Check if the two operands are from the same mul_lohi node.
9306   if (AddcOp0.getNode() == AddcOp1.getNode())
9307     return SDValue();
9308 
9309   assert(AddcNode->getNumValues() == 2 &&
9310          AddcNode->getValueType(0) == MVT::i32 &&
9311          "Expect ADDC with two result values. First: i32");
9312 
9313   // Check that we have a glued ADDC node.
9314   if (AddcNode->getValueType(1) != MVT::Glue)
9315     return SDValue();
9316 
9317   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
9318   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
9319       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
9320       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
9321       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
9322     return SDValue();
9323 
9324   // Look for the glued ADDE.
9325   SDNode* AddeNode = AddcNode->getGluedUser();
9326   if (!AddeNode)
9327     return SDValue();
9328 
9329   // Make sure it is really an ADDE.
9330   if (AddeNode->getOpcode() != ISD::ADDE)
9331     return SDValue();
9332 
9333   assert(AddeNode->getNumOperands() == 3 &&
9334          AddeNode->getOperand(2).getValueType() == MVT::Glue &&
9335          "ADDE node has the wrong inputs");
9336 
9337   // Check for the triangle shape.
9338   SDValue AddeOp0 = AddeNode->getOperand(0);
9339   SDValue AddeOp1 = AddeNode->getOperand(1);
9340 
9341   // Make sure that the ADDE operands are not coming from the same node.
9342   if (AddeOp0.getNode() == AddeOp1.getNode())
9343     return SDValue();
9344 
9345   // Find the MUL_LOHI node walking up ADDE's operands.
9346   bool IsLeftOperandMUL = false;
9347   SDValue MULOp = findMUL_LOHI(AddeOp0);
9348   if (MULOp == SDValue())
9349    MULOp = findMUL_LOHI(AddeOp1);
9350   else
9351     IsLeftOperandMUL = true;
9352   if (MULOp == SDValue())
9353     return SDValue();
9354 
9355   // Figure out the right opcode.
9356   unsigned Opc = MULOp->getOpcode();
9357   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
9358 
9359   // Figure out the high and low input values to the MLAL node.
9360   SDValue* HiAdd = nullptr;
9361   SDValue* LoMul = nullptr;
9362   SDValue* LowAdd = nullptr;
9363 
9364   // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
9365   if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
9366     return SDValue();
9367 
9368   if (IsLeftOperandMUL)
9369     HiAdd = &AddeOp1;
9370   else
9371     HiAdd = &AddeOp0;
9372 
9373 
9374   // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
9375   // whose low result is fed to the ADDC we are checking.
9376 
9377   if (AddcOp0 == MULOp.getValue(0)) {
9378     LoMul = &AddcOp0;
9379     LowAdd = &AddcOp1;
9380   }
9381   if (AddcOp1 == MULOp.getValue(0)) {
9382     LoMul = &AddcOp1;
9383     LowAdd = &AddcOp0;
9384   }
9385 
9386   if (!LoMul)
9387     return SDValue();
9388 
9389   // Create the merged node.
9390   SelectionDAG &DAG = DCI.DAG;
9391 
9392   // Build operand list.
9393   SmallVector<SDValue, 8> Ops;
9394   Ops.push_back(LoMul->getOperand(0));
9395   Ops.push_back(LoMul->getOperand(1));
9396   Ops.push_back(*LowAdd);
9397   Ops.push_back(*HiAdd);
9398 
9399   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
9400                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9401 
9402   // Replace the ADDs' nodes uses by the MLA node's values.
9403   SDValue HiMLALResult(MLALNode.getNode(), 1);
9404   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
9405 
9406   SDValue LoMLALResult(MLALNode.getNode(), 0);
9407   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
9408 
9409   // Return original node to notify the driver to stop replacing.
9410   SDValue resNode(AddcNode, 0);
9411   return resNode;
9412 }
9413 
9414 static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
9415                                       TargetLowering::DAGCombinerInfo &DCI,
9416                                       const ARMSubtarget *Subtarget) {
9417   // UMAAL is similar to UMLAL except that it adds two unsigned values.
9418   // While trying to combine for the other MLAL nodes, first search for the
9419   // chance to use UMAAL. Check if Addc uses another addc node which can first
9420   // be combined into a UMLAL. The other pattern is AddcNode being combined
9421   // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
9422 
9423   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
9424       (Subtarget->isThumb() && !Subtarget->hasThumb2()))
9425     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9426 
9427   SDNode *PrevAddc = nullptr;
9428   if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
9429     PrevAddc = AddcNode->getOperand(0).getNode();
9430   else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
9431     PrevAddc = AddcNode->getOperand(1).getNode();
9432 
9433   // If there's no addc chains, just return a search for any MLAL.
9434   if (PrevAddc == nullptr)
9435     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9436 
9437   // Try to convert the addc operand to an MLAL and if that fails try to
9438   // combine AddcNode.
9439   SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
9440   if (MLAL != SDValue(PrevAddc, 0))
9441     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9442 
9443   // Find the converted UMAAL or quit if it doesn't exist.
9444   SDNode *UmlalNode = nullptr;
9445   SDValue AddHi;
9446   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
9447     UmlalNode = AddcNode->getOperand(0).getNode();
9448     AddHi = AddcNode->getOperand(1);
9449   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
9450     UmlalNode = AddcNode->getOperand(1).getNode();
9451     AddHi = AddcNode->getOperand(0);
9452   } else {
9453     return SDValue();
9454   }
9455 
9456   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
9457   // the ADDC as well as Zero.
9458   auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
9459 
9460   if (!Zero || Zero->getZExtValue() != 0)
9461     return SDValue();
9462 
9463   // Check that we have a glued ADDC node.
9464   if (AddcNode->getValueType(1) != MVT::Glue)
9465     return SDValue();
9466 
9467   // Look for the glued ADDE.
9468   SDNode* AddeNode = AddcNode->getGluedUser();
9469   if (!AddeNode)
9470     return SDValue();
9471 
9472   if ((AddeNode->getOperand(0).getNode() == Zero &&
9473        AddeNode->getOperand(1).getNode() == UmlalNode) ||
9474       (AddeNode->getOperand(0).getNode() == UmlalNode &&
9475        AddeNode->getOperand(1).getNode() == Zero)) {
9476 
9477     SelectionDAG &DAG = DCI.DAG;
9478     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
9479                       UmlalNode->getOperand(2), AddHi };
9480     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
9481                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9482 
9483     // Replace the ADDs' nodes uses by the UMAAL node's values.
9484     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
9485     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
9486 
9487     // Return original node to notify the driver to stop replacing.
9488     return SDValue(AddcNode, 0);
9489   }
9490   return SDValue();
9491 }
9492 
9493 /// PerformADDCCombine - Target-specific dag combine transform from
9494 /// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
9495 /// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
9496 static SDValue PerformADDCCombine(SDNode *N,
9497                                  TargetLowering::DAGCombinerInfo &DCI,
9498                                  const ARMSubtarget *Subtarget) {
9499 
9500   if (Subtarget->isThumb1Only()) return SDValue();
9501 
9502   // Only perform the checks after legalize when the pattern is available.
9503   if (DCI.isBeforeLegalize()) return SDValue();
9504 
9505   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
9506 }
9507 
9508 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
9509 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
9510 /// called with the default operands, and if that fails, with commuted
9511 /// operands.
9512 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
9513                                           TargetLowering::DAGCombinerInfo &DCI,
9514                                           const ARMSubtarget *Subtarget){
9515 
9516   // Attempt to create vpaddl for this add.
9517   if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
9518     return Result;
9519 
9520   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
9521   if (N0.getNode()->hasOneUse())
9522     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
9523       return Result;
9524   return SDValue();
9525 }
9526 
9527 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
9528 ///
9529 static SDValue PerformADDCombine(SDNode *N,
9530                                  TargetLowering::DAGCombinerInfo &DCI,
9531                                  const ARMSubtarget *Subtarget) {
9532   SDValue N0 = N->getOperand(0);
9533   SDValue N1 = N->getOperand(1);
9534 
9535   // First try with the default operand order.
9536   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
9537     return Result;
9538 
9539   // If that didn't work, try again with the operands commuted.
9540   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
9541 }
9542 
9543 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
9544 ///
9545 static SDValue PerformSUBCombine(SDNode *N,
9546                                  TargetLowering::DAGCombinerInfo &DCI) {
9547   SDValue N0 = N->getOperand(0);
9548   SDValue N1 = N->getOperand(1);
9549 
9550   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
9551   if (N1.getNode()->hasOneUse())
9552     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
9553       return Result;
9554 
9555   return SDValue();
9556 }
9557 
9558 /// PerformVMULCombine
9559 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
9560 /// special multiplier accumulator forwarding.
9561 ///   vmul d3, d0, d2
9562 ///   vmla d3, d1, d2
9563 /// is faster than
9564 ///   vadd d3, d0, d1
9565 ///   vmul d3, d3, d2
9566 //  However, for (A + B) * (A + B),
9567 //    vadd d2, d0, d1
9568 //    vmul d3, d0, d2
9569 //    vmla d3, d1, d2
9570 //  is slower than
9571 //    vadd d2, d0, d1
9572 //    vmul d3, d2, d2
9573 static SDValue PerformVMULCombine(SDNode *N,
9574                                   TargetLowering::DAGCombinerInfo &DCI,
9575                                   const ARMSubtarget *Subtarget) {
9576   if (!Subtarget->hasVMLxForwarding())
9577     return SDValue();
9578 
9579   SelectionDAG &DAG = DCI.DAG;
9580   SDValue N0 = N->getOperand(0);
9581   SDValue N1 = N->getOperand(1);
9582   unsigned Opcode = N0.getOpcode();
9583   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9584       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
9585     Opcode = N1.getOpcode();
9586     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9587         Opcode != ISD::FADD && Opcode != ISD::FSUB)
9588       return SDValue();
9589     std::swap(N0, N1);
9590   }
9591 
9592   if (N0 == N1)
9593     return SDValue();
9594 
9595   EVT VT = N->getValueType(0);
9596   SDLoc DL(N);
9597   SDValue N00 = N0->getOperand(0);
9598   SDValue N01 = N0->getOperand(1);
9599   return DAG.getNode(Opcode, DL, VT,
9600                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
9601                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
9602 }
9603 
9604 static SDValue PerformMULCombine(SDNode *N,
9605                                  TargetLowering::DAGCombinerInfo &DCI,
9606                                  const ARMSubtarget *Subtarget) {
9607   SelectionDAG &DAG = DCI.DAG;
9608 
9609   if (Subtarget->isThumb1Only())
9610     return SDValue();
9611 
9612   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9613     return SDValue();
9614 
9615   EVT VT = N->getValueType(0);
9616   if (VT.is64BitVector() || VT.is128BitVector())
9617     return PerformVMULCombine(N, DCI, Subtarget);
9618   if (VT != MVT::i32)
9619     return SDValue();
9620 
9621   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9622   if (!C)
9623     return SDValue();
9624 
9625   int64_t MulAmt = C->getSExtValue();
9626   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
9627 
9628   ShiftAmt = ShiftAmt & (32 - 1);
9629   SDValue V = N->getOperand(0);
9630   SDLoc DL(N);
9631 
9632   SDValue Res;
9633   MulAmt >>= ShiftAmt;
9634 
9635   if (MulAmt >= 0) {
9636     if (isPowerOf2_32(MulAmt - 1)) {
9637       // (mul x, 2^N + 1) => (add (shl x, N), x)
9638       Res = DAG.getNode(ISD::ADD, DL, VT,
9639                         V,
9640                         DAG.getNode(ISD::SHL, DL, VT,
9641                                     V,
9642                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
9643                                                     MVT::i32)));
9644     } else if (isPowerOf2_32(MulAmt + 1)) {
9645       // (mul x, 2^N - 1) => (sub (shl x, N), x)
9646       Res = DAG.getNode(ISD::SUB, DL, VT,
9647                         DAG.getNode(ISD::SHL, DL, VT,
9648                                     V,
9649                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
9650                                                     MVT::i32)),
9651                         V);
9652     } else
9653       return SDValue();
9654   } else {
9655     uint64_t MulAmtAbs = -MulAmt;
9656     if (isPowerOf2_32(MulAmtAbs + 1)) {
9657       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
9658       Res = DAG.getNode(ISD::SUB, DL, VT,
9659                         V,
9660                         DAG.getNode(ISD::SHL, DL, VT,
9661                                     V,
9662                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
9663                                                     MVT::i32)));
9664     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
9665       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
9666       Res = DAG.getNode(ISD::ADD, DL, VT,
9667                         V,
9668                         DAG.getNode(ISD::SHL, DL, VT,
9669                                     V,
9670                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
9671                                                     MVT::i32)));
9672       Res = DAG.getNode(ISD::SUB, DL, VT,
9673                         DAG.getConstant(0, DL, MVT::i32), Res);
9674 
9675     } else
9676       return SDValue();
9677   }
9678 
9679   if (ShiftAmt != 0)
9680     Res = DAG.getNode(ISD::SHL, DL, VT,
9681                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
9682 
9683   // Do not add new nodes to DAG combiner worklist.
9684   DCI.CombineTo(N, Res, false);
9685   return SDValue();
9686 }
9687 
9688 static SDValue PerformANDCombine(SDNode *N,
9689                                  TargetLowering::DAGCombinerInfo &DCI,
9690                                  const ARMSubtarget *Subtarget) {
9691 
9692   // Attempt to use immediate-form VBIC
9693   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9694   SDLoc dl(N);
9695   EVT VT = N->getValueType(0);
9696   SelectionDAG &DAG = DCI.DAG;
9697 
9698   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9699     return SDValue();
9700 
9701   APInt SplatBits, SplatUndef;
9702   unsigned SplatBitSize;
9703   bool HasAnyUndefs;
9704   if (BVN &&
9705       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9706     if (SplatBitSize <= 64) {
9707       EVT VbicVT;
9708       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
9709                                       SplatUndef.getZExtValue(), SplatBitSize,
9710                                       DAG, dl, VbicVT, VT.is128BitVector(),
9711                                       OtherModImm);
9712       if (Val.getNode()) {
9713         SDValue Input =
9714           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
9715         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
9716         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
9717       }
9718     }
9719   }
9720 
9721   if (!Subtarget->isThumb1Only()) {
9722     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
9723     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
9724       return Result;
9725   }
9726 
9727   return SDValue();
9728 }
9729 
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
///
/// For an OR whose type is legal, the following folds are attempted in order:
///  - (or x, splat-constant) -> ARMISD::VORRIMM (NEON only);
///  - (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) (not Thumb1);
///  - (or (and B, A), (and C, ~A)) -> ARMISD::VBSL for constant splat A
///    (NEON only);
///  - several i32 bitfield-insert patterns -> ARMISD::BFI (V6T2+, not Thumb1).
/// The vector and select folds return the replacement value directly. The BFI
/// folds install their result with DCI.CombineTo and return a null SDValue.
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  // None of the folds below apply to an illegal result type.
  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      // A null result means no immediate encoding was produced; VorrVT is
      // an output of the call and is used as the operating type below.
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        // Operate in VorrVT and cast the result back to the original type.
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
  }

  // The code below optimizes (or (and X, Y), Z).
  // The AND operand needs to have a single user to make these optimizations
  // profitable.
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    // (constant splats with no undef elements).
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                          HasAnyUndefs) && !HasAnyUndefs) {
            // Ensure that the bit width of the constants are the same and that
            // the splat arguments are logical inverses as per the pattern we
            // are trying to simplify.
            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
                SplatBits0 == ~SplatBits1) {
                // Canonicalize the vector type to make instruction selection
                // simpler.
                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
                // Operands are (mask A, B, C) as in the pattern above.
                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                             N0->getOperand(1),
                                             N0->getOperand(0),
                                             N1->getOperand(0));
                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
            }
        }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.

  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & ~mask) == val, i.e. val has no bits in common with mask
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // Bail out if the constant has any bit set inside the mask.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Move the value down so its bit 0 lines up with the lowest cleared
      // bit of the mask.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
    // If the mask is not of that form, control falls through to the
    // case (3) check below.
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      // Shift B's field down to bit 0 before inserting it into A.
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasT2ExtractPack() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      // Roles are reversed here: A's field is shifted down and inserted
      // into B.
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
      return SDValue();
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, Res, false);
  }

  // Either nothing matched or the replacement was already installed through
  // DCI.CombineTo above.
  return SDValue();
}
9926 
9927 static SDValue PerformXORCombine(SDNode *N,
9928                                  TargetLowering::DAGCombinerInfo &DCI,
9929                                  const ARMSubtarget *Subtarget) {
9930   EVT VT = N->getValueType(0);
9931   SelectionDAG &DAG = DCI.DAG;
9932 
9933   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9934     return SDValue();
9935 
9936   if (!Subtarget->isThumb1Only()) {
9937     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
9938     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
9939       return Result;
9940   }
9941 
9942   return SDValue();
9943 }
9944 
9945 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
9946 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
9947 // their position in "to" (Rd).
9948 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
9949   assert(N->getOpcode() == ARMISD::BFI);
9950 
9951   SDValue From = N->getOperand(1);
9952   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
9953   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
9954 
9955   // If the Base came from a SHR #C, we can deduce that it is really testing bit
9956   // #C in the base of the SHR.
9957   if (From->getOpcode() == ISD::SRL &&
9958       isa<ConstantSDNode>(From->getOperand(1))) {
9959     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
9960     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
9961     FromMask <<= Shift.getLimitedValue(31);
9962     From = From->getOperand(0);
9963   }
9964 
9965   return From;
9966 }
9967 
9968 // If A and B contain one contiguous set of bits, does A | B == A . B?
9969 //
9970 // Neither A nor B must be zero.
9971 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
9972   unsigned LastActiveBitInA =  A.countTrailingZeros();
9973   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
9974   return LastActiveBitInA - 1 == FirstActiveBitInB;
9975 }
9976 
9977 static SDValue FindBFIToCombineWith(SDNode *N) {
9978   // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
9979   // if one exists.
9980   APInt ToMask, FromMask;
9981   SDValue From = ParseBFI(N, ToMask, FromMask);
9982   SDValue To = N->getOperand(0);
9983 
9984   // Now check for a compatible BFI to merge with. We can pass through BFIs that
9985   // aren't compatible, but not if they set the same bit in their destination as
9986   // we do (or that of any BFI we're going to combine with).
9987   SDValue V = To;
9988   APInt CombinedToMask = ToMask;
9989   while (V.getOpcode() == ARMISD::BFI) {
9990     APInt NewToMask, NewFromMask;
9991     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
9992     if (NewFrom != From) {
9993       // This BFI has a different base. Keep going.
9994       CombinedToMask |= NewToMask;
9995       V = V.getOperand(0);
9996       continue;
9997     }
9998 
9999     // Do the written bits conflict with any we've seen so far?
10000     if ((NewToMask & CombinedToMask).getBoolValue())
10001       // Conflicting bits - bail out because going further is unsafe.
10002       return SDValue();
10003 
10004     // Are the new bits contiguous when combined with the old bits?
10005     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
10006         BitsProperlyConcatenate(FromMask, NewFromMask))
10007       return V;
10008     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
10009         BitsProperlyConcatenate(NewFromMask, FromMask))
10010       return V;
10011 
10012     // We've seen a write to some bits, so track it.
10013     CombinedToMask |= NewToMask;
10014     // Keep going...
10015     V = V.getOperand(0);
10016   }
10017 
10018   return SDValue();
10019 }
10020 
10021 static SDValue PerformBFICombine(SDNode *N,
10022                                  TargetLowering::DAGCombinerInfo &DCI) {
10023   SDValue N1 = N->getOperand(1);
10024   if (N1.getOpcode() == ISD::AND) {
10025     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
10026     // the bits being cleared by the AND are not demanded by the BFI.
10027     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
10028     if (!N11C)
10029       return SDValue();
10030     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
10031     unsigned LSB = countTrailingZeros(~InvMask);
10032     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
10033     assert(Width <
10034                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
10035            "undefined behavior");
10036     unsigned Mask = (1u << Width) - 1;
10037     unsigned Mask2 = N11C->getZExtValue();
10038     if ((Mask & (~Mask2)) == 0)
10039       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
10040                              N->getOperand(0), N1.getOperand(0),
10041                              N->getOperand(2));
10042   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
10043     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
10044     // Keep track of any consecutive bits set that all come from the same base
10045     // value. We can combine these together into a single BFI.
10046     SDValue CombineBFI = FindBFIToCombineWith(N);
10047     if (CombineBFI == SDValue())
10048       return SDValue();
10049 
10050     // We've found a BFI.
10051     APInt ToMask1, FromMask1;
10052     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
10053 
10054     APInt ToMask2, FromMask2;
10055     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
10056     assert(From1 == From2);
10057     (void)From2;
10058 
10059     // First, unlink CombineBFI.
10060     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
10061     // Then create a new BFI, combining the two together.
10062     APInt NewFromMask = FromMask1 | FromMask2;
10063     APInt NewToMask = ToMask1 | ToMask2;
10064 
10065     EVT VT = N->getValueType(0);
10066     SDLoc dl(N);
10067 
10068     if (NewFromMask[0] == 0)
10069       From1 = DCI.DAG.getNode(
10070         ISD::SRL, dl, VT, From1,
10071         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
10072     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
10073                            DCI.DAG.getConstant(~NewToMask, dl, VT));
10074   }
10075   return SDValue();
10076 }
10077 
10078 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
10079 /// ARMISD::VMOVRRD.
10080 static SDValue PerformVMOVRRDCombine(SDNode *N,
10081                                      TargetLowering::DAGCombinerInfo &DCI,
10082                                      const ARMSubtarget *Subtarget) {
10083   // vmovrrd(vmovdrr x, y) -> x,y
10084   SDValue InDouble = N->getOperand(0);
10085   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
10086     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
10087 
10088   // vmovrrd(load f64) -> (load i32), (load i32)
10089   SDNode *InNode = InDouble.getNode();
10090   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
10091       InNode->getValueType(0) == MVT::f64 &&
10092       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
10093       !cast<LoadSDNode>(InNode)->isVolatile()) {
10094     // TODO: Should this be done for non-FrameIndex operands?
10095     LoadSDNode *LD = cast<LoadSDNode>(InNode);
10096 
10097     SelectionDAG &DAG = DCI.DAG;
10098     SDLoc DL(LD);
10099     SDValue BasePtr = LD->getBasePtr();
10100     SDValue NewLD1 =
10101         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
10102                     LD->getAlignment(), LD->getMemOperand()->getFlags());
10103 
10104     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10105                                     DAG.getConstant(4, DL, MVT::i32));
10106     SDValue NewLD2 = DAG.getLoad(
10107         MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
10108         std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
10109 
10110     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
10111     if (DCI.DAG.getDataLayout().isBigEndian())
10112       std::swap (NewLD1, NewLD2);
10113     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
10114     return Result;
10115   }
10116 
10117   return SDValue();
10118 }
10119 
10120 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
10121 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
10122 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
10123   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
10124   SDValue Op0 = N->getOperand(0);
10125   SDValue Op1 = N->getOperand(1);
10126   if (Op0.getOpcode() == ISD::BITCAST)
10127     Op0 = Op0.getOperand(0);
10128   if (Op1.getOpcode() == ISD::BITCAST)
10129     Op1 = Op1.getOperand(0);
10130   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
10131       Op0.getNode() == Op1.getNode() &&
10132       Op0.getResNo() == 0 && Op1.getResNo() == 1)
10133     return DAG.getNode(ISD::BITCAST, SDLoc(N),
10134                        N->getValueType(0), Op0.getOperand(0));
10135   return SDValue();
10136 }
10137 
10138 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
10139 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
10140 /// i64 vector to have f64 elements, since the value can then be loaded
10141 /// directly into a VFP register.
10142 static bool hasNormalLoadOperand(SDNode *N) {
10143   unsigned NumElts = N->getValueType(0).getVectorNumElements();
10144   for (unsigned i = 0; i < NumElts; ++i) {
10145     SDNode *Elt = N->getOperand(i).getNode();
10146     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
10147       return true;
10148   }
10149   return false;
10150 }
10151 
10152 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
10153 /// ISD::BUILD_VECTOR.
10154 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
10155                                           TargetLowering::DAGCombinerInfo &DCI,
10156                                           const ARMSubtarget *Subtarget) {
10157   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
10158   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
10159   // into a pair of GPRs, which is fine when the value is used as a scalar,
10160   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
10161   SelectionDAG &DAG = DCI.DAG;
10162   if (N->getNumOperands() == 2)
10163     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
10164       return RV;
10165 
10166   // Load i64 elements as f64 values so that type legalization does not split
10167   // them up into i32 values.
10168   EVT VT = N->getValueType(0);
10169   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
10170     return SDValue();
10171   SDLoc dl(N);
10172   SmallVector<SDValue, 8> Ops;
10173   unsigned NumElts = VT.getVectorNumElements();
10174   for (unsigned i = 0; i < NumElts; ++i) {
10175     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
10176     Ops.push_back(V);
10177     // Make the DAGCombiner fold the bitcast.
10178     DCI.AddToWorklist(V.getNode());
10179   }
10180   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
10181   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
10182   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
10183 }
10184 
10185 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
10186 static SDValue
10187 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10188   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
10189   // At that time, we may have inserted bitcasts from integer to float.
10190   // If these bitcasts have survived DAGCombine, change the lowering of this
10191   // BUILD_VECTOR in something more vector friendly, i.e., that does not
10192   // force to use floating point types.
10193 
10194   // Make sure we can change the type of the vector.
10195   // This is possible iff:
10196   // 1. The vector is only used in a bitcast to a integer type. I.e.,
10197   //    1.1. Vector is used only once.
10198   //    1.2. Use is a bit convert to an integer type.
10199   // 2. The size of its operands are 32-bits (64-bits are not legal).
10200   EVT VT = N->getValueType(0);
10201   EVT EltVT = VT.getVectorElementType();
10202 
10203   // Check 1.1. and 2.
10204   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
10205     return SDValue();
10206 
10207   // By construction, the input type must be float.
10208   assert(EltVT == MVT::f32 && "Unexpected type!");
10209 
10210   // Check 1.2.
10211   SDNode *Use = *N->use_begin();
10212   if (Use->getOpcode() != ISD::BITCAST ||
10213       Use->getValueType(0).isFloatingPoint())
10214     return SDValue();
10215 
10216   // Check profitability.
10217   // Model is, if more than half of the relevant operands are bitcast from
10218   // i32, turn the build_vector into a sequence of insert_vector_elt.
10219   // Relevant operands are everything that is not statically
10220   // (i.e., at compile time) bitcasted.
10221   unsigned NumOfBitCastedElts = 0;
10222   unsigned NumElts = VT.getVectorNumElements();
10223   unsigned NumOfRelevantElts = NumElts;
10224   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
10225     SDValue Elt = N->getOperand(Idx);
10226     if (Elt->getOpcode() == ISD::BITCAST) {
10227       // Assume only bit cast to i32 will go away.
10228       if (Elt->getOperand(0).getValueType() == MVT::i32)
10229         ++NumOfBitCastedElts;
10230     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
10231       // Constants are statically casted, thus do not count them as
10232       // relevant operands.
10233       --NumOfRelevantElts;
10234   }
10235 
10236   // Check if more than half of the elements require a non-free bitcast.
10237   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
10238     return SDValue();
10239 
10240   SelectionDAG &DAG = DCI.DAG;
10241   // Create the new vector type.
10242   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
10243   // Check if the type is legal.
10244   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10245   if (!TLI.isTypeLegal(VecVT))
10246     return SDValue();
10247 
10248   // Combine:
10249   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
10250   // => BITCAST INSERT_VECTOR_ELT
10251   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
10252   //                      (BITCAST EN), N.
10253   SDValue Vec = DAG.getUNDEF(VecVT);
10254   SDLoc dl(N);
10255   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
10256     SDValue V = N->getOperand(Idx);
10257     if (V.isUndef())
10258       continue;
10259     if (V.getOpcode() == ISD::BITCAST &&
10260         V->getOperand(0).getValueType() == MVT::i32)
10261       // Fold obvious case.
10262       V = V.getOperand(0);
10263     else {
10264       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
10265       // Make the DAGCombiner fold the bitcasts.
10266       DCI.AddToWorklist(V.getNode());
10267     }
10268     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
10269     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
10270   }
10271   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
10272   // Make the DAGCombiner fold the bitcasts.
10273   DCI.AddToWorklist(Vec.getNode());
10274   return Vec;
10275 }
10276 
10277 /// PerformInsertEltCombine - Target-specific dag combine xforms for
10278 /// ISD::INSERT_VECTOR_ELT.
10279 static SDValue PerformInsertEltCombine(SDNode *N,
10280                                        TargetLowering::DAGCombinerInfo &DCI) {
10281   // Bitcast an i64 load inserted into a vector to f64.
10282   // Otherwise, the i64 value will be legalized to a pair of i32 values.
10283   EVT VT = N->getValueType(0);
10284   SDNode *Elt = N->getOperand(1).getNode();
10285   if (VT.getVectorElementType() != MVT::i64 ||
10286       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
10287     return SDValue();
10288 
10289   SelectionDAG &DAG = DCI.DAG;
10290   SDLoc dl(N);
10291   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10292                                  VT.getVectorNumElements());
10293   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
10294   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
10295   // Make the DAGCombiner fold the bitcasts.
10296   DCI.AddToWorklist(Vec.getNode());
10297   DCI.AddToWorklist(V.getNode());
10298   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
10299                                Vec, V, N->getOperand(2));
10300   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
10301 }
10302 
10303 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
10304 /// ISD::VECTOR_SHUFFLE.
10305 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
10306   // The LLVM shufflevector instruction does not require the shuffle mask
10307   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
10308   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
10309   // operands do not match the mask length, they are extended by concatenating
10310   // them with undef vectors.  That is probably the right thing for other
10311   // targets, but for NEON it is better to concatenate two double-register
10312   // size vector operands into a single quad-register size vector.  Do that
10313   // transformation here:
10314   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
10315   //   shuffle(concat(v1, v2), undef)
10316   SDValue Op0 = N->getOperand(0);
10317   SDValue Op1 = N->getOperand(1);
10318   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
10319       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
10320       Op0.getNumOperands() != 2 ||
10321       Op1.getNumOperands() != 2)
10322     return SDValue();
10323   SDValue Concat0Op1 = Op0.getOperand(1);
10324   SDValue Concat1Op1 = Op1.getOperand(1);
10325   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
10326     return SDValue();
10327   // Skip the transformation if any of the types are illegal.
10328   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10329   EVT VT = N->getValueType(0);
10330   if (!TLI.isTypeLegal(VT) ||
10331       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
10332       !TLI.isTypeLegal(Concat1Op1.getValueType()))
10333     return SDValue();
10334 
10335   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
10336                                   Op0.getOperand(0), Op1.getOperand(0));
10337   // Translate the shuffle mask.
10338   SmallVector<int, 16> NewMask;
10339   unsigned NumElts = VT.getVectorNumElements();
10340   unsigned HalfElts = NumElts/2;
10341   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
10342   for (unsigned n = 0; n < NumElts; ++n) {
10343     int MaskElt = SVN->getMaskElt(n);
10344     int NewElt = -1;
10345     if (MaskElt < (int)HalfElts)
10346       NewElt = MaskElt;
10347     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
10348       NewElt = HalfElts + MaskElt - NumElts;
10349     NewMask.push_back(NewElt);
10350   }
10351   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
10352                               DAG.getUNDEF(VT), NewMask);
10353 }
10354 
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
///
/// The function looks for an ISD::ADD that uses the memory node's address
/// operand and, if one is suitable, replaces both the memory node and the ADD
/// with a single ARMISD::*_UPD node.  At most one ADD is folded.  All
/// replacements are made through DCI.CombineTo, so the return value is always
/// an empty SDValue.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  // The address is operand 2 for intrinsics and stores, operand 1 otherwise.
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    // Only ADDs that use the same result of the address node are candidates.
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle.
    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      // For intrinsic nodes, operand 1 holds the intrinsic ID.
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      }
    } else {
      // Non-intrinsic nodes: the VLDxDUP nodes are sized like lane operations
      // below; generic loads and stores clear the flag again.
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    // The vector type comes from the first result for loads, and from the
    // stored operand for stores (which sits right after the address for
    // intrinsics, and at operand 1 for a generic store).
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    // Whole vectors are accessed unless this is a lane operation, in which
    // case only one element of each vector is.
    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    // Inc is whichever ADD operand is not the address.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint64_t IncVal = CInc->getZExtValue();
      if (IncVal != NumBytes)
        continue;
    } else if (NumBytes >= 3 * 16) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics).  We need to change the
    //   memory type to match the explicit alignment.  That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        // Re-express the access as a vector of integers whose element size
        // equals the known alignment, covering the same number of bytes.
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    // The results are: the loaded vectors (loads only), then an i32 that
    // replaces the ADD below, then the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value.  Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    // The new node reuses the original node's memory operand.
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
                                           Ops, AlignedVecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is an non-standard-aligned LOAD, the first result is the loaded
    // value.  Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    // N's results map to the vector results followed by the chain; the ADD is
    // replaced by the i32 result that sits between them.
    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    // Only one ADD is folded; stop scanning the uses.
    break;
  }
  return SDValue();
}
10564 
10565 static SDValue PerformVLDCombine(SDNode *N,
10566                                  TargetLowering::DAGCombinerInfo &DCI) {
10567   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
10568     return SDValue();
10569 
10570   return CombineBaseUpdate(N, DCI);
10571 }
10572 
10573 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
10574 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
10575 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
10576 /// return true.
10577 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10578   SelectionDAG &DAG = DCI.DAG;
10579   EVT VT = N->getValueType(0);
10580   // vldN-dup instructions only support 64-bit vectors for N > 1.
10581   if (!VT.is64BitVector())
10582     return false;
10583 
10584   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
10585   SDNode *VLD = N->getOperand(0).getNode();
10586   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
10587     return false;
10588   unsigned NumVecs = 0;
10589   unsigned NewOpc = 0;
10590   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
10591   if (IntNo == Intrinsic::arm_neon_vld2lane) {
10592     NumVecs = 2;
10593     NewOpc = ARMISD::VLD2DUP;
10594   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
10595     NumVecs = 3;
10596     NewOpc = ARMISD::VLD3DUP;
10597   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
10598     NumVecs = 4;
10599     NewOpc = ARMISD::VLD4DUP;
10600   } else {
10601     return false;
10602   }
10603 
10604   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
10605   // numbers match the load.
10606   unsigned VLDLaneNo =
10607     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
10608   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10609        UI != UE; ++UI) {
10610     // Ignore uses of the chain result.
10611     if (UI.getUse().getResNo() == NumVecs)
10612       continue;
10613     SDNode *User = *UI;
10614     if (User->getOpcode() != ARMISD::VDUPLANE ||
10615         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
10616       return false;
10617   }
10618 
10619   // Create the vldN-dup node.
10620   EVT Tys[5];
10621   unsigned n;
10622   for (n = 0; n < NumVecs; ++n)
10623     Tys[n] = VT;
10624   Tys[n] = MVT::Other;
10625   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
10626   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
10627   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
10628   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
10629                                            Ops, VLDMemInt->getMemoryVT(),
10630                                            VLDMemInt->getMemOperand());
10631 
10632   // Update the uses.
10633   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10634        UI != UE; ++UI) {
10635     unsigned ResNo = UI.getUse().getResNo();
10636     // Ignore uses of the chain result.
10637     if (ResNo == NumVecs)
10638       continue;
10639     SDNode *User = *UI;
10640     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
10641   }
10642 
10643   // Now the vldN-lane intrinsic is dead except for its chain result.
10644   // Update uses of the chain.
10645   std::vector<SDValue> VLDDupResults;
10646   for (unsigned n = 0; n < NumVecs; ++n)
10647     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
10648   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
10649   DCI.CombineTo(VLD, VLDDupResults);
10650 
10651   return true;
10652 }
10653 
10654 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
10655 /// ARMISD::VDUPLANE.
10656 static SDValue PerformVDUPLANECombine(SDNode *N,
10657                                       TargetLowering::DAGCombinerInfo &DCI) {
10658   SDValue Op = N->getOperand(0);
10659 
10660   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
10661   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
10662   if (CombineVLDDUP(N, DCI))
10663     return SDValue(N, 0);
10664 
10665   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
10666   // redundant.  Ignore bit_converts for now; element sizes are checked below.
10667   while (Op.getOpcode() == ISD::BITCAST)
10668     Op = Op.getOperand(0);
10669   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
10670     return SDValue();
10671 
10672   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
10673   unsigned EltSize = Op.getScalarValueSizeInBits();
10674   // The canonical VMOV for a zero vector uses a 32-bit element size.
10675   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10676   unsigned EltBits;
10677   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
10678     EltSize = 8;
10679   EVT VT = N->getValueType(0);
10680   if (EltSize > VT.getScalarSizeInBits())
10681     return SDValue();
10682 
10683   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
10684 }
10685 
10686 static SDValue PerformLOADCombine(SDNode *N,
10687                                   TargetLowering::DAGCombinerInfo &DCI) {
10688   EVT VT = N->getValueType(0);
10689 
10690   // If this is a legal vector load, try to combine it into a VLD1_UPD.
10691   if (ISD::isNormalLoad(N) && VT.isVector() &&
10692       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10693     return CombineBaseUpdate(N, DCI);
10694 
10695   return SDValue();
10696 }
10697 
10698 /// PerformSTORECombine - Target-specific dag combine xforms for
10699 /// ISD::STORE.
10700 static SDValue PerformSTORECombine(SDNode *N,
10701                                    TargetLowering::DAGCombinerInfo &DCI) {
10702   StoreSDNode *St = cast<StoreSDNode>(N);
10703   if (St->isVolatile())
10704     return SDValue();
10705 
10706   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
10707   // pack all of the elements in one place.  Next, store to memory in fewer
10708   // chunks.
10709   SDValue StVal = St->getValue();
10710   EVT VT = StVal.getValueType();
10711   if (St->isTruncatingStore() && VT.isVector()) {
10712     SelectionDAG &DAG = DCI.DAG;
10713     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10714     EVT StVT = St->getMemoryVT();
10715     unsigned NumElems = VT.getVectorNumElements();
10716     assert(StVT != VT && "Cannot truncate to the same type");
10717     unsigned FromEltSz = VT.getScalarSizeInBits();
10718     unsigned ToEltSz = StVT.getScalarSizeInBits();
10719 
10720     // From, To sizes and ElemCount must be pow of two
10721     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
10722 
10723     // We are going to use the original vector elt for storing.
10724     // Accumulated smaller vector elements must be a multiple of the store size.
10725     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
10726 
10727     unsigned SizeRatio  = FromEltSz / ToEltSz;
10728     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
10729 
10730     // Create a type on which we perform the shuffle.
10731     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
10732                                      NumElems*SizeRatio);
10733     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
10734 
10735     SDLoc DL(St);
10736     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
10737     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
10738     for (unsigned i = 0; i < NumElems; ++i)
10739       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
10740                           ? (i + 1) * SizeRatio - 1
10741                           : i * SizeRatio;
10742 
10743     // Can't shuffle using an illegal type.
10744     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
10745 
10746     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
10747                                 DAG.getUNDEF(WideVec.getValueType()),
10748                                 ShuffleVec);
10749     // At this point all of the data is stored at the bottom of the
10750     // register. We now need to save it to mem.
10751 
10752     // Find the largest store unit
10753     MVT StoreType = MVT::i8;
10754     for (MVT Tp : MVT::integer_valuetypes()) {
10755       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
10756         StoreType = Tp;
10757     }
10758     // Didn't find a legal store type.
10759     if (!TLI.isTypeLegal(StoreType))
10760       return SDValue();
10761 
10762     // Bitcast the original vector into a vector of store-size units
10763     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
10764             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
10765     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
10766     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
10767     SmallVector<SDValue, 8> Chains;
10768     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
10769                                         TLI.getPointerTy(DAG.getDataLayout()));
10770     SDValue BasePtr = St->getBasePtr();
10771 
10772     // Perform one or more big stores into memory.
10773     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
10774     for (unsigned I = 0; I < E; I++) {
10775       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
10776                                    StoreType, ShuffWide,
10777                                    DAG.getIntPtrConstant(I, DL));
10778       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
10779                                 St->getPointerInfo(), St->getAlignment(),
10780                                 St->getMemOperand()->getFlags());
10781       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
10782                             Increment);
10783       Chains.push_back(Ch);
10784     }
10785     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
10786   }
10787 
10788   if (!ISD::isNormalStore(St))
10789     return SDValue();
10790 
10791   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
10792   // ARM stores of arguments in the same cache line.
10793   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
10794       StVal.getNode()->hasOneUse()) {
10795     SelectionDAG  &DAG = DCI.DAG;
10796     bool isBigEndian = DAG.getDataLayout().isBigEndian();
10797     SDLoc DL(St);
10798     SDValue BasePtr = St->getBasePtr();
10799     SDValue NewST1 = DAG.getStore(
10800         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
10801         BasePtr, St->getPointerInfo(), St->getAlignment(),
10802         St->getMemOperand()->getFlags());
10803 
10804     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10805                                     DAG.getConstant(4, DL, MVT::i32));
10806     return DAG.getStore(NewST1.getValue(0), DL,
10807                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
10808                         OffsetPtr, St->getPointerInfo(),
10809                         std::min(4U, St->getAlignment() / 2),
10810                         St->getMemOperand()->getFlags());
10811   }
10812 
10813   if (StVal.getValueType() == MVT::i64 &&
10814       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10815 
10816     // Bitcast an i64 store extracted from a vector to f64.
10817     // Otherwise, the i64 value will be legalized to a pair of i32 values.
10818     SelectionDAG &DAG = DCI.DAG;
10819     SDLoc dl(StVal);
10820     SDValue IntVec = StVal.getOperand(0);
10821     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10822                                    IntVec.getValueType().getVectorNumElements());
10823     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
10824     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
10825                                  Vec, StVal.getOperand(1));
10826     dl = SDLoc(N);
10827     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
10828     // Make the DAGCombiner fold the bitcasts.
10829     DCI.AddToWorklist(Vec.getNode());
10830     DCI.AddToWorklist(ExtElt.getNode());
10831     DCI.AddToWorklist(V.getNode());
10832     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
10833                         St->getPointerInfo(), St->getAlignment(),
10834                         St->getMemOperand()->getFlags(), St->getAAInfo());
10835   }
10836 
10837   // If this is a legal vector store, try to combine it into a VST1_UPD.
10838   if (ISD::isNormalStore(N) && VT.isVector() &&
10839       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10840     return CombineBaseUpdate(N, DCI);
10841 
10842   return SDValue();
10843 }
10844 
10845 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
10846 /// can replace combinations of VMUL and VCVT (floating-point to integer)
10847 /// when the VMUL has a constant operand that is a power of 2.
10848 ///
10849 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10850 ///  vmul.f32        d16, d17, d16
10851 ///  vcvt.s32.f32    d16, d16
10852 /// becomes:
10853 ///  vcvt.s32.f32    d16, d16, #3
10854 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
10855                                   const ARMSubtarget *Subtarget) {
10856   if (!Subtarget->hasNEON())
10857     return SDValue();
10858 
10859   SDValue Op = N->getOperand(0);
10860   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10861       Op.getOpcode() != ISD::FMUL)
10862     return SDValue();
10863 
10864   SDValue ConstVec = Op->getOperand(1);
10865   if (!isa<BuildVectorSDNode>(ConstVec))
10866     return SDValue();
10867 
10868   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
10869   uint32_t FloatBits = FloatTy.getSizeInBits();
10870   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
10871   uint32_t IntBits = IntTy.getSizeInBits();
10872   unsigned NumLanes = Op.getValueType().getVectorNumElements();
10873   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10874     // These instructions only exist converting from f32 to i32. We can handle
10875     // smaller integers by generating an extra truncate, but larger ones would
10876     // be lossy. We also can't handle more then 4 lanes, since these intructions
10877     // only support v2i32/v4i32 types.
10878     return SDValue();
10879   }
10880 
10881   BitVector UndefElements;
10882   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10883   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10884   if (C == -1 || C == 0 || C > 32)
10885     return SDValue();
10886 
10887   SDLoc dl(N);
10888   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
10889   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
10890     Intrinsic::arm_neon_vcvtfp2fxu;
10891   SDValue FixConv = DAG.getNode(
10892       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10893       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
10894       DAG.getConstant(C, dl, MVT::i32));
10895 
10896   if (IntBits < FloatBits)
10897     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
10898 
10899   return FixConv;
10900 }
10901 
10902 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
10903 /// can replace combinations of VCVT (integer to floating-point) and VDIV
10904 /// when the VDIV has a constant operand that is a power of 2.
10905 ///
10906 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10907 ///  vcvt.f32.s32    d16, d16
10908 ///  vdiv.f32        d16, d17, d16
10909 /// becomes:
10910 ///  vcvt.f32.s32    d16, d16, #3
10911 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
10912                                   const ARMSubtarget *Subtarget) {
10913   if (!Subtarget->hasNEON())
10914     return SDValue();
10915 
10916   SDValue Op = N->getOperand(0);
10917   unsigned OpOpcode = Op.getNode()->getOpcode();
10918   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
10919       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
10920     return SDValue();
10921 
10922   SDValue ConstVec = N->getOperand(1);
10923   if (!isa<BuildVectorSDNode>(ConstVec))
10924     return SDValue();
10925 
10926   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
10927   uint32_t FloatBits = FloatTy.getSizeInBits();
10928   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
10929   uint32_t IntBits = IntTy.getSizeInBits();
10930   unsigned NumLanes = Op.getValueType().getVectorNumElements();
10931   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10932     // These instructions only exist converting from i32 to f32. We can handle
10933     // smaller integers by generating an extra extend, but larger ones would
10934     // be lossy. We also can't handle more then 4 lanes, since these intructions
10935     // only support v2i32/v4i32 types.
10936     return SDValue();
10937   }
10938 
10939   BitVector UndefElements;
10940   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10941   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10942   if (C == -1 || C == 0 || C > 32)
10943     return SDValue();
10944 
10945   SDLoc dl(N);
10946   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
10947   SDValue ConvInput = Op.getOperand(0);
10948   if (IntBits < FloatBits)
10949     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
10950                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10951                             ConvInput);
10952 
10953   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
10954     Intrinsic::arm_neon_vcvtfxu2fp;
10955   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10956                      Op.getValueType(),
10957                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
10958                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
10959 }
10960 
10961 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
10962 /// operand of a vector shift operation, where all the elements of the
10963 /// build_vector must have the same constant integer value.
10964 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10965   // Ignore bit_converts.
10966   while (Op.getOpcode() == ISD::BITCAST)
10967     Op = Op.getOperand(0);
10968   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10969   APInt SplatBits, SplatUndef;
10970   unsigned SplatBitSize;
10971   bool HasAnyUndefs;
10972   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10973                                       HasAnyUndefs, ElementBits) ||
10974       SplatBitSize > ElementBits)
10975     return false;
10976   Cnt = SplatBits.getSExtValue();
10977   return true;
10978 }
10979 
10980 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10981 /// operand of a vector shift left operation.  That value must be in the range:
10982 ///   0 <= Value < ElementBits for a left shift; or
10983 ///   0 <= Value <= ElementBits for a long left shift.
10984 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10985   assert(VT.isVector() && "vector shift count is not a vector type");
10986   int64_t ElementBits = VT.getScalarSizeInBits();
10987   if (! getVShiftImm(Op, ElementBits, Cnt))
10988     return false;
10989   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
10990 }
10991 
10992 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10993 /// operand of a vector shift right operation.  For a shift opcode, the value
10994 /// is positive, but for an intrinsic the value count must be negative. The
10995 /// absolute value must be in the range:
10996 ///   1 <= |Value| <= ElementBits for a right shift; or
10997 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
10998 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
10999                          int64_t &Cnt) {
11000   assert(VT.isVector() && "vector shift count is not a vector type");
11001   int64_t ElementBits = VT.getScalarSizeInBits();
11002   if (! getVShiftImm(Op, ElementBits, Cnt))
11003     return false;
11004   if (!isIntrinsic)
11005     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
11006   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
11007     Cnt = -Cnt;
11008     return true;
11009   }
11010   return false;
11011 }
11012 
11013 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
11014 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
11015   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
11016   switch (IntNo) {
11017   default:
11018     // Don't do anything for most intrinsics.
11019     break;
11020 
11021   // Vector shifts: check for immediate versions and lower them.
11022   // Note: This is done during DAG combining instead of DAG legalizing because
11023   // the build_vectors for 64-bit vector element shift counts are generally
11024   // not legal, and it is hard to see their values after they get legalized to
11025   // loads from a constant pool.
11026   case Intrinsic::arm_neon_vshifts:
11027   case Intrinsic::arm_neon_vshiftu:
11028   case Intrinsic::arm_neon_vrshifts:
11029   case Intrinsic::arm_neon_vrshiftu:
11030   case Intrinsic::arm_neon_vrshiftn:
11031   case Intrinsic::arm_neon_vqshifts:
11032   case Intrinsic::arm_neon_vqshiftu:
11033   case Intrinsic::arm_neon_vqshiftsu:
11034   case Intrinsic::arm_neon_vqshiftns:
11035   case Intrinsic::arm_neon_vqshiftnu:
11036   case Intrinsic::arm_neon_vqshiftnsu:
11037   case Intrinsic::arm_neon_vqrshiftns:
11038   case Intrinsic::arm_neon_vqrshiftnu:
11039   case Intrinsic::arm_neon_vqrshiftnsu: {
11040     EVT VT = N->getOperand(1).getValueType();
11041     int64_t Cnt;
11042     unsigned VShiftOpc = 0;
11043 
11044     switch (IntNo) {
11045     case Intrinsic::arm_neon_vshifts:
11046     case Intrinsic::arm_neon_vshiftu:
11047       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
11048         VShiftOpc = ARMISD::VSHL;
11049         break;
11050       }
11051       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
11052         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
11053                      ARMISD::VSHRs : ARMISD::VSHRu);
11054         break;
11055       }
11056       return SDValue();
11057 
11058     case Intrinsic::arm_neon_vrshifts:
11059     case Intrinsic::arm_neon_vrshiftu:
11060       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
11061         break;
11062       return SDValue();
11063 
11064     case Intrinsic::arm_neon_vqshifts:
11065     case Intrinsic::arm_neon_vqshiftu:
11066       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
11067         break;
11068       return SDValue();
11069 
11070     case Intrinsic::arm_neon_vqshiftsu:
11071       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
11072         break;
11073       llvm_unreachable("invalid shift count for vqshlu intrinsic");
11074 
11075     case Intrinsic::arm_neon_vrshiftn:
11076     case Intrinsic::arm_neon_vqshiftns:
11077     case Intrinsic::arm_neon_vqshiftnu:
11078     case Intrinsic::arm_neon_vqshiftnsu:
11079     case Intrinsic::arm_neon_vqrshiftns:
11080     case Intrinsic::arm_neon_vqrshiftnu:
11081     case Intrinsic::arm_neon_vqrshiftnsu:
11082       // Narrowing shifts require an immediate right shift.
11083       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
11084         break;
11085       llvm_unreachable("invalid shift count for narrowing vector shift "
11086                        "intrinsic");
11087 
11088     default:
11089       llvm_unreachable("unhandled vector shift");
11090     }
11091 
11092     switch (IntNo) {
11093     case Intrinsic::arm_neon_vshifts:
11094     case Intrinsic::arm_neon_vshiftu:
11095       // Opcode already set above.
11096       break;
11097     case Intrinsic::arm_neon_vrshifts:
11098       VShiftOpc = ARMISD::VRSHRs; break;
11099     case Intrinsic::arm_neon_vrshiftu:
11100       VShiftOpc = ARMISD::VRSHRu; break;
11101     case Intrinsic::arm_neon_vrshiftn:
11102       VShiftOpc = ARMISD::VRSHRN; break;
11103     case Intrinsic::arm_neon_vqshifts:
11104       VShiftOpc = ARMISD::VQSHLs; break;
11105     case Intrinsic::arm_neon_vqshiftu:
11106       VShiftOpc = ARMISD::VQSHLu; break;
11107     case Intrinsic::arm_neon_vqshiftsu:
11108       VShiftOpc = ARMISD::VQSHLsu; break;
11109     case Intrinsic::arm_neon_vqshiftns:
11110       VShiftOpc = ARMISD::VQSHRNs; break;
11111     case Intrinsic::arm_neon_vqshiftnu:
11112       VShiftOpc = ARMISD::VQSHRNu; break;
11113     case Intrinsic::arm_neon_vqshiftnsu:
11114       VShiftOpc = ARMISD::VQSHRNsu; break;
11115     case Intrinsic::arm_neon_vqrshiftns:
11116       VShiftOpc = ARMISD::VQRSHRNs; break;
11117     case Intrinsic::arm_neon_vqrshiftnu:
11118       VShiftOpc = ARMISD::VQRSHRNu; break;
11119     case Intrinsic::arm_neon_vqrshiftnsu:
11120       VShiftOpc = ARMISD::VQRSHRNsu; break;
11121     }
11122 
11123     SDLoc dl(N);
11124     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11125                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
11126   }
11127 
11128   case Intrinsic::arm_neon_vshiftins: {
11129     EVT VT = N->getOperand(1).getValueType();
11130     int64_t Cnt;
11131     unsigned VShiftOpc = 0;
11132 
11133     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
11134       VShiftOpc = ARMISD::VSLI;
11135     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
11136       VShiftOpc = ARMISD::VSRI;
11137     else {
11138       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
11139     }
11140 
11141     SDLoc dl(N);
11142     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11143                        N->getOperand(1), N->getOperand(2),
11144                        DAG.getConstant(Cnt, dl, MVT::i32));
11145   }
11146 
11147   case Intrinsic::arm_neon_vqrshifts:
11148   case Intrinsic::arm_neon_vqrshiftu:
11149     // No immediate versions of these to check for.
11150     break;
11151   }
11152 
11153   return SDValue();
11154 }
11155 
11156 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
11157 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
11158 /// combining instead of DAG legalizing because the build_vectors for 64-bit
11159 /// vector element shift counts are generally not legal, and it is hard to see
11160 /// their values after they get legalized to loads from a constant pool.
11161 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
11162                                    const ARMSubtarget *ST) {
11163   EVT VT = N->getValueType(0);
11164   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
11165     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
11166     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
11167     SDValue N1 = N->getOperand(1);
11168     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
11169       SDValue N0 = N->getOperand(0);
11170       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
11171           DAG.MaskedValueIsZero(N0.getOperand(0),
11172                                 APInt::getHighBitsSet(32, 16)))
11173         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
11174     }
11175   }
11176 
11177   // Nothing to be done for scalar shifts.
11178   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11179   if (!VT.isVector() || !TLI.isTypeLegal(VT))
11180     return SDValue();
11181 
11182   assert(ST->hasNEON() && "unexpected vector shift");
11183   int64_t Cnt;
11184 
11185   switch (N->getOpcode()) {
11186   default: llvm_unreachable("unexpected shift opcode");
11187 
11188   case ISD::SHL:
11189     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
11190       SDLoc dl(N);
11191       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
11192                          DAG.getConstant(Cnt, dl, MVT::i32));
11193     }
11194     break;
11195 
11196   case ISD::SRA:
11197   case ISD::SRL:
11198     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
11199       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
11200                             ARMISD::VSHRs : ARMISD::VSHRu);
11201       SDLoc dl(N);
11202       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
11203                          DAG.getConstant(Cnt, dl, MVT::i32));
11204     }
11205   }
11206   return SDValue();
11207 }
11208 
11209 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
11210 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
11211 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
11212                                     const ARMSubtarget *ST) {
11213   SDValue N0 = N->getOperand(0);
11214 
11215   // Check for sign- and zero-extensions of vector extract operations of 8-
11216   // and 16-bit vector elements.  NEON supports these directly.  They are
11217   // handled during DAG combining because type legalization will promote them
11218   // to 32-bit types and it is messy to recognize the operations after that.
11219   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
11220     SDValue Vec = N0.getOperand(0);
11221     SDValue Lane = N0.getOperand(1);
11222     EVT VT = N->getValueType(0);
11223     EVT EltVT = N0.getValueType();
11224     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11225 
11226     if (VT == MVT::i32 &&
11227         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
11228         TLI.isTypeLegal(Vec.getValueType()) &&
11229         isa<ConstantSDNode>(Lane)) {
11230 
11231       unsigned Opc = 0;
11232       switch (N->getOpcode()) {
11233       default: llvm_unreachable("unexpected opcode");
11234       case ISD::SIGN_EXTEND:
11235         Opc = ARMISD::VGETLANEs;
11236         break;
11237       case ISD::ZERO_EXTEND:
11238       case ISD::ANY_EXTEND:
11239         Opc = ARMISD::VGETLANEu;
11240         break;
11241       }
11242       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
11243     }
11244   }
11245 
11246   return SDValue();
11247 }
11248 
11249 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
11250                              APInt &KnownOne) {
11251   if (Op.getOpcode() == ARMISD::BFI) {
11252     // Conservatively, we can recurse down the first operand
11253     // and just mask out all affected bits.
11254     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
11255 
11256     // The operand to BFI is already a mask suitable for removing the bits it
11257     // sets.
11258     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
11259     const APInt &Mask = CI->getAPIntValue();
11260     KnownZero &= Mask;
11261     KnownOne &= Mask;
11262     return;
11263   }
11264   if (Op.getOpcode() == ARMISD::CMOV) {
11265     APInt KZ2(KnownZero.getBitWidth(), 0);
11266     APInt KO2(KnownOne.getBitWidth(), 0);
11267     computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
11268     computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
11269 
11270     KnownZero &= KZ2;
11271     KnownOne &= KO2;
11272     return;
11273   }
11274   return DAG.computeKnownBits(Op, KnownZero, KnownOne);
11275 }
11276 
11277 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
11278   // If we have a CMOV, OR and AND combination such as:
11279   //   if (x & CN)
11280   //     y |= CM;
11281   //
11282   // And:
11283   //   * CN is a single bit;
11284   //   * All bits covered by CM are known zero in y
11285   //
11286   // Then we can convert this into a sequence of BFI instructions. This will
11287   // always be a win if CM is a single bit, will always be no worse than the
11288   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
11289   // three bits (due to the extra IT instruction).
11290 
11291   SDValue Op0 = CMOV->getOperand(0);
11292   SDValue Op1 = CMOV->getOperand(1);
11293   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
11294   auto CC = CCNode->getAPIntValue().getLimitedValue();
11295   SDValue CmpZ = CMOV->getOperand(4);
11296 
11297   // The compare must be against zero.
11298   if (!isNullConstant(CmpZ->getOperand(1)))
11299     return SDValue();
11300 
11301   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
11302   SDValue And = CmpZ->getOperand(0);
11303   if (And->getOpcode() != ISD::AND)
11304     return SDValue();
11305   ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
11306   if (!AndC || !AndC->getAPIntValue().isPowerOf2())
11307     return SDValue();
11308   SDValue X = And->getOperand(0);
11309 
11310   if (CC == ARMCC::EQ) {
11311     // We're performing an "equal to zero" compare. Swap the operands so we
11312     // canonicalize on a "not equal to zero" compare.
11313     std::swap(Op0, Op1);
11314   } else {
11315     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
11316   }
11317 
11318   if (Op1->getOpcode() != ISD::OR)
11319     return SDValue();
11320 
11321   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
11322   if (!OrC)
11323     return SDValue();
11324   SDValue Y = Op1->getOperand(0);
11325 
11326   if (Op0 != Y)
11327     return SDValue();
11328 
11329   // Now, is it profitable to continue?
11330   APInt OrCI = OrC->getAPIntValue();
11331   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
11332   if (OrCI.countPopulation() > Heuristic)
11333     return SDValue();
11334 
11335   // Lastly, can we determine that the bits defined by OrCI
11336   // are zero in Y?
11337   APInt KnownZero, KnownOne;
11338   computeKnownBits(DAG, Y, KnownZero, KnownOne);
11339   if ((OrCI & KnownZero) != OrCI)
11340     return SDValue();
11341 
11342   // OK, we can do the combine.
11343   SDValue V = Y;
11344   SDLoc dl(X);
11345   EVT VT = X.getValueType();
11346   unsigned BitInX = AndC->getAPIntValue().logBase2();
11347 
11348   if (BitInX != 0) {
11349     // We must shift X first.
11350     X = DAG.getNode(ISD::SRL, dl, VT, X,
11351                     DAG.getConstant(BitInX, dl, VT));
11352   }
11353 
11354   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
11355        BitInY < NumActiveBits; ++BitInY) {
11356     if (OrCI[BitInY] == 0)
11357       continue;
11358     APInt Mask(VT.getSizeInBits(), 0);
11359     Mask.setBit(BitInY);
11360     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
11361                     // Confusingly, the operand is an *inverted* mask.
11362                     DAG.getConstant(~Mask, dl, VT));
11363   }
11364 
11365   return V;
11366 }
11367 
11368 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
11369 SDValue
11370 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
11371   SDValue Cmp = N->getOperand(4);
11372   if (Cmp.getOpcode() != ARMISD::CMPZ)
11373     // Only looking at NE cases.
11374     return SDValue();
11375 
11376   EVT VT = N->getValueType(0);
11377   SDLoc dl(N);
11378   SDValue LHS = Cmp.getOperand(0);
11379   SDValue RHS = Cmp.getOperand(1);
11380   SDValue Chain = N->getOperand(0);
11381   SDValue BB = N->getOperand(1);
11382   SDValue ARMcc = N->getOperand(2);
11383   ARMCC::CondCodes CC =
11384     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11385 
11386   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
11387   // -> (brcond Chain BB CC CPSR Cmp)
11388   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
11389       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
11390       LHS->getOperand(0)->hasOneUse()) {
11391     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
11392     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
11393     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11394     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11395     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
11396         (LHS01C && LHS01C->getZExtValue() == 1) &&
11397         (LHS1C && LHS1C->getZExtValue() == 1) &&
11398         (RHSC && RHSC->getZExtValue() == 0)) {
11399       return DAG.getNode(
11400           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
11401           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
11402     }
11403   }
11404 
11405   return SDValue();
11406 }
11407 
11408 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
11409 SDValue
11410 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
11411   SDValue Cmp = N->getOperand(4);
11412   if (Cmp.getOpcode() != ARMISD::CMPZ)
11413     // Only looking at EQ and NE cases.
11414     return SDValue();
11415 
11416   EVT VT = N->getValueType(0);
11417   SDLoc dl(N);
11418   SDValue LHS = Cmp.getOperand(0);
11419   SDValue RHS = Cmp.getOperand(1);
11420   SDValue FalseVal = N->getOperand(0);
11421   SDValue TrueVal = N->getOperand(1);
11422   SDValue ARMcc = N->getOperand(2);
11423   ARMCC::CondCodes CC =
11424     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11425 
11426   // BFI is only available on V6T2+.
11427   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
11428     SDValue R = PerformCMOVToBFICombine(N, DAG);
11429     if (R)
11430       return R;
11431   }
11432 
11433   // Simplify
11434   //   mov     r1, r0
11435   //   cmp     r1, x
11436   //   mov     r0, y
11437   //   moveq   r0, x
11438   // to
11439   //   cmp     r0, x
11440   //   movne   r0, y
11441   //
11442   //   mov     r1, r0
11443   //   cmp     r1, x
11444   //   mov     r0, x
11445   //   movne   r0, y
11446   // to
11447   //   cmp     r0, x
11448   //   movne   r0, y
11449   /// FIXME: Turn this into a target neutral optimization?
11450   SDValue Res;
11451   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
11452     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
11453                       N->getOperand(3), Cmp);
11454   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
11455     SDValue ARMcc;
11456     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
11457     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
11458                       N->getOperand(3), NewCmp);
11459   }
11460 
11461   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
11462   // -> (cmov F T CC CPSR Cmp)
11463   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
11464     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
11465     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11466     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11467     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
11468         (LHS1C && LHS1C->getZExtValue() == 1) &&
11469         (RHSC && RHSC->getZExtValue() == 0)) {
11470       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
11471                          LHS->getOperand(2), LHS->getOperand(3),
11472                          LHS->getOperand(4));
11473     }
11474   }
11475 
11476   if (Res.getNode()) {
11477     APInt KnownZero, KnownOne;
11478     DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
11479     // Capture demanded bits information that would be otherwise lost.
11480     if (KnownZero == 0xfffffffe)
11481       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11482                         DAG.getValueType(MVT::i1));
11483     else if (KnownZero == 0xffffff00)
11484       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11485                         DAG.getValueType(MVT::i8));
11486     else if (KnownZero == 0xffff0000)
11487       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11488                         DAG.getValueType(MVT::i16));
11489   }
11490 
11491   return Res;
11492 }
11493 
11494 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
11495                                              DAGCombinerInfo &DCI) const {
11496   switch (N->getOpcode()) {
11497   default: break;
11498   case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
11499   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
11500   case ISD::SUB:        return PerformSUBCombine(N, DCI);
11501   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
11502   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
11503   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
11504   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
11505   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
11506   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
11507   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
11508   case ISD::STORE:      return PerformSTORECombine(N, DCI);
11509   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
11510   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
11511   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
11512   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
11513   case ISD::FP_TO_SINT:
11514   case ISD::FP_TO_UINT:
11515     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
11516   case ISD::FDIV:
11517     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
11518   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
11519   case ISD::SHL:
11520   case ISD::SRA:
11521   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
11522   case ISD::SIGN_EXTEND:
11523   case ISD::ZERO_EXTEND:
11524   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
11525   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
11526   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
11527   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
11528   case ARMISD::VLD2DUP:
11529   case ARMISD::VLD3DUP:
11530   case ARMISD::VLD4DUP:
11531     return PerformVLDCombine(N, DCI);
11532   case ARMISD::BUILD_VECTOR:
11533     return PerformARMBUILD_VECTORCombine(N, DCI);
11534   case ISD::INTRINSIC_VOID:
11535   case ISD::INTRINSIC_W_CHAIN:
11536     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
11537     case Intrinsic::arm_neon_vld1:
11538     case Intrinsic::arm_neon_vld2:
11539     case Intrinsic::arm_neon_vld3:
11540     case Intrinsic::arm_neon_vld4:
11541     case Intrinsic::arm_neon_vld2lane:
11542     case Intrinsic::arm_neon_vld3lane:
11543     case Intrinsic::arm_neon_vld4lane:
11544     case Intrinsic::arm_neon_vst1:
11545     case Intrinsic::arm_neon_vst2:
11546     case Intrinsic::arm_neon_vst3:
11547     case Intrinsic::arm_neon_vst4:
11548     case Intrinsic::arm_neon_vst2lane:
11549     case Intrinsic::arm_neon_vst3lane:
11550     case Intrinsic::arm_neon_vst4lane:
11551       return PerformVLDCombine(N, DCI);
11552     default: break;
11553     }
11554     break;
11555   }
11556   return SDValue();
11557 }
11558 
11559 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
11560                                                           EVT VT) const {
11561   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
11562 }
11563 
11564 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
11565                                                        unsigned,
11566                                                        unsigned,
11567                                                        bool *Fast) const {
11568   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
11569   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
11570 
11571   switch (VT.getSimpleVT().SimpleTy) {
11572   default:
11573     return false;
11574   case MVT::i8:
11575   case MVT::i16:
11576   case MVT::i32: {
11577     // Unaligned access can use (for example) LRDB, LRDH, LDR
11578     if (AllowsUnaligned) {
11579       if (Fast)
11580         *Fast = Subtarget->hasV7Ops();
11581       return true;
11582     }
11583     return false;
11584   }
11585   case MVT::f64:
11586   case MVT::v2f64: {
11587     // For any little-endian targets with neon, we can support unaligned ld/st
11588     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
11589     // A big-endian target may also explicitly support unaligned accesses
11590     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
11591       if (Fast)
11592         *Fast = true;
11593       return true;
11594     }
11595     return false;
11596   }
11597   }
11598 }
11599 
/// Return true when both \p SrcAlign and \p DstAlign satisfy \p AlignCheck.
/// An alignment of 0 always satisfies the check; otherwise the alignment
/// must be a multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  auto Satisfies = [AlignCheck](unsigned Align) {
    return Align == 0 || Align % AlignCheck == 0;
  };
  return Satisfies(SrcAlign) && Satisfies(DstAlign);
}
11605 
11606 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
11607                                            unsigned DstAlign, unsigned SrcAlign,
11608                                            bool IsMemset, bool ZeroMemset,
11609                                            bool MemcpyStrSrc,
11610                                            MachineFunction &MF) const {
11611   const Function *F = MF.getFunction();
11612 
11613   // See if we can use NEON instructions for this...
11614   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
11615       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
11616     bool Fast;
11617     if (Size >= 16 &&
11618         (memOpAlign(SrcAlign, DstAlign, 16) ||
11619          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
11620       return MVT::v2f64;
11621     } else if (Size >= 8 &&
11622                (memOpAlign(SrcAlign, DstAlign, 8) ||
11623                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
11624                  Fast))) {
11625       return MVT::f64;
11626     }
11627   }
11628 
11629   // Lowering to i32/i16 if the size permits.
11630   if (Size >= 4)
11631     return MVT::i32;
11632   else if (Size >= 2)
11633     return MVT::i16;
11634 
11635   // Let the target-independent logic figure it out.
11636   return MVT::Other;
11637 }
11638 
11639 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11640   if (Val.getOpcode() != ISD::LOAD)
11641     return false;
11642 
11643   EVT VT1 = Val.getValueType();
11644   if (!VT1.isSimple() || !VT1.isInteger() ||
11645       !VT2.isSimple() || !VT2.isInteger())
11646     return false;
11647 
11648   switch (VT1.getSimpleVT().SimpleTy) {
11649   default: break;
11650   case MVT::i1:
11651   case MVT::i8:
11652   case MVT::i16:
11653     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
11654     return true;
11655   }
11656 
11657   return false;
11658 }
11659 
11660 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
11661   EVT VT = ExtVal.getValueType();
11662 
11663   if (!isTypeLegal(VT))
11664     return false;
11665 
11666   // Don't create a loadext if we can fold the extension into a wide/long
11667   // instruction.
11668   // If there's more than one user instruction, the loadext is desirable no
11669   // matter what.  There can be two uses by the same instruction.
11670   if (ExtVal->use_empty() ||
11671       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
11672     return true;
11673 
11674   SDNode *U = *ExtVal->use_begin();
11675   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
11676        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
11677     return false;
11678 
11679   return true;
11680 }
11681 
11682 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
11683   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11684     return false;
11685 
11686   if (!isTypeLegal(EVT::getEVT(Ty1)))
11687     return false;
11688 
11689   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
11690 
11691   // Assuming the caller doesn't have a zeroext or signext return parameter,
11692   // truncation all the way down to i1 is valid.
11693   return true;
11694 }
11695 
11696 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
11697                                                 const AddrMode &AM, Type *Ty,
11698                                                 unsigned AS) const {
11699   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
11700     if (Subtarget->hasFPAO())
11701       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
11702     return 0;
11703   }
11704   return -1;
11705 }
11706 
11707 
11708 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
11709   if (V < 0)
11710     return false;
11711 
11712   unsigned Scale = 1;
11713   switch (VT.getSimpleVT().SimpleTy) {
11714   default: return false;
11715   case MVT::i1:
11716   case MVT::i8:
11717     // Scale == 1;
11718     break;
11719   case MVT::i16:
11720     // Scale == 2;
11721     Scale = 2;
11722     break;
11723   case MVT::i32:
11724     // Scale == 4;
11725     Scale = 4;
11726     break;
11727   }
11728 
11729   if ((V & (Scale - 1)) != 0)
11730     return false;
11731   V /= Scale;
11732   return V == (V & ((1LL << 5) - 1));
11733 }
11734 
11735 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
11736                                       const ARMSubtarget *Subtarget) {
11737   bool isNeg = false;
11738   if (V < 0) {
11739     isNeg = true;
11740     V = - V;
11741   }
11742 
11743   switch (VT.getSimpleVT().SimpleTy) {
11744   default: return false;
11745   case MVT::i1:
11746   case MVT::i8:
11747   case MVT::i16:
11748   case MVT::i32:
11749     // + imm12 or - imm8
11750     if (isNeg)
11751       return V == (V & ((1LL << 8) - 1));
11752     return V == (V & ((1LL << 12) - 1));
11753   case MVT::f32:
11754   case MVT::f64:
11755     // Same as ARM mode. FIXME: NEON?
11756     if (!Subtarget->hasVFP2())
11757       return false;
11758     if ((V & 3) != 0)
11759       return false;
11760     V >>= 2;
11761     return V == (V & ((1LL << 8) - 1));
11762   }
11763 }
11764 
11765 /// isLegalAddressImmediate - Return true if the integer value can be used
11766 /// as the offset of the target addressing mode for load / store of the
11767 /// given type.
11768 static bool isLegalAddressImmediate(int64_t V, EVT VT,
11769                                     const ARMSubtarget *Subtarget) {
11770   if (V == 0)
11771     return true;
11772 
11773   if (!VT.isSimple())
11774     return false;
11775 
11776   if (Subtarget->isThumb1Only())
11777     return isLegalT1AddressImmediate(V, VT);
11778   else if (Subtarget->isThumb2())
11779     return isLegalT2AddressImmediate(V, VT, Subtarget);
11780 
11781   // ARM mode.
11782   if (V < 0)
11783     V = - V;
11784   switch (VT.getSimpleVT().SimpleTy) {
11785   default: return false;
11786   case MVT::i1:
11787   case MVT::i8:
11788   case MVT::i32:
11789     // +- imm12
11790     return V == (V & ((1LL << 12) - 1));
11791   case MVT::i16:
11792     // +- imm8
11793     return V == (V & ((1LL << 8) - 1));
11794   case MVT::f32:
11795   case MVT::f64:
11796     if (!Subtarget->hasVFP2()) // FIXME: NEON?
11797       return false;
11798     if ((V & 3) != 0)
11799       return false;
11800     V >>= 2;
11801     return V == (V & ((1LL << 8) - 1));
11802   }
11803 }
11804 
11805 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
11806                                                       EVT VT) const {
11807   int Scale = AM.Scale;
11808   if (Scale < 0)
11809     return false;
11810 
11811   switch (VT.getSimpleVT().SimpleTy) {
11812   default: return false;
11813   case MVT::i1:
11814   case MVT::i8:
11815   case MVT::i16:
11816   case MVT::i32:
11817     if (Scale == 1)
11818       return true;
11819     // r + r << imm
11820     Scale = Scale & ~1;
11821     return Scale == 2 || Scale == 4 || Scale == 8;
11822   case MVT::i64:
11823     // r + r
11824     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11825       return true;
11826     return false;
11827   case MVT::isVoid:
11828     // Note, we allow "void" uses (basically, uses that aren't loads or
11829     // stores), because arm allows folding a scale into many arithmetic
11830     // operations.  This should be made more precise and revisited later.
11831 
11832     // Allow r << imm, but the imm has to be a multiple of two.
11833     if (Scale & 1) return false;
11834     return isPowerOf2_32(Scale);
11835   }
11836 }
11837 
11838 /// isLegalAddressingMode - Return true if the addressing mode represented
11839 /// by AM is legal for this target, for a load/store of the specified type.
11840 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
11841                                               const AddrMode &AM, Type *Ty,
11842                                               unsigned AS) const {
11843   EVT VT = getValueType(DL, Ty, true);
11844   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
11845     return false;
11846 
11847   // Can never fold addr of global into load/store.
11848   if (AM.BaseGV)
11849     return false;
11850 
11851   switch (AM.Scale) {
11852   case 0:  // no scale reg, must be "r+i" or "r", or "i".
11853     break;
11854   case 1:
11855     if (Subtarget->isThumb1Only())
11856       return false;
11857     LLVM_FALLTHROUGH;
11858   default:
11859     // ARM doesn't support any R+R*scale+imm addr modes.
11860     if (AM.BaseOffs)
11861       return false;
11862 
11863     if (!VT.isSimple())
11864       return false;
11865 
11866     if (Subtarget->isThumb2())
11867       return isLegalT2ScaledAddressingMode(AM, VT);
11868 
11869     int Scale = AM.Scale;
11870     switch (VT.getSimpleVT().SimpleTy) {
11871     default: return false;
11872     case MVT::i1:
11873     case MVT::i8:
11874     case MVT::i32:
11875       if (Scale < 0) Scale = -Scale;
11876       if (Scale == 1)
11877         return true;
11878       // r + r << imm
11879       return isPowerOf2_32(Scale & ~1);
11880     case MVT::i16:
11881     case MVT::i64:
11882       // r + r
11883       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11884         return true;
11885       return false;
11886 
11887     case MVT::isVoid:
11888       // Note, we allow "void" uses (basically, uses that aren't loads or
11889       // stores), because arm allows folding a scale into many arithmetic
11890       // operations.  This should be made more precise and revisited later.
11891 
11892       // Allow r << imm, but the imm has to be a multiple of two.
11893       if (Scale & 1) return false;
11894       return isPowerOf2_32(Scale);
11895     }
11896   }
11897   return true;
11898 }
11899 
11900 /// isLegalICmpImmediate - Return true if the specified immediate is legal
11901 /// icmp immediate, that is the target has icmp instructions which can compare
11902 /// a register against the immediate without having to materialize the
11903 /// immediate into a register.
11904 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
11905   // Thumb2 and ARM modes can use cmn for negative immediates.
11906   if (!Subtarget->isThumb())
11907     return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
11908   if (Subtarget->isThumb2())
11909     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
11910   // Thumb1 doesn't have cmn, and only 8-bit immediates.
11911   return Imm >= 0 && Imm <= 255;
11912 }
11913 
11914 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
11915 /// *or sub* immediate, that is the target has add or sub instructions which can
11916 /// add a register with the immediate without having to materialize the
11917 /// immediate into a register.
11918 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
11919   // Same encoding for add/sub, just flip the sign.
11920   int64_t AbsImm = std::abs(Imm);
11921   if (!Subtarget->isThumb())
11922     return ARM_AM::getSOImmVal(AbsImm) != -1;
11923   if (Subtarget->isThumb2())
11924     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
11925   // Thumb1 only has 8-bit unsigned immediate.
11926   return AbsImm >= 0 && AbsImm <= 255;
11927 }
11928 
11929 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
11930                                       bool isSEXTLoad, SDValue &Base,
11931                                       SDValue &Offset, bool &isInc,
11932                                       SelectionDAG &DAG) {
11933   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
11934     return false;
11935 
11936   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
11937     // AddressingMode 3
11938     Base = Ptr->getOperand(0);
11939     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11940       int RHSC = (int)RHS->getZExtValue();
11941       if (RHSC < 0 && RHSC > -256) {
11942         assert(Ptr->getOpcode() == ISD::ADD);
11943         isInc = false;
11944         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11945         return true;
11946       }
11947     }
11948     isInc = (Ptr->getOpcode() == ISD::ADD);
11949     Offset = Ptr->getOperand(1);
11950     return true;
11951   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
11952     // AddressingMode 2
11953     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11954       int RHSC = (int)RHS->getZExtValue();
11955       if (RHSC < 0 && RHSC > -0x1000) {
11956         assert(Ptr->getOpcode() == ISD::ADD);
11957         isInc = false;
11958         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11959         Base = Ptr->getOperand(0);
11960         return true;
11961       }
11962     }
11963 
11964     if (Ptr->getOpcode() == ISD::ADD) {
11965       isInc = true;
11966       ARM_AM::ShiftOpc ShOpcVal=
11967         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
11968       if (ShOpcVal != ARM_AM::no_shift) {
11969         Base = Ptr->getOperand(1);
11970         Offset = Ptr->getOperand(0);
11971       } else {
11972         Base = Ptr->getOperand(0);
11973         Offset = Ptr->getOperand(1);
11974       }
11975       return true;
11976     }
11977 
11978     isInc = (Ptr->getOpcode() == ISD::ADD);
11979     Base = Ptr->getOperand(0);
11980     Offset = Ptr->getOperand(1);
11981     return true;
11982   }
11983 
11984   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
11985   return false;
11986 }
11987 
11988 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
11989                                      bool isSEXTLoad, SDValue &Base,
11990                                      SDValue &Offset, bool &isInc,
11991                                      SelectionDAG &DAG) {
11992   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
11993     return false;
11994 
11995   Base = Ptr->getOperand(0);
11996   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11997     int RHSC = (int)RHS->getZExtValue();
11998     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
11999       assert(Ptr->getOpcode() == ISD::ADD);
12000       isInc = false;
12001       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
12002       return true;
12003     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
12004       isInc = Ptr->getOpcode() == ISD::ADD;
12005       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
12006       return true;
12007     }
12008   }
12009 
12010   return false;
12011 }
12012 
12013 /// getPreIndexedAddressParts - returns true by value, base pointer and
12014 /// offset pointer and addressing mode by reference if the node's address
12015 /// can be legally represented as pre-indexed load / store address.
12016 bool
12017 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
12018                                              SDValue &Offset,
12019                                              ISD::MemIndexedMode &AM,
12020                                              SelectionDAG &DAG) const {
12021   if (Subtarget->isThumb1Only())
12022     return false;
12023 
12024   EVT VT;
12025   SDValue Ptr;
12026   bool isSEXTLoad = false;
12027   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
12028     Ptr = LD->getBasePtr();
12029     VT  = LD->getMemoryVT();
12030     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
12031   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
12032     Ptr = ST->getBasePtr();
12033     VT  = ST->getMemoryVT();
12034   } else
12035     return false;
12036 
12037   bool isInc;
12038   bool isLegal = false;
12039   if (Subtarget->isThumb2())
12040     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
12041                                        Offset, isInc, DAG);
12042   else
12043     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
12044                                         Offset, isInc, DAG);
12045   if (!isLegal)
12046     return false;
12047 
12048   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
12049   return true;
12050 }
12051 
12052 /// getPostIndexedAddressParts - returns true by value, base pointer and
12053 /// offset pointer and addressing mode by reference if this node can be
12054 /// combined with a load / store to form a post-indexed load / store.
12055 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
12056                                                    SDValue &Base,
12057                                                    SDValue &Offset,
12058                                                    ISD::MemIndexedMode &AM,
12059                                                    SelectionDAG &DAG) const {
12060   EVT VT;
12061   SDValue Ptr;
12062   bool isSEXTLoad = false, isNonExt;
12063   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
12064     VT  = LD->getMemoryVT();
12065     Ptr = LD->getBasePtr();
12066     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
12067     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
12068   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
12069     VT  = ST->getMemoryVT();
12070     Ptr = ST->getBasePtr();
12071     isNonExt = !ST->isTruncatingStore();
12072   } else
12073     return false;
12074 
12075   if (Subtarget->isThumb1Only()) {
12076     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
12077     // must be non-extending/truncating, i32, with an offset of 4.
12078     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
12079     if (Op->getOpcode() != ISD::ADD || !isNonExt)
12080       return false;
12081     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12082     if (!RHS || RHS->getZExtValue() != 4)
12083       return false;
12084 
12085     Offset = Op->getOperand(1);
12086     Base = Op->getOperand(0);
12087     AM = ISD::POST_INC;
12088     return true;
12089   }
12090 
12091   bool isInc;
12092   bool isLegal = false;
12093   if (Subtarget->isThumb2())
12094     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12095                                        isInc, DAG);
12096   else
12097     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12098                                         isInc, DAG);
12099   if (!isLegal)
12100     return false;
12101 
12102   if (Ptr != Base) {
12103     // Swap base ptr and offset to catch more post-index load / store when
12104     // it's legal. In Thumb2 mode, offset must be an immediate.
12105     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
12106         !Subtarget->isThumb2())
12107       std::swap(Base, Offset);
12108 
12109     // Post-indexed load / store update the base pointer.
12110     if (Ptr != Base)
12111       return false;
12112   }
12113 
12114   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
12115   return true;
12116 }
12117 
12118 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
12119                                                       APInt &KnownZero,
12120                                                       APInt &KnownOne,
12121                                                       const SelectionDAG &DAG,
12122                                                       unsigned Depth) const {
12123   unsigned BitWidth = KnownOne.getBitWidth();
12124   KnownZero = KnownOne = APInt(BitWidth, 0);
12125   switch (Op.getOpcode()) {
12126   default: break;
12127   case ARMISD::ADDC:
12128   case ARMISD::ADDE:
12129   case ARMISD::SUBC:
12130   case ARMISD::SUBE:
12131     // These nodes' second result is a boolean
12132     if (Op.getResNo() == 0)
12133       break;
12134     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
12135     break;
12136   case ARMISD::CMOV: {
12137     // Bits are known zero/one if known on the LHS and RHS.
12138     DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
12139     if (KnownZero == 0 && KnownOne == 0) return;
12140 
12141     APInt KnownZeroRHS, KnownOneRHS;
12142     DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
12143     KnownZero &= KnownZeroRHS;
12144     KnownOne  &= KnownOneRHS;
12145     return;
12146   }
12147   case ISD::INTRINSIC_W_CHAIN: {
12148     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
12149     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
12150     switch (IntID) {
12151     default: return;
12152     case Intrinsic::arm_ldaex:
12153     case Intrinsic::arm_ldrex: {
12154       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
12155       unsigned MemBits = VT.getScalarSizeInBits();
12156       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
12157       return;
12158     }
12159     }
12160   }
12161   }
12162 }
12163 
12164 //===----------------------------------------------------------------------===//
12165 //                           ARM Inline Assembly Support
12166 //===----------------------------------------------------------------------===//
12167 
12168 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
12169   // Looking for "rev" which is V6+.
12170   if (!Subtarget->hasV6Ops())
12171     return false;
12172 
12173   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
12174   std::string AsmStr = IA->getAsmString();
12175   SmallVector<StringRef, 4> AsmPieces;
12176   SplitString(AsmStr, AsmPieces, ";\n");
12177 
12178   switch (AsmPieces.size()) {
12179   default: return false;
12180   case 1:
12181     AsmStr = AsmPieces[0];
12182     AsmPieces.clear();
12183     SplitString(AsmStr, AsmPieces, " \t,");
12184 
12185     // rev $0, $1
12186     if (AsmPieces.size() == 3 &&
12187         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
12188         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
12189       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
12190       if (Ty && Ty->getBitWidth() == 32)
12191         return IntrinsicLowering::LowerToByteSwap(CI);
12192     }
12193     break;
12194   }
12195 
12196   return false;
12197 }
12198 
12199 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12200   // At this point, we have to lower this constraint to something else, so we
12201   // lower it to an "r" or "w". However, by doing this we will force the result
12202   // to be in register, while the X constraint is much more permissive.
12203   //
12204   // Although we are correct (we are free to emit anything, without
12205   // constraints), we might break use cases that would expect us to be more
12206   // efficient and emit something else.
12207   if (!Subtarget->hasVFP2())
12208     return "r";
12209   if (ConstraintVT.isFloatingPoint())
12210     return "w";
12211   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
12212      (ConstraintVT.getSizeInBits() == 64 ||
12213       ConstraintVT.getSizeInBits() == 128))
12214     return "w";
12215 
12216   return "r";
12217 }
12218 
12219 /// getConstraintType - Given a constraint letter, return the type of
12220 /// constraint it is for this target.
12221 ARMTargetLowering::ConstraintType
12222 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
12223   if (Constraint.size() == 1) {
12224     switch (Constraint[0]) {
12225     default:  break;
12226     case 'l': return C_RegisterClass;
12227     case 'w': return C_RegisterClass;
12228     case 'h': return C_RegisterClass;
12229     case 'x': return C_RegisterClass;
12230     case 't': return C_RegisterClass;
12231     case 'j': return C_Other; // Constant for movw.
12232       // An address with a single base register. Due to the way we
12233       // currently handle addresses it is the same as an 'r' memory constraint.
12234     case 'Q': return C_Memory;
12235     }
12236   } else if (Constraint.size() == 2) {
12237     switch (Constraint[0]) {
12238     default: break;
12239     // All 'U+' constraints are addresses.
12240     case 'U': return C_Memory;
12241     }
12242   }
12243   return TargetLowering::getConstraintType(Constraint);
12244 }
12245 
12246 /// Examine constraint type and operand type and determine a weight value.
12247 /// This object must already have been set up with the operand type
12248 /// and the current alternative constraint selected.
12249 TargetLowering::ConstraintWeight
12250 ARMTargetLowering::getSingleConstraintMatchWeight(
12251     AsmOperandInfo &info, const char *constraint) const {
12252   ConstraintWeight weight = CW_Invalid;
12253   Value *CallOperandVal = info.CallOperandVal;
12254     // If we don't have a value, we can't do a match,
12255     // but allow it at the lowest weight.
12256   if (!CallOperandVal)
12257     return CW_Default;
12258   Type *type = CallOperandVal->getType();
12259   // Look at the constraint type.
12260   switch (*constraint) {
12261   default:
12262     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12263     break;
12264   case 'l':
12265     if (type->isIntegerTy()) {
12266       if (Subtarget->isThumb())
12267         weight = CW_SpecificReg;
12268       else
12269         weight = CW_Register;
12270     }
12271     break;
12272   case 'w':
12273     if (type->isFloatingPointTy())
12274       weight = CW_Register;
12275     break;
12276   }
12277   return weight;
12278 }
12279 
12280 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
12281 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
12282     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12283   if (Constraint.size() == 1) {
12284     // GCC ARM Constraint Letters
12285     switch (Constraint[0]) {
12286     case 'l': // Low regs or general regs.
12287       if (Subtarget->isThumb())
12288         return RCPair(0U, &ARM::tGPRRegClass);
12289       return RCPair(0U, &ARM::GPRRegClass);
12290     case 'h': // High regs or no regs.
12291       if (Subtarget->isThumb())
12292         return RCPair(0U, &ARM::hGPRRegClass);
12293       break;
12294     case 'r':
12295       if (Subtarget->isThumb1Only())
12296         return RCPair(0U, &ARM::tGPRRegClass);
12297       return RCPair(0U, &ARM::GPRRegClass);
12298     case 'w':
12299       if (VT == MVT::Other)
12300         break;
12301       if (VT == MVT::f32)
12302         return RCPair(0U, &ARM::SPRRegClass);
12303       if (VT.getSizeInBits() == 64)
12304         return RCPair(0U, &ARM::DPRRegClass);
12305       if (VT.getSizeInBits() == 128)
12306         return RCPair(0U, &ARM::QPRRegClass);
12307       break;
12308     case 'x':
12309       if (VT == MVT::Other)
12310         break;
12311       if (VT == MVT::f32)
12312         return RCPair(0U, &ARM::SPR_8RegClass);
12313       if (VT.getSizeInBits() == 64)
12314         return RCPair(0U, &ARM::DPR_8RegClass);
12315       if (VT.getSizeInBits() == 128)
12316         return RCPair(0U, &ARM::QPR_8RegClass);
12317       break;
12318     case 't':
12319       if (VT == MVT::f32)
12320         return RCPair(0U, &ARM::SPRRegClass);
12321       break;
12322     }
12323   }
12324   if (StringRef("{cc}").equals_lower(Constraint))
12325     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
12326 
12327   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12328 }
12329 
12330 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12331 /// vector.  If it is invalid, don't add anything to Ops.
12332 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
12333                                                      std::string &Constraint,
12334                                                      std::vector<SDValue>&Ops,
12335                                                      SelectionDAG &DAG) const {
12336   SDValue Result;
12337 
12338   // Currently only support length 1 constraints.
12339   if (Constraint.length() != 1) return;
12340 
12341   char ConstraintLetter = Constraint[0];
12342   switch (ConstraintLetter) {
12343   default: break;
12344   case 'j':
12345   case 'I': case 'J': case 'K': case 'L':
12346   case 'M': case 'N': case 'O':
12347     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12348     if (!C)
12349       return;
12350 
12351     int64_t CVal64 = C->getSExtValue();
12352     int CVal = (int) CVal64;
12353     // None of these constraints allow values larger than 32 bits.  Check
12354     // that the value fits in an int.
12355     if (CVal != CVal64)
12356       return;
12357 
12358     switch (ConstraintLetter) {
12359       case 'j':
12360         // Constant suitable for movw, must be between 0 and
12361         // 65535.
12362         if (Subtarget->hasV6T2Ops())
12363           if (CVal >= 0 && CVal <= 65535)
12364             break;
12365         return;
12366       case 'I':
12367         if (Subtarget->isThumb1Only()) {
12368           // This must be a constant between 0 and 255, for ADD
12369           // immediates.
12370           if (CVal >= 0 && CVal <= 255)
12371             break;
12372         } else if (Subtarget->isThumb2()) {
12373           // A constant that can be used as an immediate value in a
12374           // data-processing instruction.
12375           if (ARM_AM::getT2SOImmVal(CVal) != -1)
12376             break;
12377         } else {
12378           // A constant that can be used as an immediate value in a
12379           // data-processing instruction.
12380           if (ARM_AM::getSOImmVal(CVal) != -1)
12381             break;
12382         }
12383         return;
12384 
12385       case 'J':
12386         if (Subtarget->isThumb1Only()) {
12387           // This must be a constant between -255 and -1, for negated ADD
12388           // immediates. This can be used in GCC with an "n" modifier that
12389           // prints the negated value, for use with SUB instructions. It is
12390           // not useful otherwise but is implemented for compatibility.
12391           if (CVal >= -255 && CVal <= -1)
12392             break;
12393         } else {
12394           // This must be a constant between -4095 and 4095. It is not clear
12395           // what this constraint is intended for. Implemented for
12396           // compatibility with GCC.
12397           if (CVal >= -4095 && CVal <= 4095)
12398             break;
12399         }
12400         return;
12401 
12402       case 'K':
12403         if (Subtarget->isThumb1Only()) {
12404           // A 32-bit value where only one byte has a nonzero value. Exclude
12405           // zero to match GCC. This constraint is used by GCC internally for
12406           // constants that can be loaded with a move/shift combination.
12407           // It is not useful otherwise but is implemented for compatibility.
12408           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
12409             break;
12410         } else if (Subtarget->isThumb2()) {
12411           // A constant whose bitwise inverse can be used as an immediate
12412           // value in a data-processing instruction. This can be used in GCC
12413           // with a "B" modifier that prints the inverted value, for use with
12414           // BIC and MVN instructions. It is not useful otherwise but is
12415           // implemented for compatibility.
12416           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
12417             break;
12418         } else {
12419           // A constant whose bitwise inverse can be used as an immediate
12420           // value in a data-processing instruction. This can be used in GCC
12421           // with a "B" modifier that prints the inverted value, for use with
12422           // BIC and MVN instructions. It is not useful otherwise but is
12423           // implemented for compatibility.
12424           if (ARM_AM::getSOImmVal(~CVal) != -1)
12425             break;
12426         }
12427         return;
12428 
12429       case 'L':
12430         if (Subtarget->isThumb1Only()) {
12431           // This must be a constant between -7 and 7,
12432           // for 3-operand ADD/SUB immediate instructions.
12433           if (CVal >= -7 && CVal < 7)
12434             break;
12435         } else if (Subtarget->isThumb2()) {
12436           // A constant whose negation can be used as an immediate value in a
12437           // data-processing instruction. This can be used in GCC with an "n"
12438           // modifier that prints the negated value, for use with SUB
12439           // instructions. It is not useful otherwise but is implemented for
12440           // compatibility.
12441           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
12442             break;
12443         } else {
12444           // A constant whose negation can be used as an immediate value in a
12445           // data-processing instruction. This can be used in GCC with an "n"
12446           // modifier that prints the negated value, for use with SUB
12447           // instructions. It is not useful otherwise but is implemented for
12448           // compatibility.
12449           if (ARM_AM::getSOImmVal(-CVal) != -1)
12450             break;
12451         }
12452         return;
12453 
12454       case 'M':
12455         if (Subtarget->isThumb1Only()) {
12456           // This must be a multiple of 4 between 0 and 1020, for
12457           // ADD sp + immediate.
12458           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
12459             break;
12460         } else {
12461           // A power of two or a constant between 0 and 32.  This is used in
12462           // GCC for the shift amount on shifted register operands, but it is
12463           // useful in general for any shift amounts.
12464           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
12465             break;
12466         }
12467         return;
12468 
12469       case 'N':
12470         if (Subtarget->isThumb()) {  // FIXME thumb2
12471           // This must be a constant between 0 and 31, for shift amounts.
12472           if (CVal >= 0 && CVal <= 31)
12473             break;
12474         }
12475         return;
12476 
12477       case 'O':
12478         if (Subtarget->isThumb()) {  // FIXME thumb2
12479           // This must be a multiple of 4 between -508 and 508, for
12480           // ADD/SUB sp = sp + immediate.
12481           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
12482             break;
12483         }
12484         return;
12485     }
12486     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
12487     break;
12488   }
12489 
12490   if (Result.getNode()) {
12491     Ops.push_back(Result);
12492     return;
12493   }
12494   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12495 }
12496 
12497 static RTLIB::Libcall getDivRemLibcall(
12498     const SDNode *N, MVT::SimpleValueType SVT) {
12499   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12500           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12501          "Unhandled Opcode in getDivRemLibcall");
12502   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12503                   N->getOpcode() == ISD::SREM;
12504   RTLIB::Libcall LC;
12505   switch (SVT) {
12506   default: llvm_unreachable("Unexpected request for libcall!");
12507   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
12508   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
12509   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
12510   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
12511   }
12512   return LC;
12513 }
12514 
12515 static TargetLowering::ArgListTy getDivRemArgList(
12516     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
12517   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12518           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12519          "Unhandled Opcode in getDivRemArgList");
12520   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12521                   N->getOpcode() == ISD::SREM;
12522   TargetLowering::ArgListTy Args;
12523   TargetLowering::ArgListEntry Entry;
12524   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
12525     EVT ArgVT = N->getOperand(i).getValueType();
12526     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
12527     Entry.Node = N->getOperand(i);
12528     Entry.Ty = ArgTy;
12529     Entry.isSExt = isSigned;
12530     Entry.isZExt = !isSigned;
12531     Args.push_back(Entry);
12532   }
12533   if (Subtarget->isTargetWindows() && Args.size() >= 2)
12534     std::swap(Args[0], Args[1]);
12535   return Args;
12536 }
12537 
12538 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
12539   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
12540           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
12541           Subtarget->isTargetWindows()) &&
12542          "Register-based DivRem lowering only");
12543   unsigned Opcode = Op->getOpcode();
12544   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
12545          "Invalid opcode for Div/Rem lowering");
12546   bool isSigned = (Opcode == ISD::SDIVREM);
12547   EVT VT = Op->getValueType(0);
12548   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
12549   SDLoc dl(Op);
12550 
12551   // If the target has hardware divide, use divide + multiply + subtract:
12552   //     div = a / b
12553   //     rem = a - b * div
12554   //     return {div, rem}
12555   // This should be lowered into UDIV/SDIV + MLS later on.
12556   if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
12557       Op->getSimpleValueType(0) == MVT::i32) {
12558     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
12559     const SDValue Dividend = Op->getOperand(0);
12560     const SDValue Divisor = Op->getOperand(1);
12561     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
12562     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
12563     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
12564 
12565     SDValue Values[2] = {Div, Rem};
12566     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
12567   }
12568 
12569   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
12570                                        VT.getSimpleVT().SimpleTy);
12571   SDValue InChain = DAG.getEntryNode();
12572 
12573   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
12574                                                     DAG.getContext(),
12575                                                     Subtarget);
12576 
12577   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12578                                          getPointerTy(DAG.getDataLayout()));
12579 
12580   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
12581 
12582   if (Subtarget->isTargetWindows())
12583     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
12584 
12585   TargetLowering::CallLoweringInfo CLI(DAG);
12586   CLI.setDebugLoc(dl).setChain(InChain)
12587     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
12588     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
12589 
12590   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
12591   return CallInfo.first;
12592 }
12593 
12594 // Lowers REM using divmod helpers
12595 // see RTABI section 4.2/4.3
12596 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
12597   // Build return types (div and rem)
12598   std::vector<Type*> RetTyParams;
12599   Type *RetTyElement;
12600 
12601   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
12602   default: llvm_unreachable("Unexpected request for libcall!");
12603   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
12604   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
12605   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
12606   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
12607   }
12608 
12609   RetTyParams.push_back(RetTyElement);
12610   RetTyParams.push_back(RetTyElement);
12611   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
12612   Type *RetTy = StructType::get(*DAG.getContext(), ret);
12613 
12614   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
12615                                                              SimpleTy);
12616   SDValue InChain = DAG.getEntryNode();
12617   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
12618                                                     Subtarget);
12619   bool isSigned = N->getOpcode() == ISD::SREM;
12620   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12621                                          getPointerTy(DAG.getDataLayout()));
12622 
12623   if (Subtarget->isTargetWindows())
12624     InChain = WinDBZCheckDenominator(DAG, N, InChain);
12625 
12626   // Lower call
12627   CallLoweringInfo CLI(DAG);
12628   CLI.setChain(InChain)
12629      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
12630      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
12631   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
12632 
12633   // Return second (rem) result operand (first contains div)
12634   SDNode *ResNode = CallResult.first.getNode();
12635   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
12636   return ResNode->getOperand(1);
12637 }
12638 
/// Custom lowering of a dynamic stack allocation for Windows targets (the
/// only platform accepted by the assertion below).
///
/// The requested size is shifted right by two, copied into R4, and an
/// ARMISD::WIN__CHKSTK node is emitted; the stack pointer is then read back.
/// Returns the merged pair {new SP value, output chain}.
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Get the inputs: operand 0 is used as the incoming chain and operand 1 as
  // the allocation size.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);

  // Size >> 2, i.e. the size divided by four.
  // NOTE(review): presumably the stack-probe helper expects a count of 4-byte
  // words in R4 -- confirm against the WIN__CHKSTK expansion.
  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                              DAG.getConstant(2, DL, MVT::i32));

  // Copy the word count into R4.  Flag is still empty for this first copy;
  // afterwards it holds result 1 of the copy so the next node is tied to it.
  SDValue Flag;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  Flag = Chain.getValue(1);

  // Emit the stack-check node, consuming the chain and the glue from the copy.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);

  // Read SP after the check; result 1 of the copy is the new chain.
  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = NewSP.getValue(1);

  SDValue Ops[2] = { NewSP, Chain };
  return DAG.getMergeValues(Ops, DL);
}
12664 
12665 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12666   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
12667          "Unexpected type for custom-lowering FP_EXTEND");
12668 
12669   RTLIB::Libcall LC;
12670   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
12671 
12672   SDValue SrcVal = Op.getOperand(0);
12673   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12674                      SDLoc(Op)).first;
12675 }
12676 
12677 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12678   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
12679          Subtarget->isFPOnlySP() &&
12680          "Unexpected type for custom-lowering FP_ROUND");
12681 
12682   RTLIB::Libcall LC;
12683   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
12684 
12685   SDValue SrcVal = Op.getOperand(0);
12686   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12687                      SDLoc(Op)).first;
12688 }
12689 
12690 bool
12691 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
12692   // The ARM target isn't yet aware of offsets.
12693   return false;
12694 }
12695 
12696 bool ARM::isBitFieldInvertedMask(unsigned v) {
12697   if (v == 0xffffffff)
12698     return false;
12699 
12700   // there can be 1's on either or both "outsides", all the "inside"
12701   // bits must be 0's
12702   return isShiftedMask_32(~v);
12703 }
12704 
12705 /// isFPImmLegal - Returns true if the target can instruction select the
12706 /// specified FP immediate natively. If false, the legalizer will
12707 /// materialize the FP immediate as a load from a constant pool.
12708 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
12709   if (!Subtarget->hasVFP3())
12710     return false;
12711   if (VT == MVT::f32)
12712     return ARM_AM::getFP32Imm(Imm) != -1;
12713   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
12714     return ARM_AM::getFP64Imm(Imm) != -1;
12715   return false;
12716 }
12717 
12718 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
12719 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
12720 /// specified in the intrinsic calls.
12721 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12722                                            const CallInst &I,
12723                                            unsigned Intrinsic) const {
12724   switch (Intrinsic) {
12725   case Intrinsic::arm_neon_vld1:
12726   case Intrinsic::arm_neon_vld2:
12727   case Intrinsic::arm_neon_vld3:
12728   case Intrinsic::arm_neon_vld4:
12729   case Intrinsic::arm_neon_vld2lane:
12730   case Intrinsic::arm_neon_vld3lane:
12731   case Intrinsic::arm_neon_vld4lane: {
12732     Info.opc = ISD::INTRINSIC_W_CHAIN;
12733     // Conservatively set memVT to the entire set of vectors loaded.
12734     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12735     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
12736     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12737     Info.ptrVal = I.getArgOperand(0);
12738     Info.offset = 0;
12739     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12740     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12741     Info.vol = false; // volatile loads with NEON intrinsics not supported
12742     Info.readMem = true;
12743     Info.writeMem = false;
12744     return true;
12745   }
12746   case Intrinsic::arm_neon_vst1:
12747   case Intrinsic::arm_neon_vst2:
12748   case Intrinsic::arm_neon_vst3:
12749   case Intrinsic::arm_neon_vst4:
12750   case Intrinsic::arm_neon_vst2lane:
12751   case Intrinsic::arm_neon_vst3lane:
12752   case Intrinsic::arm_neon_vst4lane: {
12753     Info.opc = ISD::INTRINSIC_VOID;
12754     // Conservatively set memVT to the entire set of vectors stored.
12755     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12756     unsigned NumElts = 0;
12757     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
12758       Type *ArgTy = I.getArgOperand(ArgI)->getType();
12759       if (!ArgTy->isVectorTy())
12760         break;
12761       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
12762     }
12763     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12764     Info.ptrVal = I.getArgOperand(0);
12765     Info.offset = 0;
12766     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12767     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12768     Info.vol = false; // volatile stores with NEON intrinsics not supported
12769     Info.readMem = false;
12770     Info.writeMem = true;
12771     return true;
12772   }
12773   case Intrinsic::arm_ldaex:
12774   case Intrinsic::arm_ldrex: {
12775     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12776     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
12777     Info.opc = ISD::INTRINSIC_W_CHAIN;
12778     Info.memVT = MVT::getVT(PtrTy->getElementType());
12779     Info.ptrVal = I.getArgOperand(0);
12780     Info.offset = 0;
12781     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12782     Info.vol = true;
12783     Info.readMem = true;
12784     Info.writeMem = false;
12785     return true;
12786   }
12787   case Intrinsic::arm_stlex:
12788   case Intrinsic::arm_strex: {
12789     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12790     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
12791     Info.opc = ISD::INTRINSIC_W_CHAIN;
12792     Info.memVT = MVT::getVT(PtrTy->getElementType());
12793     Info.ptrVal = I.getArgOperand(1);
12794     Info.offset = 0;
12795     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12796     Info.vol = true;
12797     Info.readMem = false;
12798     Info.writeMem = true;
12799     return true;
12800   }
12801   case Intrinsic::arm_stlexd:
12802   case Intrinsic::arm_strexd: {
12803     Info.opc = ISD::INTRINSIC_W_CHAIN;
12804     Info.memVT = MVT::i64;
12805     Info.ptrVal = I.getArgOperand(2);
12806     Info.offset = 0;
12807     Info.align = 8;
12808     Info.vol = true;
12809     Info.readMem = false;
12810     Info.writeMem = true;
12811     return true;
12812   }
12813   case Intrinsic::arm_ldaexd:
12814   case Intrinsic::arm_ldrexd: {
12815     Info.opc = ISD::INTRINSIC_W_CHAIN;
12816     Info.memVT = MVT::i64;
12817     Info.ptrVal = I.getArgOperand(0);
12818     Info.offset = 0;
12819     Info.align = 8;
12820     Info.vol = true;
12821     Info.readMem = true;
12822     Info.writeMem = false;
12823     return true;
12824   }
12825   default:
12826     break;
12827   }
12828 
12829   return false;
12830 }
12831 
12832 /// \brief Returns true if it is beneficial to convert a load of a constant
12833 /// to just the constant itself.
12834 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12835                                                           Type *Ty) const {
12836   assert(Ty->isIntegerTy());
12837 
12838   unsigned Bits = Ty->getPrimitiveSizeInBits();
12839   if (Bits == 0 || Bits > 32)
12840     return false;
12841   return true;
12842 }
12843 
12844 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
12845                                         ARM_MB::MemBOpt Domain) const {
12846   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12847 
12848   // First, if the target has no DMB, see what fallback we can use.
12849   if (!Subtarget->hasDataBarrier()) {
12850     // Some ARMv6 cpus can support data barriers with an mcr instruction.
12851     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
12852     // here.
12853     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
12854       Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
12855       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
12856                         Builder.getInt32(0), Builder.getInt32(7),
12857                         Builder.getInt32(10), Builder.getInt32(5)};
12858       return Builder.CreateCall(MCR, args);
12859     } else {
12860       // Instead of using barriers, atomic accesses on these subtargets use
12861       // libcalls.
12862       llvm_unreachable("makeDMB on a target so old that it has no barriers");
12863     }
12864   } else {
12865     Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
12866     // Only a full system barrier exists in the M-class architectures.
12867     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
12868     Constant *CDomain = Builder.getInt32(Domain);
12869     return Builder.CreateCall(DMB, CDomain);
12870   }
12871 }
12872 
12873 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
12874 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
12875                                          AtomicOrdering Ord, bool IsStore,
12876                                          bool IsLoad) const {
12877   switch (Ord) {
12878   case AtomicOrdering::NotAtomic:
12879   case AtomicOrdering::Unordered:
12880     llvm_unreachable("Invalid fence: unordered/non-atomic");
12881   case AtomicOrdering::Monotonic:
12882   case AtomicOrdering::Acquire:
12883     return nullptr; // Nothing to do
12884   case AtomicOrdering::SequentiallyConsistent:
12885     if (!IsStore)
12886       return nullptr; // Nothing to do
12887     /*FALLTHROUGH*/
12888   case AtomicOrdering::Release:
12889   case AtomicOrdering::AcquireRelease:
12890     if (Subtarget->preferISHSTBarriers())
12891       return makeDMB(Builder, ARM_MB::ISHST);
12892     // FIXME: add a comment with a link to documentation justifying this.
12893     else
12894       return makeDMB(Builder, ARM_MB::ISH);
12895   }
12896   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
12897 }
12898 
12899 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
12900                                           AtomicOrdering Ord, bool IsStore,
12901                                           bool IsLoad) const {
12902   switch (Ord) {
12903   case AtomicOrdering::NotAtomic:
12904   case AtomicOrdering::Unordered:
12905     llvm_unreachable("Invalid fence: unordered/not-atomic");
12906   case AtomicOrdering::Monotonic:
12907   case AtomicOrdering::Release:
12908     return nullptr; // Nothing to do
12909   case AtomicOrdering::Acquire:
12910   case AtomicOrdering::AcquireRelease:
12911   case AtomicOrdering::SequentiallyConsistent:
12912     return makeDMB(Builder, ARM_MB::ISH);
12913   }
12914   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
12915 }
12916 
12917 // Loads and stores less than 64-bits are already atomic; ones above that
12918 // are doomed anyway, so defer to the default libcall and blame the OS when
12919 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
12920 // anything for those.
12921 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
12922   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
12923   return (Size == 64) && !Subtarget->isMClass();
12924 }
12925 
12926 // Loads and stores less than 64-bits are already atomic; ones above that
12927 // are doomed anyway, so defer to the default libcall and blame the OS when
12928 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
12929 // anything for those.
12930 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
12931 // guarantee, see DDI0406C ARM architecture reference manual,
12932 // sections A8.8.72-74 LDRD)
12933 TargetLowering::AtomicExpansionKind
12934 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
12935   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
12936   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
12937                                                   : AtomicExpansionKind::None;
12938 }
12939 
12940 // For the real atomic operations, we have ldrex/strex up to 32 bits,
12941 // and up to 64 bits on the non-M profiles
12942 TargetLowering::AtomicExpansionKind
12943 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
12944   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
12945   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
12946   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
12947              ? AtomicExpansionKind::LLSC
12948              : AtomicExpansionKind::None;
12949 }
12950 
12951 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
12952     AtomicCmpXchgInst *AI) const {
12953   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
12954   // implement cmpxchg without spilling. If the address being exchanged is also
12955   // on the stack and close enough to the spill slot, this can lead to a
12956   // situation where the monitor always gets cleared and the atomic operation
12957   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
12958   bool hasAtomicCmpXchg =
12959       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
12960   return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
12961 }
12962 
12963 bool ARMTargetLowering::shouldInsertFencesForAtomic(
12964     const Instruction *I) const {
12965   return InsertFencesForAtomic;
12966 }
12967 
12968 // This has so far only been implemented for MachO.
12969 bool ARMTargetLowering::useLoadStackGuardNode() const {
12970   return Subtarget->isTargetMachO();
12971 }
12972 
12973 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
12974                                                   unsigned &Cost) const {
12975   // If we do not have NEON, vector types are not natively supported.
12976   if (!Subtarget->hasNEON())
12977     return false;
12978 
12979   // Floating point values and vector values map to the same register file.
12980   // Therefore, although we could do a store extract of a vector type, this is
12981   // better to leave at float as we have more freedom in the addressing mode for
12982   // those.
12983   if (VectorTy->isFPOrFPVectorTy())
12984     return false;
12985 
12986   // If the index is unknown at compile time, this is very expensive to lower
12987   // and it is not possible to combine the store with the extract.
12988   if (!isa<ConstantInt>(Idx))
12989     return false;
12990 
12991   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
12992   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
12993   // We can do a store + vector extract on any vector that fits perfectly in a D
12994   // or Q register.
12995   if (BitWidth == 64 || BitWidth == 128) {
12996     Cost = 0;
12997     return true;
12998   }
12999   return false;
13000 }
13001 
13002 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
13003   return Subtarget->hasV6T2Ops();
13004 }
13005 
13006 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
13007   return Subtarget->hasV6T2Ops();
13008 }
13009 
13010 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
13011                                          AtomicOrdering Ord) const {
13012   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13013   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
13014   bool IsAcquire = isAcquireOrStronger(Ord);
13015 
13016   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
13017   // intrinsic must return {i32, i32} and we have to recombine them into a
13018   // single i64 here.
13019   if (ValTy->getPrimitiveSizeInBits() == 64) {
13020     Intrinsic::ID Int =
13021         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
13022     Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
13023 
13024     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
13025     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
13026 
13027     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
13028     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
13029     if (!Subtarget->isLittle())
13030       std::swap (Lo, Hi);
13031     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
13032     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
13033     return Builder.CreateOr(
13034         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
13035   }
13036 
13037   Type *Tys[] = { Addr->getType() };
13038   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
13039   Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
13040 
13041   return Builder.CreateTruncOrBitCast(
13042       Builder.CreateCall(Ldrex, Addr),
13043       cast<PointerType>(Addr->getType())->getElementType());
13044 }
13045 
13046 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
13047     IRBuilder<> &Builder) const {
13048   if (!Subtarget->hasV7Ops())
13049     return;
13050   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13051   Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
13052 }
13053 
13054 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
13055                                                Value *Addr,
13056                                                AtomicOrdering Ord) const {
13057   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
13058   bool IsRelease = isReleaseOrStronger(Ord);
13059 
13060   // Since the intrinsics must have legal type, the i64 intrinsics take two
13061   // parameters: "i32, i32". We must marshal Val into the appropriate form
13062   // before the call.
13063   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
13064     Intrinsic::ID Int =
13065         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
13066     Function *Strex = Intrinsic::getDeclaration(M, Int);
13067     Type *Int32Ty = Type::getInt32Ty(M->getContext());
13068 
13069     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
13070     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
13071     if (!Subtarget->isLittle())
13072       std::swap (Lo, Hi);
13073     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
13074     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
13075   }
13076 
13077   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
13078   Type *Tys[] = { Addr->getType() };
13079   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
13080 
13081   return Builder.CreateCall(
13082       Strex, {Builder.CreateZExtOrBitCast(
13083                   Val, Strex->getFunctionType()->getParamType(0)),
13084               Addr});
13085 }
13086 
13087 /// \brief Lower an interleaved load into a vldN intrinsic.
13088 ///
13089 /// E.g. Lower an interleaved load (Factor = 2):
13090 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
13091 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
13092 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
13093 ///
13094 ///      Into:
13095 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
13096 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
13097 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
13098 bool ARMTargetLowering::lowerInterleavedLoad(
13099     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13100     ArrayRef<unsigned> Indices, unsigned Factor) const {
13101   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13102          "Invalid interleave factor");
13103   assert(!Shuffles.empty() && "Empty shufflevector input");
13104   assert(Shuffles.size() == Indices.size() &&
13105          "Unmatched number of shufflevectors and indices");
13106 
13107   VectorType *VecTy = Shuffles[0]->getType();
13108   Type *EltTy = VecTy->getVectorElementType();
13109 
13110   const DataLayout &DL = LI->getModule()->getDataLayout();
13111   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13112   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13113 
13114   // Skip if we do not have NEON and skip illegal vector types and vector types
13115   // with i64/f64 elements (vldN doesn't support i64/f64 elements).
13116   if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
13117     return false;
13118 
13119   // A pointer vector can not be the return type of the ldN intrinsics. Need to
13120   // load integer vectors first and then convert to pointer vectors.
13121   if (EltTy->isPointerTy())
13122     VecTy =
13123         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
13124 
13125   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
13126                                             Intrinsic::arm_neon_vld3,
13127                                             Intrinsic::arm_neon_vld4};
13128 
13129   IRBuilder<> Builder(LI);
13130   SmallVector<Value *, 2> Ops;
13131 
13132   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
13133   Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
13134   Ops.push_back(Builder.getInt32(LI->getAlignment()));
13135 
13136   Type *Tys[] = { VecTy, Int8Ptr };
13137   Function *VldnFunc =
13138       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
13139   CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
13140 
13141   // Replace uses of each shufflevector with the corresponding vector loaded
13142   // by ldN.
13143   for (unsigned i = 0; i < Shuffles.size(); i++) {
13144     ShuffleVectorInst *SV = Shuffles[i];
13145     unsigned Index = Indices[i];
13146 
13147     Value *SubVec = Builder.CreateExtractValue(VldN, Index);
13148 
13149     // Convert the integer vector to pointer vector if the element is pointer.
13150     if (EltTy->isPointerTy())
13151       SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
13152 
13153     SV->replaceAllUsesWith(SubVec);
13154   }
13155 
13156   return true;
13157 }
13158 
13159 /// \brief Get a mask consisting of sequential integers starting from \p Start.
13160 ///
13161 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
13162 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
13163                                    unsigned NumElts) {
13164   SmallVector<Constant *, 16> Mask;
13165   for (unsigned i = 0; i < NumElts; i++)
13166     Mask.push_back(Builder.getInt32(Start + i));
13167 
13168   return ConstantVector::get(Mask);
13169 }
13170 
13171 /// \brief Lower an interleaved store into a vstN intrinsic.
13172 ///
13173 /// E.g. Lower an interleaved store (Factor = 3):
13174 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13175 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13176 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
13177 ///
13178 ///      Into:
13179 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
13180 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
13181 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
13182 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
13183 ///
13184 /// Note that the new shufflevectors will be removed and we'll only generate one
13185 /// vst3 instruction in CodeGen.
13186 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
13187                                               ShuffleVectorInst *SVI,
13188                                               unsigned Factor) const {
13189   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13190          "Invalid interleave factor");
13191 
13192   VectorType *VecTy = SVI->getType();
13193   assert(VecTy->getVectorNumElements() % Factor == 0 &&
13194          "Invalid interleaved store");
13195 
13196   unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
13197   Type *EltTy = VecTy->getVectorElementType();
13198   VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
13199 
13200   const DataLayout &DL = SI->getModule()->getDataLayout();
13201   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
13202   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13203 
13204   // Skip if we do not have NEON and skip illegal vector types and vector types
13205   // with i64/f64 elements (vstN doesn't support i64/f64 elements).
13206   if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
13207       EltIs64Bits)
13208     return false;
13209 
13210   Value *Op0 = SVI->getOperand(0);
13211   Value *Op1 = SVI->getOperand(1);
13212   IRBuilder<> Builder(SI);
13213 
13214   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
13215   // vectors to integer vectors.
13216   if (EltTy->isPointerTy()) {
13217     Type *IntTy = DL.getIntPtrType(EltTy);
13218 
13219     // Convert to the corresponding integer vector.
13220     Type *IntVecTy =
13221         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
13222     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
13223     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
13224 
13225     SubVecTy = VectorType::get(IntTy, NumSubElts);
13226   }
13227 
13228   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
13229                                              Intrinsic::arm_neon_vst3,
13230                                              Intrinsic::arm_neon_vst4};
13231   SmallVector<Value *, 6> Ops;
13232 
13233   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
13234   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
13235 
13236   Type *Tys[] = { Int8Ptr, SubVecTy };
13237   Function *VstNFunc = Intrinsic::getDeclaration(
13238       SI->getModule(), StoreInts[Factor - 2], Tys);
13239 
13240   // Split the shufflevector operands into sub vectors for the new vstN call.
13241   for (unsigned i = 0; i < Factor; i++)
13242     Ops.push_back(Builder.CreateShuffleVector(
13243         Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
13244 
13245   Ops.push_back(Builder.getInt32(SI->getAlignment()));
13246   Builder.CreateCall(VstNFunc, Ops);
13247   return true;
13248 }
13249 
/// Kind of base element found so far while classifying a type as a
/// homogeneous aggregate.
enum HABaseType {
  HA_UNKNOWN = 0, ///< No base element has been seen yet.
  HA_FLOAT = 1,   ///< Base element is float.
  HA_DOUBLE = 2,  ///< Base element is double.
  HA_VECT64 = 3,  ///< Base element is a vector 64 bits wide.
  HA_VECT128 = 4  ///< Base element is a vector 128 bits wide.
};
13257 
13258 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
13259                                    uint64_t &Members) {
13260   if (auto *ST = dyn_cast<StructType>(Ty)) {
13261     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
13262       uint64_t SubMembers = 0;
13263       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
13264         return false;
13265       Members += SubMembers;
13266     }
13267   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
13268     uint64_t SubMembers = 0;
13269     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
13270       return false;
13271     Members += SubMembers * AT->getNumElements();
13272   } else if (Ty->isFloatTy()) {
13273     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
13274       return false;
13275     Members = 1;
13276     Base = HA_FLOAT;
13277   } else if (Ty->isDoubleTy()) {
13278     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
13279       return false;
13280     Members = 1;
13281     Base = HA_DOUBLE;
13282   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
13283     Members = 1;
13284     switch (Base) {
13285     case HA_FLOAT:
13286     case HA_DOUBLE:
13287       return false;
13288     case HA_VECT64:
13289       return VT->getBitWidth() == 64;
13290     case HA_VECT128:
13291       return VT->getBitWidth() == 128;
13292     case HA_UNKNOWN:
13293       switch (VT->getBitWidth()) {
13294       case 64:
13295         Base = HA_VECT64;
13296         return true;
13297       case 128:
13298         Base = HA_VECT128;
13299         return true;
13300       default:
13301         return false;
13302       }
13303     }
13304   }
13305 
13306   return (Members > 0 && Members <= 4);
13307 }
13308 
13309 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
13310 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
13311 /// passing according to AAPCS rules.
13312 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
13313     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
13314   if (getEffectiveCallingConv(CallConv, isVarArg) !=
13315       CallingConv::ARM_AAPCS_VFP)
13316     return false;
13317 
13318   HABaseType Base = HA_UNKNOWN;
13319   uint64_t Members = 0;
13320   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
13321   DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
13322 
13323   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
13324   return IsHA || IsIntArray;
13325 }
13326 
13327 unsigned ARMTargetLowering::getExceptionPointerRegister(
13328     const Constant *PersonalityFn) const {
13329   // Platforms which do not use SjLj EH may return values in these registers
13330   // via the personality function.
13331   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
13332 }
13333 
13334 unsigned ARMTargetLowering::getExceptionSelectorRegister(
13335     const Constant *PersonalityFn) const {
13336   // Platforms which do not use SjLj EH may return values in these registers
13337   // via the personality function.
13338   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
13339 }
13340 
13341 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
13342   // Update IsSplitCSR in ARMFunctionInfo.
13343   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
13344   AFI->setIsSplitCSR(true);
13345 }
13346 
13347 void ARMTargetLowering::insertCopiesSplitCSR(
13348     MachineBasicBlock *Entry,
13349     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
13350   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
13351   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
13352   if (!IStart)
13353     return;
13354 
13355   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
13356   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
13357   MachineBasicBlock::iterator MBBI = Entry->begin();
13358   for (const MCPhysReg *I = IStart; *I; ++I) {
13359     const TargetRegisterClass *RC = nullptr;
13360     if (ARM::GPRRegClass.contains(*I))
13361       RC = &ARM::GPRRegClass;
13362     else if (ARM::DPRRegClass.contains(*I))
13363       RC = &ARM::DPRRegClass;
13364     else
13365       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
13366 
13367     unsigned NewVR = MRI->createVirtualRegister(RC);
13368     // Create copy from CSR to a virtual register.
13369     // FIXME: this currently does not emit CFI pseudo-instructions, it works
13370     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
13371     // nounwind. If we want to generalize this later, we may need to emit
13372     // CFI pseudo-instructions.
13373     assert(Entry->getParent()->getFunction()->hasFnAttribute(
13374                Attribute::NoUnwind) &&
13375            "Function should be nounwind in insertCopiesSplitCSR!");
13376     Entry->addLiveIn(*I);
13377     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
13378         .addReg(*I);
13379 
13380     // Insert the copy-back instructions right before the terminator.
13381     for (auto *Exit : Exits)
13382       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
13383               TII->get(TargetOpcode::COPY), *I)
13384           .addReg(NewVR);
13385   }
13386 }
13387