1 //===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that ARM uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARMISelLowering.h"
16 #include "ARMCallingConv.h"
17 #include "ARMConstantPoolValue.h"
18 #include "ARMMachineFunctionInfo.h"
19 #include "ARMPerfectShuffle.h"
20 #include "ARMSubtarget.h"
21 #include "ARMTargetMachine.h"
22 #include "ARMTargetObjectFile.h"
23 #include "MCTargetDesc/ARMAddressingModes.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringExtras.h"
26 #include "llvm/ADT/StringSwitch.h"
27 #include "llvm/CodeGen/CallingConvLower.h"
28 #include "llvm/CodeGen/IntrinsicLowering.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFrameInfo.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
34 #include "llvm/CodeGen/MachineModuleInfo.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/SelectionDAG.h"
37 #include "llvm/IR/CallingConv.h"
38 #include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/IR/IRBuilder.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/IntrinsicInst.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/MC/MCSectionMachO.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Target/TargetOptions.h"
55 #include <utility>
56 using namespace llvm;
57 
58 #define DEBUG_TYPE "arm-isel"
59 
60 STATISTIC(NumTailCalls, "Number of tail calls");
61 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
62 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
63 STATISTIC(NumConstpoolPromoted,
64   "Number of constants with their storage promoted into constant pools");
65 
66 static cl::opt<bool>
67 ARMInterworking("arm-interworking", cl::Hidden,
68   cl::desc("Enable / disable ARM interworking (for debugging only)"),
69   cl::init(true));
70 
71 static cl::opt<bool> EnableConstpoolPromotion(
72     "arm-promote-constant", cl::Hidden,
73     cl::desc("Enable / disable promotion of unnamed_addr constants into "
74              "constant pools"),
75     cl::init(true));
76 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
77     "arm-promote-constant-max-size", cl::Hidden,
78     cl::desc("Maximum size of constant to promote into a constant pool"),
79     cl::init(64));
80 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
81     "arm-promote-constant-max-total", cl::Hidden,
82     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
83     cl::init(128));
84 
85 namespace {
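  // CCState subclass that additionally records whether it is being used for
  // call lowering or for prologue (formal argument) lowering, since some ARM
  // calling-convention decisions (e.g. byval handling) depend on it.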
86   class ARMCCState : public CCState {
87   public:
88     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
89                SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
90                ParmContext PC)
91         : CCState(CC, isVarArg, MF, locs, C) {
92       assert(((PC == Call) || (PC == Prologue)) &&
             "ARMCCState users must specify whether their context is call "
             "or prologue generation.");
95       CallOrPrologue = PC;
96     }
97   };
98 }
99 
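// Initialize the calling conventions of the compiler runtime helper calls
// (shifts, multiplies, divides, float arithmetic, conversions and
// comparisons).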
100 void ARMTargetLowering::InitLibcallCallingConvs() {
  // The builtins on ARM always use AAPCS, irrespective of whether the C
  // calling convention is AAPCS or AAPCS_VFP.
103   for (const auto LC : {
104            RTLIB::SHL_I16,
105            RTLIB::SHL_I32,
106            RTLIB::SHL_I64,
107            RTLIB::SHL_I128,
108            RTLIB::SRL_I16,
109            RTLIB::SRL_I32,
110            RTLIB::SRL_I64,
111            RTLIB::SRL_I128,
112            RTLIB::SRA_I16,
113            RTLIB::SRA_I32,
114            RTLIB::SRA_I64,
115            RTLIB::SRA_I128,
116            RTLIB::MUL_I8,
117            RTLIB::MUL_I16,
118            RTLIB::MUL_I32,
119            RTLIB::MUL_I64,
120            RTLIB::MUL_I128,
121            RTLIB::MULO_I32,
122            RTLIB::MULO_I64,
123            RTLIB::MULO_I128,
124            RTLIB::SDIV_I8,
125            RTLIB::SDIV_I16,
126            RTLIB::SDIV_I32,
127            RTLIB::SDIV_I64,
128            RTLIB::SDIV_I128,
129            RTLIB::UDIV_I8,
130            RTLIB::UDIV_I16,
131            RTLIB::UDIV_I32,
132            RTLIB::UDIV_I64,
133            RTLIB::UDIV_I128,
134            RTLIB::SREM_I8,
135            RTLIB::SREM_I16,
136            RTLIB::SREM_I32,
137            RTLIB::SREM_I64,
138            RTLIB::SREM_I128,
139            RTLIB::UREM_I8,
140            RTLIB::UREM_I16,
141            RTLIB::UREM_I32,
142            RTLIB::UREM_I64,
143            RTLIB::UREM_I128,
144            RTLIB::SDIVREM_I8,
145            RTLIB::SDIVREM_I16,
146            RTLIB::SDIVREM_I32,
147            RTLIB::SDIVREM_I64,
148            RTLIB::SDIVREM_I128,
149            RTLIB::UDIVREM_I8,
150            RTLIB::UDIVREM_I16,
151            RTLIB::UDIVREM_I32,
152            RTLIB::UDIVREM_I64,
153            RTLIB::UDIVREM_I128,
154            RTLIB::NEG_I32,
155            RTLIB::NEG_I64,
156            RTLIB::ADD_F32,
157            RTLIB::ADD_F64,
158            RTLIB::ADD_F80,
159            RTLIB::ADD_F128,
160            RTLIB::SUB_F32,
161            RTLIB::SUB_F64,
162            RTLIB::SUB_F80,
163            RTLIB::SUB_F128,
164            RTLIB::MUL_F32,
165            RTLIB::MUL_F64,
166            RTLIB::MUL_F80,
167            RTLIB::MUL_F128,
168            RTLIB::DIV_F32,
169            RTLIB::DIV_F64,
170            RTLIB::DIV_F80,
171            RTLIB::DIV_F128,
172            RTLIB::POWI_F32,
173            RTLIB::POWI_F64,
174            RTLIB::POWI_F80,
175            RTLIB::POWI_F128,
176            RTLIB::FPEXT_F64_F128,
177            RTLIB::FPEXT_F32_F128,
178            RTLIB::FPEXT_F32_F64,
179            RTLIB::FPEXT_F16_F32,
180            RTLIB::FPROUND_F32_F16,
181            RTLIB::FPROUND_F64_F16,
182            RTLIB::FPROUND_F80_F16,
183            RTLIB::FPROUND_F128_F16,
184            RTLIB::FPROUND_F64_F32,
185            RTLIB::FPROUND_F80_F32,
186            RTLIB::FPROUND_F128_F32,
187            RTLIB::FPROUND_F80_F64,
188            RTLIB::FPROUND_F128_F64,
189            RTLIB::FPTOSINT_F32_I32,
190            RTLIB::FPTOSINT_F32_I64,
191            RTLIB::FPTOSINT_F32_I128,
192            RTLIB::FPTOSINT_F64_I32,
193            RTLIB::FPTOSINT_F64_I64,
194            RTLIB::FPTOSINT_F64_I128,
195            RTLIB::FPTOSINT_F80_I32,
196            RTLIB::FPTOSINT_F80_I64,
197            RTLIB::FPTOSINT_F80_I128,
198            RTLIB::FPTOSINT_F128_I32,
199            RTLIB::FPTOSINT_F128_I64,
200            RTLIB::FPTOSINT_F128_I128,
201            RTLIB::FPTOUINT_F32_I32,
202            RTLIB::FPTOUINT_F32_I64,
203            RTLIB::FPTOUINT_F32_I128,
204            RTLIB::FPTOUINT_F64_I32,
205            RTLIB::FPTOUINT_F64_I64,
206            RTLIB::FPTOUINT_F64_I128,
207            RTLIB::FPTOUINT_F80_I32,
208            RTLIB::FPTOUINT_F80_I64,
209            RTLIB::FPTOUINT_F80_I128,
210            RTLIB::FPTOUINT_F128_I32,
211            RTLIB::FPTOUINT_F128_I64,
212            RTLIB::FPTOUINT_F128_I128,
213            RTLIB::SINTTOFP_I32_F32,
214            RTLIB::SINTTOFP_I32_F64,
215            RTLIB::SINTTOFP_I32_F80,
216            RTLIB::SINTTOFP_I32_F128,
217            RTLIB::SINTTOFP_I64_F32,
218            RTLIB::SINTTOFP_I64_F64,
219            RTLIB::SINTTOFP_I64_F80,
220            RTLIB::SINTTOFP_I64_F128,
221            RTLIB::SINTTOFP_I128_F32,
222            RTLIB::SINTTOFP_I128_F64,
223            RTLIB::SINTTOFP_I128_F80,
224            RTLIB::SINTTOFP_I128_F128,
225            RTLIB::UINTTOFP_I32_F32,
226            RTLIB::UINTTOFP_I32_F64,
227            RTLIB::UINTTOFP_I32_F80,
228            RTLIB::UINTTOFP_I32_F128,
229            RTLIB::UINTTOFP_I64_F32,
230            RTLIB::UINTTOFP_I64_F64,
231            RTLIB::UINTTOFP_I64_F80,
232            RTLIB::UINTTOFP_I64_F128,
233            RTLIB::UINTTOFP_I128_F32,
234            RTLIB::UINTTOFP_I128_F64,
235            RTLIB::UINTTOFP_I128_F80,
236            RTLIB::UINTTOFP_I128_F128,
237            RTLIB::OEQ_F32,
238            RTLIB::OEQ_F64,
239            RTLIB::OEQ_F128,
240            RTLIB::UNE_F32,
241            RTLIB::UNE_F64,
242            RTLIB::UNE_F128,
243            RTLIB::OGE_F32,
244            RTLIB::OGE_F64,
245            RTLIB::OGE_F128,
246            RTLIB::OLT_F32,
247            RTLIB::OLT_F64,
248            RTLIB::OLT_F128,
249            RTLIB::OLE_F32,
250            RTLIB::OLE_F64,
251            RTLIB::OLE_F128,
252            RTLIB::OGT_F32,
253            RTLIB::OGT_F64,
254            RTLIB::OGT_F128,
255            RTLIB::UO_F32,
256            RTLIB::UO_F64,
257            RTLIB::UO_F128,
258            RTLIB::O_F32,
259            RTLIB::O_F64,
260            RTLIB::O_F128,
261        })
    setLibcallCallingConv(LC, CallingConv::ARM_AAPCS);
263 }
264 
265 // The APCS parameter registers.
266 static const MCPhysReg GPRArgRegs[] = {
267   ARM::R0, ARM::R1, ARM::R2, ARM::R3
268 };
269 
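// Set up the operation actions for the NEON vector type VT: loads/stores are
// promoted to PromotedLdStVT and bitwise operations to PromotedBitwiseVT when
// those differ from VT, and the remaining operations are marked legal, custom
// or expand as appropriate for NEON.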
270 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
271                                        MVT PromotedBitwiseVT) {
272   if (VT != PromotedLdStVT) {
273     setOperationAction(ISD::LOAD, VT, Promote);
274     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
275 
276     setOperationAction(ISD::STORE, VT, Promote);
277     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
278   }
279 
280   MVT ElemTy = VT.getVectorElementType();
281   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
282     setOperationAction(ISD::SETCC, VT, Custom);
283   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
284   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
285   if (ElemTy == MVT::i32) {
286     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
287     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
288     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
289     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
290   } else {
291     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
292     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
293     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
294     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
295   }
296   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
297   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
298   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
299   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
300   setOperationAction(ISD::SELECT,            VT, Expand);
301   setOperationAction(ISD::SELECT_CC,         VT, Expand);
302   setOperationAction(ISD::VSELECT,           VT, Expand);
303   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
304   if (VT.isInteger()) {
305     setOperationAction(ISD::SHL, VT, Custom);
306     setOperationAction(ISD::SRA, VT, Custom);
307     setOperationAction(ISD::SRL, VT, Custom);
308   }
309 
310   // Promote all bit-wise operations.
311   if (VT.isInteger() && VT != PromotedBitwiseVT) {
312     setOperationAction(ISD::AND, VT, Promote);
313     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
314     setOperationAction(ISD::OR,  VT, Promote);
315     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
316     setOperationAction(ISD::XOR, VT, Promote);
317     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
318   }
319 
320   // Neon does not support vector divide/remainder operations.
321   setOperationAction(ISD::SDIV, VT, Expand);
322   setOperationAction(ISD::UDIV, VT, Expand);
323   setOperationAction(ISD::FDIV, VT, Expand);
324   setOperationAction(ISD::SREM, VT, Expand);
325   setOperationAction(ISD::UREM, VT, Expand);
326   setOperationAction(ISD::FREM, VT, Expand);
327 
328   if (!VT.isFloatingPoint() &&
329       VT != MVT::v2i64 && VT != MVT::v1i64)
330     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
331       setOperationAction(Opcode, VT, Legal);
332 }
333 
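// Register VT as a 64-bit NEON vector type, living in the D registers.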
334 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
335   addRegisterClass(VT, &ARM::DPRRegClass);
336   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
337 }
338 
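// Register VT as a 128-bit NEON vector type, living in D-register pairs
// (i.e. the Q registers).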
339 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
340   addRegisterClass(VT, &ARM::DPairRegClass);
341   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
342 }
343 
344 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
345                                      const ARMSubtarget &STI)
346     : TargetLowering(TM), Subtarget(&STI) {
347   RegInfo = Subtarget->getRegisterInfo();
348   Itins = Subtarget->getInstrItineraryData();
349 
350   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
351 
352   InitLibcallCallingConvs();
353 
354   if (Subtarget->isTargetMachO()) {
355     // Uses VFP for Thumb libfuncs if available.
356     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
357         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
358       static const struct {
359         const RTLIB::Libcall Op;
360         const char * const Name;
361         const ISD::CondCode Cond;
362       } LibraryCalls[] = {
363         // Single-precision floating-point arithmetic.
364         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
365         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
366         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
367         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
368 
369         // Double-precision floating-point arithmetic.
370         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
371         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
372         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
373         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
374 
375         // Single-precision comparisons.
376         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
377         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
378         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
379         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
380         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
381         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
382         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
383         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
384 
385         // Double-precision comparisons.
386         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
387         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
388         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
389         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
390         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
391         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
392         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
393         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
394 
395         // Floating-point to integer conversions.
396         // i64 conversions are done via library routines even when generating VFP
397         // instructions, so use the same ones.
398         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
399         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
400         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
401         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
402 
403         // Conversions between floating types.
404         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
405         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
406 
407         // Integer to floating-point conversions.
408         // i64 conversions are done via library routines even when generating VFP
409         // instructions, so use the same ones.
410         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
411         // e.g., __floatunsidf vs. __floatunssidfvfp.
412         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
413         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
414         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
415         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
416       };
417 
418       for (const auto &LC : LibraryCalls) {
419         setLibcallName(LC.Op, LC.Name);
420         if (LC.Cond != ISD::SETCC_INVALID)
421           setCmpLibcallCC(LC.Op, LC.Cond);
422       }
423     }
424 
425     // Set the correct calling convention for ARMv7k WatchOS. It's just
    // AAPCS_VFP, even for functions as simple as libcalls.
427     if (Subtarget->isTargetWatchABI()) {
428       for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
429         setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
430     }
431   }
432 
  // These libcalls are not available on 32-bit targets.
434   setLibcallName(RTLIB::SHL_I128, nullptr);
435   setLibcallName(RTLIB::SRL_I128, nullptr);
436   setLibcallName(RTLIB::SRA_I128, nullptr);
437 
  // RTABI (Run-time ABI for the ARM Architecture) library helper functions.
439   if (Subtarget->isAAPCS_ABI() &&
440       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
441        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
442     static const struct {
443       const RTLIB::Libcall Op;
444       const char * const Name;
445       const CallingConv::ID CC;
446       const ISD::CondCode Cond;
447     } LibraryCalls[] = {
448       // Double-precision floating-point arithmetic helper functions
449       // RTABI chapter 4.1.2, Table 2
450       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
451       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
452       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
453       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
454 
455       // Double-precision floating-point comparison helper functions
456       // RTABI chapter 4.1.2, Table 3
457       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
458       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
459       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
460       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
461       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
462       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
463       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
464       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
465 
466       // Single-precision floating-point arithmetic helper functions
467       // RTABI chapter 4.1.2, Table 4
468       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
469       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
470       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
471       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
472 
473       // Single-precision floating-point comparison helper functions
474       // RTABI chapter 4.1.2, Table 5
475       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
476       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
477       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
478       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
479       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
480       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
481       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
482       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
483 
484       // Floating-point to integer conversions.
485       // RTABI chapter 4.1.2, Table 6
486       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
487       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
488       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
489       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
490       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
491       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
492       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
493       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
494 
495       // Conversions between floating types.
496       // RTABI chapter 4.1.2, Table 7
497       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
498       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
499       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
500 
501       // Integer to floating-point conversions.
502       // RTABI chapter 4.1.2, Table 8
503       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
504       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
505       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
506       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
507       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
508       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
509       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
510       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
511 
512       // Long long helper functions
513       // RTABI chapter 4.2, Table 9
514       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
515       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
516       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
517       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
518 
519       // Integer division functions
520       // RTABI chapter 4.3.1
521       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
522       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
523       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
524       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
525       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
526       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
527       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
528       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
529     };
530 
531     for (const auto &LC : LibraryCalls) {
532       setLibcallName(LC.Op, LC.Name);
533       setLibcallCallingConv(LC.Op, LC.CC);
534       if (LC.Cond != ISD::SETCC_INVALID)
535         setCmpLibcallCC(LC.Op, LC.Cond);
536     }
537 
    // EABI-dependent RTLIB
539     if (TM.Options.EABIVersion == EABI::EABI4 ||
540         TM.Options.EABIVersion == EABI::EABI5) {
541       static const struct {
542         const RTLIB::Libcall Op;
543         const char *const Name;
544         const CallingConv::ID CC;
545         const ISD::CondCode Cond;
546       } MemOpsLibraryCalls[] = {
547         // Memory operations
548         // RTABI chapter 4.3.4
549         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
550         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
551         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
552       };
553 
554       for (const auto &LC : MemOpsLibraryCalls) {
555         setLibcallName(LC.Op, LC.Name);
556         setLibcallCallingConv(LC.Op, LC.CC);
557         if (LC.Cond != ISD::SETCC_INVALID)
558           setCmpLibcallCC(LC.Op, LC.Cond);
559       }
560     }
561   }
562 
563   if (Subtarget->isTargetWindows()) {
564     static const struct {
565       const RTLIB::Libcall Op;
566       const char * const Name;
567       const CallingConv::ID CC;
568     } LibraryCalls[] = {
569       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
570       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
571       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
572       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
573       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
574       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
575       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
576       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
577     };
578 
579     for (const auto &LC : LibraryCalls) {
580       setLibcallName(LC.Op, LC.Name);
581       setLibcallCallingConv(LC.Op, LC.CC);
582     }
583   }
584 
585   // Use divmod compiler-rt calls for iOS 5.0 and later.
586   if (Subtarget->isTargetWatchOS() ||
587       (Subtarget->isTargetIOS() &&
588        !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
589     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
590     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
591   }
592 
593   // The half <-> float conversion functions are always soft-float on
  // non-WatchOS platforms, but are needed for some targets which use a
595   // hard-float calling convention by default.
596   if (!Subtarget->isTargetWatchABI()) {
597     if (Subtarget->isAAPCS_ABI()) {
598       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
599       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
600       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
601     } else {
602       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
603       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
604       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
605     }
606   }
607 
608   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
609   // a __gnu_ prefix (which is the default).
610   if (Subtarget->isTargetAEABI()) {
611     setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
612     setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
613     setLibcallName(RTLIB::FPEXT_F16_F32,   "__aeabi_h2f");
614   }
615 
616   if (Subtarget->isThumb1Only())
617     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
618   else
619     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
620   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
621       !Subtarget->isThumb1Only()) {
622     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
623     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
624   }
625 
626   for (MVT VT : MVT::vector_valuetypes()) {
627     for (MVT InnerVT : MVT::vector_valuetypes()) {
628       setTruncStoreAction(VT, InnerVT, Expand);
629       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
630       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
631       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
632     }
633 
634     setOperationAction(ISD::MULHS, VT, Expand);
635     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
636     setOperationAction(ISD::MULHU, VT, Expand);
637     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
638 
639     setOperationAction(ISD::BSWAP, VT, Expand);
640   }
641 
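  // f32/f64 immediates are custom lowered so that constants which fit the
  // VFP/NEON immediate-move encodings can be materialized without a constant
  // pool load.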
642   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
643   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
644 
645   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
646   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
647 
648   if (Subtarget->hasNEON()) {
649     addDRTypeForNEON(MVT::v2f32);
650     addDRTypeForNEON(MVT::v8i8);
651     addDRTypeForNEON(MVT::v4i16);
652     addDRTypeForNEON(MVT::v2i32);
653     addDRTypeForNEON(MVT::v1i64);
654 
655     addQRTypeForNEON(MVT::v4f32);
656     addQRTypeForNEON(MVT::v2f64);
657     addQRTypeForNEON(MVT::v16i8);
658     addQRTypeForNEON(MVT::v8i16);
659     addQRTypeForNEON(MVT::v4i32);
660     addQRTypeForNEON(MVT::v2i64);
661 
662     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // neither Neon nor VFP supports any arithmetic operations on it.
    // Several v4f32 operations are expanded in the same way, but keep in mind
    // that vadd, vsub and vmul are natively supported for v4f32.
666     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
667     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
668     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
669     // FIXME: Code duplication: FDIV and FREM are expanded always, see
670     // ARMTargetLowering::addTypeForNEON method for details.
671     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
672     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
673     // FIXME: Create unittest.
    // In other words, find a case where "copysign" appears in the DAG with
    // vector operands.
676     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
677     // FIXME: Code duplication: SETCC has custom operation action, see
678     // ARMTargetLowering::addTypeForNEON method for details.
679     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
680     // FIXME: Create unittest for FNEG and for FABS.
681     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
682     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
683     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
684     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
685     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
686     setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
687     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
688     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
689     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
690     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
691     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
692     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
693     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
694     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
695     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
696     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
697     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
698     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
699     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
700 
701     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
702     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
703     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
704     setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
705     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
706     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
707     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
708     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
709     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
710     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
711     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
712     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
713     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
714     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
715     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
716 
    // Expand the same floating-point intrinsics for v2f32 as well.
718     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
719     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
720     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
721     setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
722     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
723     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
724     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
725     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
726     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
727     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
728     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
729     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
730     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
731     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
732     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
733 
734     // Neon does not support some operations on v1i64 and v2i64 types.
735     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
736     // Custom handling for some quad-vector types to detect VMULL.
737     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
738     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
739     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
740     // Custom handling for some vector types to avoid expensive expansions
741     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
742     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
743     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
744     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
745     setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
746     setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
    // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have
    // an FP_TO_[SU]INT instruction with a narrower destination than the
    // source.
751     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
752     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
753     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
754     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
755 
756     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
757     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
758 
    // NEON does not have a single-instruction CTPOP for vectors with element
    // types wider than 8 bits.  However, custom lowering can leverage the
761     // v8i8/v16i8 vcnt instruction.
762     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
763     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
764     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
765     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
766     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
767     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
768 
769     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
770     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
771 
    // NEON does not have a single-instruction CTTZ for vectors.
773     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
774     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
775     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
776     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
777 
778     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
779     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
780     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
781     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
782 
783     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
784     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
785     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
786     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
787 
788     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
789     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
790     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
791     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
792 
793     // NEON only has FMA instructions as of VFP4.
794     if (!Subtarget->hasVFP4()) {
795       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
796       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
797     }
798 
799     setTargetDAGCombine(ISD::INTRINSIC_VOID);
800     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
801     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
802     setTargetDAGCombine(ISD::SHL);
803     setTargetDAGCombine(ISD::SRL);
804     setTargetDAGCombine(ISD::SRA);
805     setTargetDAGCombine(ISD::SIGN_EXTEND);
806     setTargetDAGCombine(ISD::ZERO_EXTEND);
807     setTargetDAGCombine(ISD::ANY_EXTEND);
808     setTargetDAGCombine(ISD::BUILD_VECTOR);
809     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
810     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
811     setTargetDAGCombine(ISD::STORE);
812     setTargetDAGCombine(ISD::FP_TO_SINT);
813     setTargetDAGCombine(ISD::FP_TO_UINT);
814     setTargetDAGCombine(ISD::FDIV);
815     setTargetDAGCombine(ISD::LOAD);
816 
817     // It is legal to extload from v4i8 to v4i16 or v4i32.
818     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
819                    MVT::v2i32}) {
820       for (MVT VT : MVT::integer_vector_valuetypes()) {
821         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
822         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
823         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
824       }
825     }
826   }
827 
828   // ARM and Thumb2 support UMLAL/SMLAL.
829   if (!Subtarget->isThumb1Only())
830     setTargetDAGCombine(ISD::ADDC);
831 
832   if (Subtarget->isFPOnlySP()) {
833     // When targeting a floating-point unit with only single-precision
834     // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
836     // loads and stores are provided by the hardware.
837     setOperationAction(ISD::FADD,       MVT::f64, Expand);
838     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
839     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
840     setOperationAction(ISD::FMA,        MVT::f64, Expand);
841     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
842     setOperationAction(ISD::FREM,       MVT::f64, Expand);
843     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
844     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
845     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
846     setOperationAction(ISD::FABS,       MVT::f64, Expand);
847     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
848     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
849     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
850     setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
851     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
852     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
853     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
854     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
855     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
856     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
857     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
858     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
859     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
860     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
861     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
862     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
863     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
864     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
865     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
866     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
867     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
868     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
869     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
870   }
871 
872   computeRegisterProperties(Subtarget->getRegisterInfo());
873 
874   // ARM does not have floating-point extending loads.
875   for (MVT VT : MVT::fp_valuetypes()) {
876     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
877     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
878   }
879 
880   // ... or truncating stores
881   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
882   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
883   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
884 
  // ARM does not have i1 sign-extending loads.
886   for (MVT VT : MVT::integer_valuetypes())
887     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
888 
889   // ARM supports all 4 flavors of integer indexed load / store.
890   if (!Subtarget->isThumb1Only()) {
891     for (unsigned im = (unsigned)ISD::PRE_INC;
892          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
893       setIndexedLoadAction(im,  MVT::i1,  Legal);
894       setIndexedLoadAction(im,  MVT::i8,  Legal);
895       setIndexedLoadAction(im,  MVT::i16, Legal);
896       setIndexedLoadAction(im,  MVT::i32, Legal);
897       setIndexedStoreAction(im, MVT::i1,  Legal);
898       setIndexedStoreAction(im, MVT::i8,  Legal);
899       setIndexedStoreAction(im, MVT::i16, Legal);
900       setIndexedStoreAction(im, MVT::i32, Legal);
901     }
902   } else {
903     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
904     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
905     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
906   }
907 
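  // Overflow-checking add/sub (e.g. @llvm.sadd.with.overflow) are custom
  // lowered so the overflow result can be taken from the ARM flag-setting
  // add/subtract forms instead of the generic expansion.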
908   setOperationAction(ISD::SADDO, MVT::i32, Custom);
909   setOperationAction(ISD::UADDO, MVT::i32, Custom);
910   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
911   setOperationAction(ISD::USUBO, MVT::i32, Custom);
912 
913   // i64 operation support.
914   setOperationAction(ISD::MUL,     MVT::i64, Expand);
915   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
916   if (Subtarget->isThumb1Only()) {
917     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
918     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
919   }
920   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
921       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
922     setOperationAction(ISD::MULHS, MVT::i32, Expand);
923 
924   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
925   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
926   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
927   setOperationAction(ISD::SRL,       MVT::i64, Custom);
928   setOperationAction(ISD::SRA,       MVT::i64, Custom);
929 
930   if (!Subtarget->isThumb1Only()) {
931     // FIXME: We should do this for Thumb1 as well.
932     setOperationAction(ISD::ADDC,    MVT::i32, Custom);
933     setOperationAction(ISD::ADDE,    MVT::i32, Custom);
934     setOperationAction(ISD::SUBC,    MVT::i32, Custom);
935     setOperationAction(ISD::SUBE,    MVT::i32, Custom);
936   }
937 
938   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
939     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
940 
941   // ARM does not have ROTL.
942   setOperationAction(ISD::ROTL, MVT::i32, Expand);
943   for (MVT VT : MVT::vector_valuetypes()) {
944     setOperationAction(ISD::ROTL, VT, Expand);
945     setOperationAction(ISD::ROTR, VT, Expand);
946   }
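  // i32 CTTZ is custom lowered (typically as a bit-reverse followed by a
  // count-leading-zeros); scalar CTPOP has no single ARM instruction and is
  // expanded.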
947   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
948   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
949   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
950     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
951 
952   // @llvm.readcyclecounter requires the Performance Monitors extension.
953   // Default to the 0 expansion on unsupported platforms.
954   // FIXME: Technically there are older ARM CPUs that have
955   // implementation-specific ways of obtaining this information.
956   if (Subtarget->hasPerfMon())
957     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
958 
959   // Only ARMv6 has BSWAP.
960   if (!Subtarget->hasV6Ops())
961     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
962 
963   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
964                                         : Subtarget->hasDivideInARMMode();
965   if (!hasDivide) {
    // These are expanded into libcalls if the CPU doesn't have a HW divider.
967     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
968     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
969   }
970 
971   if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
972     setOperationAction(ISD::SDIV, MVT::i32, Custom);
973     setOperationAction(ISD::UDIV, MVT::i32, Custom);
974 
975     setOperationAction(ISD::SDIV, MVT::i64, Custom);
976     setOperationAction(ISD::UDIV, MVT::i64, Custom);
977   }
978 
979   setOperationAction(ISD::SREM,  MVT::i32, Expand);
980   setOperationAction(ISD::UREM,  MVT::i32, Expand);
981   // Register based DivRem for AEABI (RTABI 4.2)
982   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
983       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
984       Subtarget->isTargetWindows()) {
985     setOperationAction(ISD::SREM, MVT::i64, Custom);
986     setOperationAction(ISD::UREM, MVT::i64, Custom);
987     HasStandaloneRem = false;
988 
989     for (const auto &LC :
990          {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32})
991       setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv"
992                                                       : "__aeabi_idivmod");
993     setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows()
994                                            ? "__rt_sdiv64"
995                                            : "__aeabi_ldivmod");
996     for (const auto &LC :
997          {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32})
998       setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv"
999                                                       : "__aeabi_uidivmod");
1000     setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows()
1001                                            ? "__rt_udiv64"
1002                                            : "__aeabi_uldivmod");
1003 
1004     setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
1005     setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
1006     setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
1007     setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
1008     setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
1009     setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
1010     setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
1011     setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
1012 
1013     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1014     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1015     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1016     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1017   } else {
1018     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1019     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1020   }
1021 
1022   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
1023   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
1024   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1025   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1026 
1027   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1028 
1029   // Use the default implementation.
1030   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
1031   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
1032   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
1033   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
1034   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
1035   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
1036 
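  // Windows requires dynamic stack allocations to be probed, so
  // DYNAMIC_STACKALLOC is custom lowered there (using a __chkstk-style
  // stack probe); elsewhere it is simply expanded.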
1037   if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
1038     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1039   else
1040     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1041 
1042   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1043   // the default expansion.
1044   InsertFencesForAtomic = false;
1045   if (Subtarget->hasAnyDataBarrier() &&
1046       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1047     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1048     // to ldrex/strex loops already.
1049     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
1050     if (!Subtarget->isThumb() || !Subtarget->isMClass())
1051       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
1052 
1053     // On v8, we have particularly efficient implementations of atomic fences
1054     // if they can be combined with nearby atomic loads and stores.
1055     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
1056       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1057       InsertFencesForAtomic = true;
1058     }
1059   } else {
1060     // If there's anything we can use as a barrier, go through custom lowering
1061     // for ATOMIC_FENCE.
1062     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1063                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1064 
1065     // Set them all for expansion, which will force libcalls.
1066     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1067     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1068     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1069     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1070     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1071     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1072     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1073     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1074     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1075     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1076     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1077     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1078     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1079     // Unordered/Monotonic case.
1080     setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1081     setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1082   }
1083 
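  // PREFETCH is custom lowered so that @llvm.prefetch can be mapped onto the
  // PLD/PLDW/PLI hint instructions where the subtarget supports them.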
1084   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1085 
1086   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1087   if (!Subtarget->hasV6Ops()) {
1088     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1089     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1090   }
1091   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1092 
1093   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1094       !Subtarget->isThumb1Only()) {
    // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
    // iff the target supports VFP2.
1097     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1098     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1099   }
1100 
1101   // We want to custom lower some of our intrinsics.
1102   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1103   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1104   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1105   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1106   if (Subtarget->useSjLjEH())
1107     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1108 
1109   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1110   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1111   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1112   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1113   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1114   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1115   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1116   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1117   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1118 
1119   // Thumb-1 cannot currently select ARMISD::SUBE.
1120   if (!Subtarget->isThumb1Only())
1121     setOperationAction(ISD::SETCCE, MVT::i32, Custom);
1122 
1123   setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
1124   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1125   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1126   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1127   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1128 
  // We don't support sin/cos/fmod/copysign/pow.
1130   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1131   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1132   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1133   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1134   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1135   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1136   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1137   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1138   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1139       !Subtarget->isThumb1Only()) {
1140     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1141     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1142   }
1143   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1144   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1145 
1146   if (!Subtarget->hasVFP4()) {
1147     setOperationAction(ISD::FMA, MVT::f64, Expand);
1148     setOperationAction(ISD::FMA, MVT::f32, Expand);
1149   }
1150 
1151   // Various VFP goodness
1152   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that, it must be expanded.
1154     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
1155       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1156       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1157     }
1158 
1159     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1160     if (!Subtarget->hasFP16()) {
1161       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1162       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1163     }
1164   }
1165 
1166   // Combine sin / cos into one node or libcall if possible.
1167   if (Subtarget->hasSinCos()) {
1168     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1169     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1170     if (Subtarget->isTargetWatchABI()) {
1171       setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
1172       setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
1173     }
1174     if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
      // For iOS, we don't want the normal expansion of a libcall to
      // sincos. We want to issue a libcall to __sincos_stret instead.
1177       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1178       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1179     }
1180   }
1181 
1182   // FP-ARMv8 implements a lot of rounding-like FP operations.
1183   if (Subtarget->hasFPARMv8()) {
1184     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1185     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1186     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1187     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1188     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1189     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1190     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1191     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1192     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1193     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1194     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1195     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1196 
1197     if (!Subtarget->isFPOnlySP()) {
1198       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1199       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1200       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1201       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1202       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1203       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1204       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1205       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1206     }
1207   }
1208 
1209   if (Subtarget->hasNEON()) {
1210     // vmin and vmax aren't available in a scalar form, so we use
1211     // a NEON instruction with an undef lane instead.
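    // Note: ISD::FMINNAN/FMAXNAN follow the NEON vmin/vmax semantics (the
    // result is NaN if either input is NaN), whereas the FMINNUM/FMAXNUM
    // nodes handled in the FP-ARMv8 block above map to vminnm/vmaxnm, which
    // return the numeric operand when exactly one input is a quiet NaN.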
1212     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
1213     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
1214     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
1215     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
1216     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
1217     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
1218   }
1219 
1220   // We have target-specific dag combine patterns for the following nodes:
1221   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1222   setTargetDAGCombine(ISD::ADD);
1223   setTargetDAGCombine(ISD::SUB);
1224   setTargetDAGCombine(ISD::MUL);
1225   setTargetDAGCombine(ISD::AND);
1226   setTargetDAGCombine(ISD::OR);
1227   setTargetDAGCombine(ISD::XOR);
1228 
1229   if (Subtarget->hasV6Ops())
1230     setTargetDAGCombine(ISD::SRL);
1231 
1232   setStackPointerRegisterToSaveRestore(ARM::SP);
1233 
1234   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1235       !Subtarget->hasVFP2())
1236     setSchedulingPreference(Sched::RegPressure);
1237   else
1238     setSchedulingPreference(Sched::Hybrid);
1239 
1240   //// temporary - rewrite interface to use type
1241   MaxStoresPerMemset = 8;
1242   MaxStoresPerMemsetOptSize = 4;
1243   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1244   MaxStoresPerMemcpyOptSize = 2;
1245   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1246   MaxStoresPerMemmoveOptSize = 2;
1247 
1248   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1249   // are at least 4 bytes aligned.
1250   setMinStackArgumentAlignment(4);
1251 
1252   // Prefer likely predicted branches to selects on out-of-order cores.
1253   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1254 
1255   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1256 }
1257 
1258 bool ARMTargetLowering::useSoftFloat() const {
1259   return Subtarget->useSoftFloat();
1260 }
1261 
1262 // FIXME: It might make sense to define the representative register class as the
1263 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1265 // SPR's representative would be DPR_VFP2. This should work well if register
1266 // pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
1268 // classes' representatives transitively. We have not implemented this because
1269 // of the difficulty prior to coalescing of modeling operand register classes
1270 // due to the common occurrence of cross class copies and subregister insertions
1271 // and extractions.
1272 std::pair<const TargetRegisterClass *, uint8_t>
1273 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1274                                            MVT VT) const {
1275   const TargetRegisterClass *RRC = nullptr;
1276   uint8_t Cost = 1;
1277   switch (VT.SimpleTy) {
1278   default:
1279     return TargetLowering::findRepresentativeClass(TRI, VT);
1280   // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1282   // the cost is 1 for both f32 and f64.
1283   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1284   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1285     RRC = &ARM::DPRRegClass;
1286     // When NEON is used for SP, only half of the register file is available
1287     // because operations that define both SP and DP results will be constrained
1288     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1289     // coalescing by double-counting the SP regs. See the FIXME above.
1290     if (Subtarget->useNEONForSinglePrecisionFP())
1291       Cost = 2;
1292     break;
1293   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1294   case MVT::v4f32: case MVT::v2f64:
1295     RRC = &ARM::DPRRegClass;
1296     Cost = 2;
1297     break;
1298   case MVT::v4i64:
1299     RRC = &ARM::DPRRegClass;
1300     Cost = 4;
1301     break;
1302   case MVT::v8i64:
1303     RRC = &ARM::DPRRegClass;
1304     Cost = 8;
1305     break;
1306   }
1307   return std::make_pair(RRC, Cost);
1308 }
1309 
1310 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1311   switch ((ARMISD::NodeType)Opcode) {
1312   case ARMISD::FIRST_NUMBER:  break;
1313   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1314   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1315   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1316   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1317   case ARMISD::CALL:          return "ARMISD::CALL";
1318   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1319   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1320   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1321   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1322   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1323   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1324   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1325   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1326   case ARMISD::CMP:           return "ARMISD::CMP";
1327   case ARMISD::CMN:           return "ARMISD::CMN";
1328   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1329   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1330   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1331   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1332   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1333 
1334   case ARMISD::CMOV:          return "ARMISD::CMOV";
1335 
1336   case ARMISD::SSAT:          return "ARMISD::SSAT";
1337 
1338   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1339   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1340   case ARMISD::RRX:           return "ARMISD::RRX";
1341 
1342   case ARMISD::ADDC:          return "ARMISD::ADDC";
1343   case ARMISD::ADDE:          return "ARMISD::ADDE";
1344   case ARMISD::SUBC:          return "ARMISD::SUBC";
1345   case ARMISD::SUBE:          return "ARMISD::SUBE";
1346 
1347   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1348   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1349 
1350   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1351   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1352   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1353 
1354   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1355 
1356   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1357 
1358   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1359 
1360   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1361 
1362   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1363 
  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1365   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1366 
1367   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1368   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1369   case ARMISD::VCGE:          return "ARMISD::VCGE";
1370   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1371   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1372   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1373   case ARMISD::VCGT:          return "ARMISD::VCGT";
1374   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1375   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1376   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1377   case ARMISD::VTST:          return "ARMISD::VTST";
1378 
1379   case ARMISD::VSHL:          return "ARMISD::VSHL";
1380   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1381   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1382   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1383   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1384   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1385   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1386   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1387   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1388   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1389   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1390   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1391   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1392   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1393   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1394   case ARMISD::VSLI:          return "ARMISD::VSLI";
1395   case ARMISD::VSRI:          return "ARMISD::VSRI";
1396   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1397   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1398   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1399   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1400   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1401   case ARMISD::VDUP:          return "ARMISD::VDUP";
1402   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1403   case ARMISD::VEXT:          return "ARMISD::VEXT";
1404   case ARMISD::VREV64:        return "ARMISD::VREV64";
1405   case ARMISD::VREV32:        return "ARMISD::VREV32";
1406   case ARMISD::VREV16:        return "ARMISD::VREV16";
1407   case ARMISD::VZIP:          return "ARMISD::VZIP";
1408   case ARMISD::VUZP:          return "ARMISD::VUZP";
1409   case ARMISD::VTRN:          return "ARMISD::VTRN";
1410   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1411   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1412   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1413   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1414   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1415   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1416   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1417   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1418   case ARMISD::BFI:           return "ARMISD::BFI";
1419   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1420   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1421   case ARMISD::VBSL:          return "ARMISD::VBSL";
1422   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1423   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1424   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1425   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1426   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1427   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1428   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1429   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1430   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1431   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1432   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1433   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1434   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1435   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1436   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1437   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1438   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1439   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1440   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1441   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1442   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1443   }
1444   return nullptr;
1445 }
1446 
1447 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1448                                           EVT VT) const {
1449   if (!VT.isVector())
1450     return getPointerTy(DL);
1451   return VT.changeVectorElementTypeToInteger();
1452 }
1453 
1454 /// getRegClassFor - Return the register class that should be used for the
1455 /// specified value type.
1456 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1457   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1458   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1459   // load / store 4 to 8 consecutive D registers.
1460   if (Subtarget->hasNEON()) {
1461     if (VT == MVT::v4i64)
1462       return &ARM::QQPRRegClass;
1463     if (VT == MVT::v8i64)
1464       return &ARM::QQQQPRRegClass;
1465   }
1466   return TargetLowering::getRegClassFor(VT);
1467 }
1468 
// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1470 // source/dest is aligned and the copy size is large enough. We therefore want
1471 // to align such objects passed to memory intrinsics.
1472 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1473                                                unsigned &PrefAlign) const {
1474   if (!isa<MemIntrinsic>(CI))
1475     return false;
1476   MinSize = 8;
1477   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1478   // cycle faster than 4-byte aligned LDM.
1479   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1480   return true;
1481 }
1482 
1483 // Create a fast isel object.
1484 FastISel *
1485 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1486                                   const TargetLibraryInfo *libInfo) const {
1487   return ARM::createFastISel(funcInfo, libInfo);
1488 }
1489 
1490 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1491   unsigned NumVals = N->getNumValues();
1492   if (!NumVals)
1493     return Sched::RegPressure;
1494 
1495   for (unsigned i = 0; i != NumVals; ++i) {
1496     EVT VT = N->getValueType(i);
1497     if (VT == MVT::Glue || VT == MVT::Other)
1498       continue;
1499     if (VT.isFloatingPoint() || VT.isVector())
1500       return Sched::ILP;
1501   }
1502 
1503   if (!N->isMachineOpcode())
1504     return Sched::RegPressure;
1505 
  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
1508   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1509   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1510 
1511   if (MCID.getNumDefs() == 0)
1512     return Sched::RegPressure;
1513   if (!Itins->isEmpty() &&
1514       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1515     return Sched::ILP;
1516 
1517   return Sched::RegPressure;
1518 }
1519 
1520 //===----------------------------------------------------------------------===//
1521 // Lowering Code
1522 //===----------------------------------------------------------------------===//
1523 
1524 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1525 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1526   switch (CC) {
1527   default: llvm_unreachable("Unknown condition code!");
1528   case ISD::SETNE:  return ARMCC::NE;
1529   case ISD::SETEQ:  return ARMCC::EQ;
1530   case ISD::SETGT:  return ARMCC::GT;
1531   case ISD::SETGE:  return ARMCC::GE;
1532   case ISD::SETLT:  return ARMCC::LT;
1533   case ISD::SETLE:  return ARMCC::LE;
1534   case ISD::SETUGT: return ARMCC::HI;
1535   case ISD::SETUGE: return ARMCC::HS;
1536   case ISD::SETULT: return ARMCC::LO;
1537   case ISD::SETULE: return ARMCC::LS;
1538   }
1539 }
1540 
1541 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
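/// Some FP predicates have no single ARM condition code once FMSTAT has
/// copied the VFP compare flags to CPSR, so a second condition is returned in
/// CondCode2; e.g. SETONE (ordered and not equal) is checked as MI || GT, and
/// callers generally emit an extra conditional instruction whenever
/// CondCode2 != AL.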
1542 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1543                         ARMCC::CondCodes &CondCode2) {
1544   CondCode2 = ARMCC::AL;
1545   switch (CC) {
1546   default: llvm_unreachable("Unknown FP condition!");
1547   case ISD::SETEQ:
1548   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1549   case ISD::SETGT:
1550   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1551   case ISD::SETGE:
1552   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1553   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1554   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1555   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1556   case ISD::SETO:   CondCode = ARMCC::VC; break;
1557   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1558   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1559   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1560   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1561   case ISD::SETLT:
1562   case ISD::SETULT: CondCode = ARMCC::LT; break;
1563   case ISD::SETLE:
1564   case ISD::SETULE: CondCode = ARMCC::LE; break;
1565   case ISD::SETNE:
1566   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1567   }
1568 }
1569 
1570 //===----------------------------------------------------------------------===//
1571 //                      Calling Convention Implementation
1572 //===----------------------------------------------------------------------===//
1573 
1574 #include "ARMGenCallingConv.inc"
1575 
1576 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1577 /// account presence of floating point hardware and calling convention
1578 /// limitations, such as support for variadic functions.
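/// For example, a plain C call on an AAPCS target with VFP2, a hard-float ABI
/// and no Thumb1 restriction is treated as ARM_AAPCS_VFP, but falls back to
/// base ARM_AAPCS when the call is variadic, since AAPCS requires variadic
/// calls to use the base (integer) calling convention.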
1579 CallingConv::ID
1580 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1581                                            bool isVarArg) const {
1582   switch (CC) {
1583   default:
1584     llvm_unreachable("Unsupported calling convention");
1585   case CallingConv::ARM_AAPCS:
1586   case CallingConv::ARM_APCS:
1587   case CallingConv::GHC:
1588     return CC;
1589   case CallingConv::PreserveMost:
1590     return CallingConv::PreserveMost;
1591   case CallingConv::ARM_AAPCS_VFP:
1592   case CallingConv::Swift:
1593     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1594   case CallingConv::C:
1595     if (!Subtarget->isAAPCS_ABI())
1596       return CallingConv::ARM_APCS;
1597     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1598              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1599              !isVarArg)
1600       return CallingConv::ARM_AAPCS_VFP;
1601     else
1602       return CallingConv::ARM_AAPCS;
1603   case CallingConv::Fast:
1604   case CallingConv::CXX_FAST_TLS:
1605     if (!Subtarget->isAAPCS_ABI()) {
1606       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1607         return CallingConv::Fast;
1608       return CallingConv::ARM_APCS;
1609     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1610       return CallingConv::ARM_AAPCS_VFP;
1611     else
1612       return CallingConv::ARM_AAPCS;
1613   }
1614 }
1615 
1616 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1617 /// CallingConvention.
1618 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1619                                                  bool Return,
1620                                                  bool isVarArg) const {
1621   switch (getEffectiveCallingConv(CC, isVarArg)) {
1622   default:
1623     llvm_unreachable("Unsupported calling convention");
1624   case CallingConv::ARM_APCS:
1625     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1626   case CallingConv::ARM_AAPCS:
1627     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1628   case CallingConv::ARM_AAPCS_VFP:
1629     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1630   case CallingConv::Fast:
1631     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1632   case CallingConv::GHC:
1633     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1634   case CallingConv::PreserveMost:
1635     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1636   }
1637 }
1638 
1639 /// LowerCallResult - Lower the result values of a call into the
1640 /// appropriate copies out of appropriate physical registers.
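/// f64 results (and each half of a v2f64) that the calling convention split
/// into a pair of i32 registers are reassembled here with ARMISD::VMOVDRR;
/// on big-endian targets the two halves are swapped first.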
1641 SDValue ARMTargetLowering::LowerCallResult(
1642     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
1643     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1644     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1645     SDValue ThisVal) const {
1646 
1647   // Assign locations to each value returned by this call.
1648   SmallVector<CCValAssign, 16> RVLocs;
1649   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1650                     *DAG.getContext(), Call);
1651   CCInfo.AnalyzeCallResult(Ins,
1652                            CCAssignFnForNode(CallConv, /* Return*/ true,
1653                                              isVarArg));
1654 
1655   // Copy all of the result registers out of their specified physreg.
1656   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1657     CCValAssign VA = RVLocs[i];
1658 
1659     // Pass 'this' value directly from the argument to return value, to avoid
1660     // reg unit interference
1661     if (i == 0 && isThisReturn) {
1662       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1663              "unexpected return calling convention register assignment");
1664       InVals.push_back(ThisVal);
1665       continue;
1666     }
1667 
1668     SDValue Val;
1669     if (VA.needsCustom()) {
1670       // Handle f64 or half of a v2f64.
1671       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1672                                       InFlag);
1673       Chain = Lo.getValue(1);
1674       InFlag = Lo.getValue(2);
1675       VA = RVLocs[++i]; // skip ahead to next loc
1676       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1677                                       InFlag);
1678       Chain = Hi.getValue(1);
1679       InFlag = Hi.getValue(2);
1680       if (!Subtarget->isLittle())
        std::swap(Lo, Hi);
1682       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1683 
1684       if (VA.getLocVT() == MVT::v2f64) {
1685         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1686         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1687                           DAG.getConstant(0, dl, MVT::i32));
1688 
1689         VA = RVLocs[++i]; // skip ahead to next loc
1690         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1691         Chain = Lo.getValue(1);
1692         InFlag = Lo.getValue(2);
1693         VA = RVLocs[++i]; // skip ahead to next loc
1694         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1695         Chain = Hi.getValue(1);
1696         InFlag = Hi.getValue(2);
1697         if (!Subtarget->isLittle())
          std::swap(Lo, Hi);
1699         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1700         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1701                           DAG.getConstant(1, dl, MVT::i32));
1702       }
1703     } else {
1704       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1705                                InFlag);
1706       Chain = Val.getValue(1);
1707       InFlag = Val.getValue(2);
1708     }
1709 
1710     switch (VA.getLocInfo()) {
1711     default: llvm_unreachable("Unknown loc info!");
1712     case CCValAssign::Full: break;
1713     case CCValAssign::BCvt:
1714       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1715       break;
1716     }
1717 
1718     InVals.push_back(Val);
1719   }
1720 
1721   return Chain;
1722 }
1723 
1724 /// LowerMemOpCallTo - Store the argument to the stack.
1725 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1726                                             SDValue Arg, const SDLoc &dl,
1727                                             SelectionDAG &DAG,
1728                                             const CCValAssign &VA,
1729                                             ISD::ArgFlagsTy Flags) const {
1730   unsigned LocMemOffset = VA.getLocMemOffset();
1731   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1732   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1733                        StackPtr, PtrOff);
1734   return DAG.getStore(
1735       Chain, dl, Arg, PtrOff,
1736       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1737 }
1738 
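/// PassF64ArgInRegs - Split an f64 argument (or one half of a v2f64) into a
/// pair of i32 values with ARMISD::VMOVRRD and hand them to the calling
/// convention: the first half goes in VA's register, and the second goes in
/// NextVA's register if one was assigned, otherwise it is stored to NextVA's
/// stack slot. On big-endian targets the two halves are passed in the
/// opposite order.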
1739 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1740                                          SDValue Chain, SDValue &Arg,
1741                                          RegsToPassVector &RegsToPass,
1742                                          CCValAssign &VA, CCValAssign &NextVA,
1743                                          SDValue &StackPtr,
1744                                          SmallVectorImpl<SDValue> &MemOpChains,
1745                                          ISD::ArgFlagsTy Flags) const {
1746 
1747   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1748                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1749   unsigned id = Subtarget->isLittle() ? 0 : 1;
1750   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1751 
1752   if (NextVA.isRegLoc())
    RegsToPass.push_back(
        std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1 - id)));
1754   else {
1755     assert(NextVA.isMemLoc());
1756     if (!StackPtr.getNode())
1757       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1758                                     getPointerTy(DAG.getDataLayout()));
1759 
1760     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1761                                            dl, DAG, NextVA,
1762                                            Flags));
1763   }
1764 }
1765 
/// LowerCall - Lower a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
1768 /// nodes.
1769 SDValue
1770 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1771                              SmallVectorImpl<SDValue> &InVals) const {
1772   SelectionDAG &DAG                     = CLI.DAG;
1773   SDLoc &dl                             = CLI.DL;
1774   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1775   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1776   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1777   SDValue Chain                         = CLI.Chain;
1778   SDValue Callee                        = CLI.Callee;
1779   bool &isTailCall                      = CLI.IsTailCall;
1780   CallingConv::ID CallConv              = CLI.CallConv;
1781   bool doesNotRet                       = CLI.DoesNotReturn;
1782   bool isVarArg                         = CLI.IsVarArg;
1783 
1784   MachineFunction &MF = DAG.getMachineFunction();
1785   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1786   bool isThisReturn   = false;
1787   bool isSibCall      = false;
1788   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
1789 
1790   // Disable tail calls if they're not supported.
1791   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
1792     isTailCall = false;
1793 
1794   if (isTailCall) {
1795     // Check if it's really possible to do a tail call.
1796     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1797                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1798                                                    Outs, OutVals, Ins, DAG);
1799     if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
1800       report_fatal_error("failed to perform tail call elimination on a call "
1801                          "site marked musttail");
1802     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1803     // detected sibcalls.
1804     if (isTailCall) {
1805       ++NumTailCalls;
1806       isSibCall = true;
1807     }
1808   }
1809 
1810   // Analyze operands of the call, assigning locations to each operand.
1811   SmallVector<CCValAssign, 16> ArgLocs;
1812   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1813                     *DAG.getContext(), Call);
1814   CCInfo.AnalyzeCallOperands(Outs,
1815                              CCAssignFnForNode(CallConv, /* Return*/ false,
1816                                                isVarArg));
1817 
1818   // Get a count of how many bytes are to be pushed on the stack.
1819   unsigned NumBytes = CCInfo.getNextStackOffset();
1820 
1821   // For tail calls, memory operands are available in our caller's stack.
1822   if (isSibCall)
1823     NumBytes = 0;
1824 
1825   // Adjust the stack pointer for the new arguments...
1826   // These operations are automatically eliminated by the prolog/epilog pass
1827   if (!isSibCall)
1828     Chain = DAG.getCALLSEQ_START(Chain,
1829                                  DAG.getIntPtrConstant(NumBytes, dl, true), dl);
1830 
1831   SDValue StackPtr =
1832       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
1833 
1834   RegsToPassVector RegsToPass;
1835   SmallVector<SDValue, 8> MemOpChains;
1836 
1837   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1838   // of tail call optimization, arguments are handled later.
1839   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1840        i != e;
1841        ++i, ++realArgIdx) {
1842     CCValAssign &VA = ArgLocs[i];
1843     SDValue Arg = OutVals[realArgIdx];
1844     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1845     bool isByVal = Flags.isByVal();
1846 
1847     // Promote the value if needed.
1848     switch (VA.getLocInfo()) {
1849     default: llvm_unreachable("Unknown loc info!");
1850     case CCValAssign::Full: break;
1851     case CCValAssign::SExt:
1852       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1853       break;
1854     case CCValAssign::ZExt:
1855       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1856       break;
1857     case CCValAssign::AExt:
1858       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1859       break;
1860     case CCValAssign::BCvt:
1861       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1862       break;
1863     }
1864 
1865     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1866     if (VA.needsCustom()) {
1867       if (VA.getLocVT() == MVT::v2f64) {
1868         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1869                                   DAG.getConstant(0, dl, MVT::i32));
1870         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1871                                   DAG.getConstant(1, dl, MVT::i32));
1872 
1873         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1874                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1875 
1876         VA = ArgLocs[++i]; // skip ahead to next loc
1877         if (VA.isRegLoc()) {
1878           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1879                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1880         } else {
1881           assert(VA.isMemLoc());
1882 
1883           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1884                                                  dl, DAG, VA, Flags));
1885         }
1886       } else {
1887         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1888                          StackPtr, MemOpChains, Flags);
1889       }
1890     } else if (VA.isRegLoc()) {
1891       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1892         assert(VA.getLocVT() == MVT::i32 &&
1893                "unexpected calling convention register assignment");
1894         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1895                "unexpected use of 'returned'");
1896         isThisReturn = true;
1897       }
1898       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1899     } else if (isByVal) {
1900       assert(VA.isMemLoc());
1901       unsigned offset = 0;
1902 
1903       // True if this byval aggregate will be split between registers
1904       // and memory.
1905       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1906       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
1907 
1908       if (CurByValIdx < ByValArgsCount) {
1909 
1910         unsigned RegBegin, RegEnd;
1911         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1912 
1913         EVT PtrVT =
1914             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1915         unsigned int i, j;
1916         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1917           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
1918           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1919           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1920                                      MachinePointerInfo(),
1921                                      DAG.InferPtrAlignment(AddArg));
1922           MemOpChains.push_back(Load.getValue(1));
1923           RegsToPass.push_back(std::make_pair(j, Load));
1924         }
1925 
        // If the parameter size exceeds the register area, the "offset" value
        // helps us calculate the stack slot for the remaining part properly.
1928         offset = RegEnd - RegBegin;
1929 
1930         CCInfo.nextInRegsParam();
1931       }
1932 
1933       if (Flags.getByValSize() > 4*offset) {
1934         auto PtrVT = getPointerTy(DAG.getDataLayout());
1935         unsigned LocMemOffset = VA.getLocMemOffset();
1936         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1937         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
1938         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
1939         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
1940         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
1941                                            MVT::i32);
1942         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
1943                                             MVT::i32);
1944 
1945         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1946         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1947         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1948                                           Ops));
1949       }
1950     } else if (!isSibCall) {
1951       assert(VA.isMemLoc());
1952 
1953       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1954                                              dl, DAG, VA, Flags));
1955     }
1956   }
1957 
1958   if (!MemOpChains.empty())
1959     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
1960 
1961   // Build a sequence of copy-to-reg nodes chained together with token chain
1962   // and flag operands which copy the outgoing args into the appropriate regs.
1963   SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers, so in the
  // case of tail call optimization the copies to registers are lowered later.
1966   if (!isTailCall)
1967     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1968       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1969                                RegsToPass[i].second, InFlag);
1970       InFlag = Chain.getValue(1);
1971     }
1972 
1973   // For tail calls lower the arguments to the 'real' stack slot.
1974   if (isTailCall) {
1975     // Force all the incoming stack arguments to be loaded from the stack
1976     // before any new outgoing arguments are stored to the stack, because the
1977     // outgoing stack slots may alias the incoming argument stack slots, and
1978     // the alias isn't otherwise explicit. This is slightly more conservative
1979     // than necessary, because it means that each store effectively depends
1980     // on every argument instead of just those arguments it would clobber.
1981 
1982     // Do not flag preceding copytoreg stuff together with the following stuff.
1983     InFlag = SDValue();
1984     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1985       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1986                                RegsToPass[i].second, InFlag);
1987       InFlag = Chain.getValue(1);
1988     }
1989     InFlag = SDValue();
1990   }
1991 
  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
1994   // node so that legalize doesn't hack it.
1995   bool isDirect = false;
1996 
1997   const TargetMachine &TM = getTargetMachine();
1998   const Module *Mod = MF.getFunction()->getParent();
1999   const GlobalValue *GV = nullptr;
2000   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2001     GV = G->getGlobal();
2002   bool isStub =
2003       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2004 
2005   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2006   bool isLocalARMFunc = false;
2007   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2008   auto PtrVt = getPointerTy(DAG.getDataLayout());
2009 
2010   if (Subtarget->genLongCalls()) {
2011     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2012            "long-calls codegen is not position independent!");
2013     // Handle a global address or an external symbol. If it's not one of
2014     // those, the target's already in a register, so we don't need to do
2015     // anything extra.
2016     if (isa<GlobalAddressSDNode>(Callee)) {
2017       // Create a constant pool entry for the callee address
2018       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2019       ARMConstantPoolValue *CPV =
2020         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2021 
2022       // Get the address of the callee into a register
2023       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2024       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2025       Callee = DAG.getLoad(
2026           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2027           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2028     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2029       const char *Sym = S->getSymbol();
2030 
2031       // Create a constant pool entry for the callee address
2032       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2033       ARMConstantPoolValue *CPV =
2034         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2035                                       ARMPCLabelIndex, 0);
2036       // Get the address of the callee into a register
2037       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2038       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2039       Callee = DAG.getLoad(
2040           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2041           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2042     }
2043   } else if (isa<GlobalAddressSDNode>(Callee)) {
2044     // If we're optimizing for minimum size and the function is called three or
2045     // more times in this block, we can improve codesize by calling indirectly
2046     // as BLXr has a 16-bit encoding.
2047     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2048     auto *BB = CLI.CS->getParent();
2049     bool PreferIndirect =
2050         Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
2051         count_if(GV->users(), [&BB](const User *U) {
2052           return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
2053         }) > 2;
2054 
2055     if (!PreferIndirect) {
2056       isDirect = true;
2057       bool isDef = GV->isStrongDefinitionForLinker();
2058 
2059       // ARM call to a local ARM function is predicable.
2060       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2061       // tBX takes a register source operand.
2062       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2063         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2064         Callee = DAG.getNode(
2065             ARMISD::WrapperPIC, dl, PtrVt,
2066             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2067         Callee = DAG.getLoad(
2068             PtrVt, dl, DAG.getEntryNode(), Callee,
2069             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2070             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2071                                      MachineMemOperand::MOInvariant);
2072       } else if (Subtarget->isTargetCOFF()) {
2073         assert(Subtarget->isTargetWindows() &&
2074                "Windows is the only supported COFF target");
2075         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2076                                    ? ARMII::MO_DLLIMPORT
2077                                    : ARMII::MO_NO_FLAG;
2078         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
2079                                             TargetFlags);
2080         if (GV->hasDLLImportStorageClass())
2081           Callee =
2082               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2083                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2084                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2085       } else {
2086         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2087       }
2088     }
2089   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2090     isDirect = true;
2091     // tBX takes a register source operand.
2092     const char *Sym = S->getSymbol();
2093     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2094       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2095       ARMConstantPoolValue *CPV =
2096         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2097                                       ARMPCLabelIndex, 4);
2098       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2099       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2100       Callee = DAG.getLoad(
2101           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2102           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2103       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2104       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2105     } else {
2106       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2107     }
2108   }
2109 
2110   // FIXME: handle tail calls differently.
2111   unsigned CallOpc;
2112   if (Subtarget->isThumb()) {
2113     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2114       CallOpc = ARMISD::CALL_NOLINK;
2115     else
2116       CallOpc = ARMISD::CALL;
2117   } else {
2118     if (!isDirect && !Subtarget->hasV5TOps())
2119       CallOpc = ARMISD::CALL_NOLINK;
2120     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2121              // Emit regular call when code size is the priority
2122              !MF.getFunction()->optForMinSize())
      // "mov lr, pc; b _foo" to avoid confusing the return stack predictor.
2124       CallOpc = ARMISD::CALL_NOLINK;
2125     else
2126       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2127   }
2128 
2129   std::vector<SDValue> Ops;
2130   Ops.push_back(Chain);
2131   Ops.push_back(Callee);
2132 
2133   // Add argument registers to the end of the list so that they are known live
2134   // into the call.
2135   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2136     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2137                                   RegsToPass[i].second.getValueType()));
2138 
2139   // Add a register mask operand representing the call-preserved registers.
2140   if (!isTailCall) {
2141     const uint32_t *Mask;
2142     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2143     if (isThisReturn) {
2144       // For 'this' returns, use the R0-preserving mask if applicable
2145       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2146       if (!Mask) {
2147         // Set isThisReturn to false if the calling convention is not one that
2148         // allows 'returned' to be modeled in this way, so LowerCallResult does
2149         // not try to pass 'this' straight through
2150         isThisReturn = false;
2151         Mask = ARI->getCallPreservedMask(MF, CallConv);
2152       }
2153     } else
2154       Mask = ARI->getCallPreservedMask(MF, CallConv);
2155 
2156     assert(Mask && "Missing call preserved mask for calling convention");
2157     Ops.push_back(DAG.getRegisterMask(Mask));
2158   }
2159 
2160   if (InFlag.getNode())
2161     Ops.push_back(InFlag);
2162 
2163   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2164   if (isTailCall) {
2165     MF.getFrameInfo().setHasTailCall();
2166     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2167   }
2168 
2169   // Returns a chain and a flag for retval copy to use.
2170   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2171   InFlag = Chain.getValue(1);
2172 
2173   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2174                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2175   if (!Ins.empty())
2176     InFlag = Chain.getValue(1);
2177 
2178   // Handle result values, copying them out of physregs into vregs that we
2179   // return.
2180   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2181                          InVals, isThisReturn,
2182                          isThisReturn ? OutVals[0] : SDValue());
2183 }
2184 
2185 /// HandleByVal - Every parameter *after* a byval parameter is passed
2186 /// on the stack.  Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
2188 /// this.
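/// Roughly, for a 16-byte byval with 8-byte alignment arriving when R1 is the
/// next free register, and assuming nothing has been passed on the stack yet
/// (NSAA == SP): R1 is wasted so the argument starts at the even register R2,
/// R2-R3 carry the first 8 bytes, and Size is reduced to the 8 bytes that go
/// on the stack.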
2189 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2190                                     unsigned Align) const {
2191   assert((State->getCallOrPrologue() == Prologue ||
2192           State->getCallOrPrologue() == Call) &&
2193          "unhandled ParmContext");
2194 
2195   // Byval (as with any stack) slots are always at least 4 byte aligned.
2196   Align = std::max(Align, 4U);
2197 
2198   unsigned Reg = State->AllocateReg(GPRArgRegs);
2199   if (!Reg)
2200     return;
2201 
2202   unsigned AlignInRegs = Align / 4;
2203   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2204   for (unsigned i = 0; i < Waste; ++i)
2205     Reg = State->AllocateReg(GPRArgRegs);
2206 
2207   if (!Reg)
2208     return;
2209 
2210   unsigned Excess = 4 * (ARM::R4 - Reg);
2211 
  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR regs. In that case we can't split the parameter;
  // we must send it all to the stack. We also must set the NCRN to R4, so we
  // waste all the remaining registers.
2216   const unsigned NSAAOffset = State->getNextStackOffset();
2217   if (NSAAOffset != 0 && Size > Excess) {
2218     while (State->AllocateReg(GPRArgRegs))
2219       ;
2220     return;
2221   }
2222 
  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, i.e. "Reg".
  // If the parameter is small enough to fit in the range [Reg, R4), the end
  // (one past the last) register is Reg + param-size-in-regs; otherwise the
  // parameter is split between registers and the stack, and the end register
  // is R4.
2229   unsigned ByValRegBegin = Reg;
2230   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2231   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note that the first register was already allocated earlier in this
  // method, so here we allocate only the remaining registers we need.
2234   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2235     State->AllocateReg(GPRArgRegs);
2236   // A byval parameter that is split between registers and memory needs its
2237   // size truncated here.
2238   // In the case where the entire structure fits in registers, we set the
2239   // size in memory to zero.
2240   Size = std::max<int>(Size - Excess, 0);
2241 }
2242 
2243 /// MatchingStackOffset - Return true if the given stack call argument is
2244 /// already available in the same position (relatively) of the caller's
2245 /// incoming argument stack.
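/// Used by IsEligibleForTailCallOptimization to recognize outgoing stack
/// arguments that are already in place in the caller's frame and so need no
/// copy for a sibling call.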
2246 static
2247 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2248                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2249                          const TargetInstrInfo *TII) {
2250   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2251   int FI = INT_MAX;
2252   if (Arg.getOpcode() == ISD::CopyFromReg) {
2253     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2254     if (!TargetRegisterInfo::isVirtualRegister(VR))
2255       return false;
2256     MachineInstr *Def = MRI->getVRegDef(VR);
2257     if (!Def)
2258       return false;
2259     if (!Flags.isByVal()) {
2260       if (!TII->isLoadFromStackSlot(*Def, FI))
2261         return false;
2262     } else {
2263       return false;
2264     }
2265   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2266     if (Flags.isByVal())
2267       // ByVal argument is passed in as a pointer but it's now being
2268       // dereferenced. e.g.
2269       // define @foo(%struct.X* %A) {
2270       //   tail call @bar(%struct.X* byval %A)
2271       // }
2272       return false;
2273     SDValue Ptr = Ld->getBasePtr();
2274     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2275     if (!FINode)
2276       return false;
2277     FI = FINode->getIndex();
2278   } else
2279     return false;
2280 
2281   assert(FI != INT_MAX);
2282   if (!MFI.isFixedObjectIndex(FI))
2283     return false;
2284   return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2285 }
2286 
2287 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2288 /// for tail call optimization. Targets which want to do tail call
2289 /// optimization should implement this function.
2290 bool
2291 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2292                                                      CallingConv::ID CalleeCC,
2293                                                      bool isVarArg,
2294                                                      bool isCalleeStructRet,
2295                                                      bool isCallerStructRet,
2296                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2297                                     const SmallVectorImpl<SDValue> &OutVals,
2298                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2299                                                      SelectionDAG& DAG) const {
2300   MachineFunction &MF = DAG.getMachineFunction();
2301   const Function *CallerF = MF.getFunction();
2302   CallingConv::ID CallerCC = CallerF->getCallingConv();
2303 
2304   assert(Subtarget->supportsTailCall());
2305 
2306   // Look for obvious safe cases to perform tail call optimization that do not
2307   // require ABI changes. This is what gcc calls sibcall.
2308 
2309   // Do not sibcall optimize vararg calls unless the call site is not passing
2310   // any arguments.
2311   if (isVarArg && !Outs.empty())
2312     return false;
2313 
2314   // Exception-handling functions need a special set of instructions to indicate
2315   // a return to the hardware. Tail-calling another function would probably
2316   // break this.
2317   if (CallerF->hasFnAttribute("interrupt"))
2318     return false;
2319 
2320   // Also avoid sibcall optimization if either caller or callee uses struct
2321   // return semantics.
2322   if (isCalleeStructRet || isCallerStructRet)
2323     return false;
2324 
2325   // Externally-defined functions with weak linkage should not be
2326   // tail-called on ARM when the OS does not support dynamic
2327   // pre-emption of symbols, as the AAELF spec requires normal calls
2328   // to undefined weak functions to be replaced with a NOP or jump to the
2329   // next instruction. The behaviour of branch instructions in this
2330   // situation (as used for tail calls) is implementation-defined, so we
2331   // cannot rely on the linker replacing the tail call with a return.
2332   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2333     const GlobalValue *GV = G->getGlobal();
2334     const Triple &TT = getTargetMachine().getTargetTriple();
2335     if (GV->hasExternalWeakLinkage() &&
2336         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2337       return false;
2338   }
2339 
2340   // Check that the call results are passed in the same way.
2341   LLVMContext &C = *DAG.getContext();
2342   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2343                                   CCAssignFnForNode(CalleeCC, true, isVarArg),
2344                                   CCAssignFnForNode(CallerCC, true, isVarArg)))
2345     return false;
2346   // The callee has to preserve all registers the caller needs to preserve.
2347   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2348   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2349   if (CalleeCC != CallerCC) {
2350     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2351     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2352       return false;
2353   }
2354 
2355   // If Caller's vararg or byval argument has been split between registers and
2356   // stack, do not perform tail call, since part of the argument is in caller's
2357   // local frame.
2358   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2359   if (AFI_Caller->getArgRegsSaveSize())
2360     return false;
2361 
2362   // If the callee takes no arguments then go on to check the results of the
2363   // call.
2364   if (!Outs.empty()) {
2365     // Check if stack adjustment is needed. For now, do not do this if any
2366     // argument is passed on the stack.
2367     SmallVector<CCValAssign, 16> ArgLocs;
2368     ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
2369     CCInfo.AnalyzeCallOperands(Outs,
2370                                CCAssignFnForNode(CalleeCC, false, isVarArg));
2371     if (CCInfo.getNextStackOffset()) {
2372       // Check if the arguments are already laid out in the right way as
2373       // the caller's fixed stack objects.
2374       MachineFrameInfo &MFI = MF.getFrameInfo();
2375       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2376       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2377       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2378            i != e;
2379            ++i, ++realArgIdx) {
2380         CCValAssign &VA = ArgLocs[i];
2381         EVT RegVT = VA.getLocVT();
2382         SDValue Arg = OutVals[realArgIdx];
2383         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2384         if (VA.getLocInfo() == CCValAssign::Indirect)
2385           return false;
2386         if (VA.needsCustom()) {
2387           // f64 and vector types are split into multiple registers or
2388           // register/stack-slot combinations.  The types will not match
2389           // the registers; give up on memory f64 refs until we figure
2390           // out what to do about this.
2391           if (!VA.isRegLoc())
2392             return false;
2393           if (!ArgLocs[++i].isRegLoc())
2394             return false;
2395           if (RegVT == MVT::v2f64) {
2396             if (!ArgLocs[++i].isRegLoc())
2397               return false;
2398             if (!ArgLocs[++i].isRegLoc())
2399               return false;
2400           }
2401         } else if (!VA.isRegLoc()) {
2402           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2403                                    MFI, MRI, TII))
2404             return false;
2405         }
2406       }
2407     }
2408 
2409     const MachineRegisterInfo &MRI = MF.getRegInfo();
2410     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2411       return false;
2412   }
2413 
2414   return true;
2415 }
2416 
2417 bool
2418 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2419                                   MachineFunction &MF, bool isVarArg,
2420                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2421                                   LLVMContext &Context) const {
2422   SmallVector<CCValAssign, 16> RVLocs;
2423   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2424   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2425                                                     isVarArg));
2426 }
2427 
2428 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2429                                     const SDLoc &DL, SelectionDAG &DAG) {
2430   const MachineFunction &MF = DAG.getMachineFunction();
2431   const Function *F = MF.getFunction();
2432 
2433   StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
2434 
2435   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2436   // version of the "preferred return address". These offsets affect the return
2437   // instruction if this is a return from PL1 without hypervisor extensions.
2438   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2439   //    SWI:     0      "subs pc, lr, #0"
2440   //    ABORT:   +4     "subs pc, lr, #4"
2441   //    UNDEF:   +4/+2  "subs pc, lr, #0"
  // UNDEF varies depending on whether the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2444 
2445   int64_t LROffset;
2446   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2447       IntKind == "ABORT")
2448     LROffset = 4;
2449   else if (IntKind == "SWI" || IntKind == "UNDEF")
2450     LROffset = 0;
2451   else
2452     report_fatal_error("Unsupported interrupt attribute. If present, value "
2453                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2454 
2455   RetOps.insert(RetOps.begin() + 1,
2456                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2457 
2458   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2459 }
2460 
2461 SDValue
2462 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2463                                bool isVarArg,
2464                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2465                                const SmallVectorImpl<SDValue> &OutVals,
2466                                const SDLoc &dl, SelectionDAG &DAG) const {
2467 
2468   // CCValAssign - represent the assignment of the return value to a location.
2469   SmallVector<CCValAssign, 16> RVLocs;
2470 
2471   // CCState - Info about the registers and stack slots.
2472   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2473                     *DAG.getContext(), Call);
2474 
2475   // Analyze outgoing return values.
2476   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
2477                                                isVarArg));
2478 
2479   SDValue Flag;
2480   SmallVector<SDValue, 4> RetOps;
2481   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2482   bool isLittleEndian = Subtarget->isLittle();
2483 
2484   MachineFunction &MF = DAG.getMachineFunction();
2485   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2486   AFI->setReturnRegsCount(RVLocs.size());
2487 
2488   // Copy the result values into the output registers.
2489   for (unsigned i = 0, realRVLocIdx = 0;
2490        i != RVLocs.size();
2491        ++i, ++realRVLocIdx) {
2492     CCValAssign &VA = RVLocs[i];
2493     assert(VA.isRegLoc() && "Can only return in registers!");
2494 
2495     SDValue Arg = OutVals[realRVLocIdx];
2496 
2497     switch (VA.getLocInfo()) {
2498     default: llvm_unreachable("Unknown loc info!");
2499     case CCValAssign::Full: break;
2500     case CCValAssign::BCvt:
2501       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2502       break;
2503     }
2504 
2505     if (VA.needsCustom()) {
2506       if (VA.getLocVT() == MVT::v2f64) {
2507         // Extract the first half and return it in two registers.
2508         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2509                                    DAG.getConstant(0, dl, MVT::i32));
2510         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2511                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2512 
2513         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2514                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
2515                                  Flag);
2516         Flag = Chain.getValue(1);
2517         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2518         VA = RVLocs[++i]; // skip ahead to next loc
2519         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2520                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
2521                                  Flag);
2522         Flag = Chain.getValue(1);
2523         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2524         VA = RVLocs[++i]; // skip ahead to next loc
2525 
2526         // Extract the 2nd half and fall through to handle it as an f64 value.
2527         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2528                           DAG.getConstant(1, dl, MVT::i32));
2529       }
2530       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2531       // available.
2532       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2533                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
2534       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2535                                fmrrd.getValue(isLittleEndian ? 0 : 1),
2536                                Flag);
2537       Flag = Chain.getValue(1);
2538       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2539       VA = RVLocs[++i]; // skip ahead to next loc
2540       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2541                                fmrrd.getValue(isLittleEndian ? 1 : 0),
2542                                Flag);
2543     } else
2544       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2545 
    // Glue all the emitted copies together so the scheduler cannot separate
    // them or interleave other instructions between them.
2548     Flag = Chain.getValue(1);
2549     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2550   }
2551   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2552   const MCPhysReg *I =
2553       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2554   if (I) {
2555     for (; *I; ++I) {
2556       if (ARM::GPRRegClass.contains(*I))
2557         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2558       else if (ARM::DPRRegClass.contains(*I))
2559         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
2560       else
2561         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2562     }
2563   }
2564 
2565   // Update chain and glue.
2566   RetOps[0] = Chain;
2567   if (Flag.getNode())
2568     RetOps.push_back(Flag);
2569 
2570   // CPUs which aren't M-class use a special sequence to return from
2571   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
2572   // though we use "subs pc, lr, #N").
2573   //
2574   // M-class CPUs actually use a normal return sequence with a special
2575   // (hardware-provided) value in LR, so the normal code path works.
2576   if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
2577       !Subtarget->isMClass()) {
2578     if (Subtarget->isThumb1Only())
2579       report_fatal_error("interrupt attribute is not supported in Thumb1");
2580     return LowerInterruptReturn(RetOps, dl, DAG);
2581   }
2582 
2583   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
2584 }
2585 
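// Return true if the value produced by N is only used (possibly through
// CopyToReg, VMOVRRD or BITCAST copies) by a return node; if so, \p Chain is
// updated to the chain the copies hang off. Roughly, this is what allows a
// value-producing call (e.g. a libcall) to be emitted as a tail call.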
2586 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2587   if (N->getNumValues() != 1)
2588     return false;
2589   if (!N->hasNUsesOfValue(1, 0))
2590     return false;
2591 
2592   SDValue TCChain = Chain;
2593   SDNode *Copy = *N->use_begin();
2594   if (Copy->getOpcode() == ISD::CopyToReg) {
2595     // If the copy has a glue operand, we conservatively assume it isn't safe to
2596     // perform a tail call.
2597     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2598       return false;
2599     TCChain = Copy->getOperand(0);
2600   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2601     SDNode *VMov = Copy;
2602     // f64 returned in a pair of GPRs.
2603     SmallPtrSet<SDNode*, 2> Copies;
2604     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2605          UI != UE; ++UI) {
2606       if (UI->getOpcode() != ISD::CopyToReg)
2607         return false;
2608       Copies.insert(*UI);
2609     }
2610     if (Copies.size() > 2)
2611       return false;
2612 
2613     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2614          UI != UE; ++UI) {
2615       SDValue UseChain = UI->getOperand(0);
2616       if (Copies.count(UseChain.getNode()))
2617         // Second CopyToReg
2618         Copy = *UI;
2619       else {
2620         // We are at the top of this chain.
2621         // If the copy has a glue operand, we conservatively assume it
2622         // isn't safe to perform a tail call.
2623         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2624           return false;
2625         // First CopyToReg
2626         TCChain = UseChain;
2627       }
2628     }
2629   } else if (Copy->getOpcode() == ISD::BITCAST) {
2630     // f32 returned in a single GPR.
2631     if (!Copy->hasOneUse())
2632       return false;
2633     Copy = *Copy->use_begin();
2634     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2635       return false;
2636     // If the copy has a glue operand, we conservatively assume it isn't safe to
2637     // perform a tail call.
2638     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2639       return false;
2640     TCChain = Copy->getOperand(0);
2641   } else {
2642     return false;
2643   }
2644 
2645   bool HasRet = false;
2646   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2647        UI != UE; ++UI) {
2648     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2649         UI->getOpcode() != ARMISD::INTRET_FLAG)
2650       return false;
2651     HasRet = true;
2652   }
2653 
2654   if (!HasRet)
2655     return false;
2656 
2657   Chain = TCChain;
2658   return true;
2659 }
2660 
2661 bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2662   if (!Subtarget->supportsTailCall())
2663     return false;
2664 
2665   auto Attr =
2666       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2667   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2668     return false;
2669 
2670   return true;
2671 }
2672 
// Writing a 64-bit value, so it needs to be split into two 32-bit values
// first, and the low and high parts passed through separately.
2675 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2676   SDLoc DL(Op);
2677   SDValue WriteValue = Op->getOperand(2);
2678 
  // This function is only supposed to be called for an i64 type argument.
2680   assert(WriteValue.getValueType() == MVT::i64
2681           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2682 
2683   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2684                            DAG.getConstant(0, DL, MVT::i32));
2685   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2686                            DAG.getConstant(1, DL, MVT::i32));
2687   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2688   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2689 }
2690 
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
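//
// A minimal sketch of the transformation done by LowerConstantPool below
// (illustrative only):
//   (ConstantPool <cst>)  -->  (ARMISD::Wrapper (TargetConstantPool <cst>))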
2697 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2698   EVT PtrVT = Op.getValueType();
2699   // FIXME there is no actual debug info here
2700   SDLoc dl(Op);
2701   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2702   SDValue Res;
2703   if (CP->isMachineConstantPoolEntry())
2704     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2705                                     CP->getAlignment());
2706   else
2707     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2708                                     CP->getAlignment());
2709   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2710 }
2711 
2712 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2713   return MachineJumpTableInfo::EK_Inline;
2714 }
2715 
2716 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2717                                              SelectionDAG &DAG) const {
2718   MachineFunction &MF = DAG.getMachineFunction();
2719   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2720   unsigned ARMPCLabelIndex = 0;
2721   SDLoc DL(Op);
2722   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2723   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2724   SDValue CPAddr;
2725   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
2726   if (!IsPositionIndependent) {
2727     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2728   } else {
2729     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2730     ARMPCLabelIndex = AFI->createPICLabelUId();
2731     ARMConstantPoolValue *CPV =
2732       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2733                                       ARMCP::CPBlockAddress, PCAdj);
2734     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2735   }
2736   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2737   SDValue Result = DAG.getLoad(
2738       PtrVT, DL, DAG.getEntryNode(), CPAddr,
2739       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2740   if (!IsPositionIndependent)
2741     return Result;
2742   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2743   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2744 }
2745 
2746 /// \brief Convert a TLS address reference into the correct sequence of loads
2747 /// and calls to compute the variable's address for Darwin, and return an
2748 /// SDValue containing the final node.
///
2750 /// Darwin only has one TLS scheme which must be capable of dealing with the
2751 /// fully general situation, in the worst case. This means:
2752 ///     + "extern __thread" declaration.
2753 ///     + Defined in a possibly unknown dynamic library.
2754 ///
2755 /// The general system is that each __thread variable has a [3 x i32] descriptor
2756 /// which contains information used by the runtime to calculate the address. The
2757 /// only part of this the compiler needs to know about is the first word, which
2758 /// contains a function pointer that must be called with the address of the
2759 /// entire descriptor in "r0".
2760 ///
2761 /// Since this descriptor may be in a different unit, in general access must
2762 /// proceed along the usual ARM rules. A common sequence to produce is:
2763 ///
2764 ///     movw rT1, :lower16:_var$non_lazy_ptr
2765 ///     movt rT1, :upper16:_var$non_lazy_ptr
2766 ///     ldr r0, [rT1]
2767 ///     ldr rT2, [r0]
2768 ///     blx rT2
2769 ///     [...address now in r0...]
2770 SDValue
2771 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
2772                                                SelectionDAG &DAG) const {
2773   assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
2774   SDLoc DL(Op);
2775 
  // First step is to get the address of the actual global symbol. This is where
2777   // the TLS descriptor lives.
2778   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
2779 
2780   // The first entry in the descriptor is a function pointer that we must call
2781   // to obtain the address of the variable.
2782   SDValue Chain = DAG.getEntryNode();
2783   SDValue FuncTLVGet = DAG.getLoad(
2784       MVT::i32, DL, Chain, DescAddr,
2785       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2786       /* Alignment = */ 4,
2787       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
2788           MachineMemOperand::MOInvariant);
2789   Chain = FuncTLVGet.getValue(1);
2790 
2791   MachineFunction &F = DAG.getMachineFunction();
2792   MachineFrameInfo &MFI = F.getFrameInfo();
2793   MFI.setAdjustsStack(true);
2794 
2795   // TLS calls preserve all registers except those that absolutely must be
2796   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
2797   // silly).
2798   auto TRI =
2799       getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
2800   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
2801   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
2802 
2803   // Finally, we can make the call. This is just a degenerate version of a
  // normal call node: r0 takes the address of the descriptor, and
2805   // returns the address of the variable in this thread.
2806   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
2807   Chain =
2808       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
2809                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
2810                   DAG.getRegisterMask(Mask), Chain.getValue(1));
2811   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
2812 }
2813 
2814 SDValue
2815 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
2816                                                 SelectionDAG &DAG) const {
2817   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
2818 
2819   SDValue Chain = DAG.getEntryNode();
2820   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2821   SDLoc DL(Op);
2822 
2823   // Load the current TEB (thread environment block)
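  // (roughly "mrc p15, #0, <Rt>, c13, c0, #2", i.e. a read of the CP15
  // thread ID register).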
2824   SDValue Ops[] = {Chain,
2825                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
2826                    DAG.getConstant(15, DL, MVT::i32),
2827                    DAG.getConstant(0, DL, MVT::i32),
2828                    DAG.getConstant(13, DL, MVT::i32),
2829                    DAG.getConstant(0, DL, MVT::i32),
2830                    DAG.getConstant(2, DL, MVT::i32)};
2831   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
2832                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
2833 
2834   SDValue TEB = CurrentTEB.getValue(0);
2835   Chain = CurrentTEB.getValue(1);
2836 
2837   // Load the ThreadLocalStoragePointer from the TEB
2838   // A pointer to the TLS array is located at offset 0x2c from the TEB.
2839   SDValue TLSArray =
2840       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
2841   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
2842 
  // The pointer to the thread's TLS data area is found at the TLS index
  // (scaled by 4) offset into the TLS array.
2845 
2846   // Load the TLS index from the C runtime
2847   SDValue TLSIndex =
2848       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
2849   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
2850   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
2851 
2852   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
2853                               DAG.getConstant(2, DL, MVT::i32));
2854   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
2855                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
2856                             MachinePointerInfo());
2857 
2858   // Get the offset of the start of the .tls section (section base)
2859   const auto *GA = cast<GlobalAddressSDNode>(Op);
2860   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
2861   SDValue Offset = DAG.getLoad(
2862       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
2863                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
2864       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2865 
2866   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
2867 }
2868 
2869 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
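//
// Roughly, the sequence built here is:
//   1. materialize the address of a TLSGD constant-pool entry for the global,
//   2. make it PC-relative with an ARMISD::PIC_ADD, and
//   3. pass that address as the single argument to a call to __tls_get_addr,
//      whose return value is the address of the variable.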
2870 SDValue
2871 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2872                                                  SelectionDAG &DAG) const {
2873   SDLoc dl(GA);
2874   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2875   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2876   MachineFunction &MF = DAG.getMachineFunction();
2877   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2878   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2879   ARMConstantPoolValue *CPV =
2880     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2881                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2882   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2883   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2884   Argument = DAG.getLoad(
2885       PtrVT, dl, DAG.getEntryNode(), Argument,
2886       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2887   SDValue Chain = Argument.getValue(1);
2888 
2889   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2890   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2891 
2892   // call __tls_get_addr.
2893   ArgListTy Args;
2894   ArgListEntry Entry;
2895   Entry.Node = Argument;
2896   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2897   Args.push_back(Entry);
2898 
2899   // FIXME: is there useful debug info available here?
2900   TargetLowering::CallLoweringInfo CLI(DAG);
2901   CLI.setDebugLoc(dl).setChain(Chain)
2902     .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
2903                DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
2904 
2905   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2906   return CallResult.first;
2907 }
2908 
2909 // Lower ISD::GlobalTLSAddress using the "initial exec" or
2910 // "local exec" model.
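//
// Roughly: for initial exec, load a GOT-relative offset (GOTTPOFF) from the
// constant pool, make it PC-relative, load the actual offset through the GOT,
// and add it to the thread pointer. For local exec, a link-time TPOFF
// constant is loaded from the constant pool and added to the thread pointer
// directly.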
2911 SDValue
2912 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2913                                         SelectionDAG &DAG,
2914                                         TLSModel::Model model) const {
2915   const GlobalValue *GV = GA->getGlobal();
2916   SDLoc dl(GA);
2917   SDValue Offset;
2918   SDValue Chain = DAG.getEntryNode();
2919   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2920   // Get the Thread Pointer
2921   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2922 
2923   if (model == TLSModel::InitialExec) {
2924     MachineFunction &MF = DAG.getMachineFunction();
2925     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2926     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2927     // Initial exec model.
2928     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2929     ARMConstantPoolValue *CPV =
2930       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2931                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2932                                       true);
2933     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2934     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2935     Offset = DAG.getLoad(
2936         PtrVT, dl, Chain, Offset,
2937         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2938     Chain = Offset.getValue(1);
2939 
2940     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2941     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2942 
2943     Offset = DAG.getLoad(
2944         PtrVT, dl, Chain, Offset,
2945         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2946   } else {
2947     // local exec model
2948     assert(model == TLSModel::LocalExec);
2949     ARMConstantPoolValue *CPV =
2950       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2951     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2952     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2953     Offset = DAG.getLoad(
2954         PtrVT, dl, Chain, Offset,
2955         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2956   }
2957 
2958   // The address of the thread local variable is the add of the thread
2959   // pointer with the offset of the variable.
2960   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2961 }
2962 
2963 SDValue
2964 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2965   if (Subtarget->isTargetDarwin())
2966     return LowerGlobalTLSAddressDarwin(Op, DAG);
2967 
2968   if (Subtarget->isTargetWindows())
2969     return LowerGlobalTLSAddressWindows(Op, DAG);
2970 
2971   // TODO: implement the "local dynamic" model
2972   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
2973   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2974   if (DAG.getTarget().Options.EmulatedTLS)
2975     return LowerToTLSEmulatedModel(GA, DAG);
2976 
2977   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2978 
2979   switch (model) {
2980     case TLSModel::GeneralDynamic:
2981     case TLSModel::LocalDynamic:
2982       return LowerToTLSGeneralDynamicModel(GA, DAG);
2983     case TLSModel::InitialExec:
2984     case TLSModel::LocalExec:
2985       return LowerToTLSExecModels(GA, DAG, model);
2986   }
2987   llvm_unreachable("bogus TLS model");
2988 }
2989 
2990 /// Return true if all users of V are within function F, looking through
2991 /// ConstantExprs.
2992 static bool allUsersAreInFunction(const Value *V, const Function *F) {
2993   SmallVector<const User*,4> Worklist;
2994   for (auto *U : V->users())
2995     Worklist.push_back(U);
2996   while (!Worklist.empty()) {
2997     auto *U = Worklist.pop_back_val();
2998     if (isa<ConstantExpr>(U)) {
2999       for (auto *UU : U->users())
3000         Worklist.push_back(UU);
3001       continue;
3002     }
3003 
3004     auto *I = dyn_cast<Instruction>(U);
3005     if (!I || I->getParent()->getParent() != F)
3006       return false;
3007   }
3008   return true;
3009 }
3010 
/// Return true if all users of V are within some (any) function, looking
/// through ConstantExprs. In other words, return false if V has any global
/// constant users.
3013 static bool allUsersAreInFunctions(const Value *V) {
3014   SmallVector<const User*,4> Worklist;
3015   for (auto *U : V->users())
3016     Worklist.push_back(U);
3017   while (!Worklist.empty()) {
3018     auto *U = Worklist.pop_back_val();
3019     if (isa<ConstantExpr>(U)) {
3020       for (auto *UU : U->users())
3021         Worklist.push_back(UU);
3022       continue;
3023     }
3024 
3025     if (!isa<Instruction>(U))
3026       return false;
3027   }
3028   return true;
3029 }
3030 
3031 // Return true if T is an integer, float or an array/vector of either.
3032 static bool isSimpleType(Type *T) {
3033   if (T->isIntegerTy() || T->isFloatingPointTy())
3034     return true;
3035   Type *SubT = nullptr;
3036   if (T->isArrayTy())
3037     SubT = T->getArrayElementType();
3038   else if (T->isVectorTy())
3039     SubT = T->getVectorElementType();
3040   else
3041     return false;
3042   return SubT->isIntegerTy() || SubT->isFloatingPointTy();
3043 }
3044 
3045 static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
3046                                      EVT PtrVT, SDLoc dl) {
3047   // If we're creating a pool entry for a constant global with unnamed address,
3048   // and the global is small enough, we can emit it inline into the constant pool
3049   // to save ourselves an indirection.
3050   //
3051   // This is a win if the constant is only used in one function (so it doesn't
3052   // need to be duplicated) or duplicating the constant wouldn't increase code
3053   // size (implying the constant is no larger than 4 bytes).
3054   const Function *F = DAG.getMachineFunction().getFunction();
3055 
  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled;
  // otherwise we could decide to inline here (and thus never emit the GV)
  // while fast-isel generated code still requires the GV.
3062   if (!EnableConstpoolPromotion ||
3063       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3064       return SDValue();
3065 
3066   auto *GVar = dyn_cast<GlobalVariable>(GV);
3067   if (!GVar || !GVar->hasInitializer() ||
3068       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3069       !GVar->hasLocalLinkage())
3070     return SDValue();
3071 
3072   // Ensure that we don't try and inline any type that contains pointers. If
3073   // we inline a value that contains relocations, we move the relocations from
3074   // .data to .text which is not ideal.
3075   auto *Init = GVar->getInitializer();
3076   if (!isSimpleType(Init->getType()))
3077     return SDValue();
3078 
  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type requiring alignment greater than 4 bytes. We also can only
  // promote constants that are multiples of 4 bytes in size, or are paddable
  // to a multiple of 4. Currently we only try to pad constants that are
  // strings, for simplicity.
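  // For example, with a 6-byte string initializer (a hypothetical case):
  // Size == 6, RequiredPadding == 2, and the initializer is later re-created
  // with two trailing zero bytes so that PaddedSize == 8.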
3085   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3086   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3087   unsigned Align = GVar->getAlignment();
3088   unsigned RequiredPadding = 4 - (Size % 4);
3089   bool PaddingPossible =
3090     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3091   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
3092     return SDValue();
3093 
3094   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3095   MachineFunction &MF = DAG.getMachineFunction();
3096   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3097 
  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size
  // (Size > 4), ensure we have space to do so up to MaxTotal.
3102   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3103     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3104         ConstpoolPromotionMaxTotal)
3105       return SDValue();
3106 
  // This is only valid if all users are in a single function OR it has users
  // in multiple functions but it is no larger than a pointer. We also check
  // if GVar has constant (non-ConstantExpr) users. If so, it essentially has
  // its address taken.
3111   if (!allUsersAreInFunction(GVar, F) &&
3112       !(Size <= 4 && allUsersAreInFunctions(GVar)))
3113     return SDValue();
3114 
3115   // We're going to inline this global. Pad it out if needed.
3116   if (RequiredPadding != 4) {
3117     StringRef S = CDAInit->getAsString();
3118 
3119     SmallVector<uint8_t,16> V(S.size());
3120     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3121     while (RequiredPadding--)
3122       V.push_back(0);
3123     Init = ConstantDataArray::get(*DAG.getContext(), V);
3124   }
3125 
3126   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3127   SDValue CPAddr =
3128     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
3129   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3130     AFI->markGlobalAsPromotedToConstantPool(GVar);
3131     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3132                                       PaddedSize - 4);
3133   }
3134   ++NumConstpoolPromoted;
3135   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3136 }
3137 
3138 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3139                                                  SelectionDAG &DAG) const {
3140   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3141   SDLoc dl(Op);
3142   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3143   const TargetMachine &TM = getTargetMachine();
3144   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3145     GV = GA->getBaseObject();
3146   bool IsRO =
3147       (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
3148       isa<Function>(GV);
3149 
3150   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3151     if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
3152       return V;
3153 
3154   if (isPositionIndependent()) {
3155     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3156 
3157     MachineFunction &MF = DAG.getMachineFunction();
3158     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3159     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3160     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3161     SDLoc dl(Op);
3162     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3163     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
3164         GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
3165         UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
3166         /*AddCurrentAddress=*/UseGOT_PREL);
3167     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3168     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3169     SDValue Result = DAG.getLoad(
3170         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3171         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3172     SDValue Chain = Result.getValue(1);
3173     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3174     Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3175     if (UseGOT_PREL)
3176       Result =
3177           DAG.getLoad(PtrVT, dl, Chain, Result,
3178                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3179     return Result;
3180   } else if (Subtarget->isROPI() && IsRO) {
3181     // PC-relative.
3182     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3183     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3184     return Result;
3185   } else if (Subtarget->isRWPI() && !IsRO) {
3186     // SB-relative.
3187     ARMConstantPoolValue *CPV =
3188       ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3189     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3190     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3191     SDValue G = DAG.getLoad(
3192         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3193         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3194     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3195     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
3196     return Result;
3197   }
3198 
  // If we have T2 ops, we can materialize the address directly via a
  // movw/movt pair. This is always cheaper.
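  // For example (roughly):
  //   movw r0, :lower16:symbol
  //   movt r0, :upper16:symbol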
3201   if (Subtarget->useMovt(DAG.getMachineFunction())) {
3202     ++NumMovwMovt;
3203     // FIXME: Once remat is capable of dealing with instructions with register
3204     // operands, expand this into two nodes.
3205     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3206                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3207   } else {
3208     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
3209     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3210     return DAG.getLoad(
3211         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3212         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3213   }
3214 }
3215 
3216 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3217                                                     SelectionDAG &DAG) const {
3218   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3219          "ROPI/RWPI not currently supported for Darwin");
3220   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3221   SDLoc dl(Op);
3222   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3223 
3224   if (Subtarget->useMovt(DAG.getMachineFunction()))
3225     ++NumMovwMovt;
3226 
3227   // FIXME: Once remat is capable of dealing with instructions with register
3228   // operands, expand this into multiple nodes
3229   unsigned Wrapper =
3230       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3231 
3232   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3233   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3234 
3235   if (Subtarget->isGVIndirectSymbol(GV))
3236     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3237                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3238   return Result;
3239 }
3240 
3241 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3242                                                      SelectionDAG &DAG) const {
3243   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3244   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
3245          "Windows on ARM expects to use movw/movt");
3246   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3247          "ROPI/RWPI not currently supported for Windows");
3248 
3249   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3250   const ARMII::TOF TargetFlags =
3251     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
3252   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3253   SDValue Result;
3254   SDLoc DL(Op);
3255 
3256   ++NumMovwMovt;
3257 
3258   // FIXME: Once remat is capable of dealing with instructions with register
3259   // operands, expand this into two nodes.
3260   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3261                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
3262                                                   TargetFlags));
3263   if (GV->hasDLLImportStorageClass())
3264     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3265                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3266   return Result;
3267 }
3268 
3269 SDValue
3270 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3271   SDLoc dl(Op);
3272   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3273   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3274                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3275                      Op.getOperand(1), Val);
3276 }
3277 
3278 SDValue
3279 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3280   SDLoc dl(Op);
3281   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3282                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3283 }
3284 
3285 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3286                                                       SelectionDAG &DAG) const {
3287   SDLoc dl(Op);
3288   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3289                      Op.getOperand(0));
3290 }
3291 
3292 SDValue
3293 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3294                                           const ARMSubtarget *Subtarget) const {
3295   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3296   SDLoc dl(Op);
3297   switch (IntNo) {
3298   default: return SDValue();    // Don't custom lower most intrinsics.
3299   case Intrinsic::arm_rbit: {
3300     assert(Op.getOperand(1).getValueType() == MVT::i32 &&
3301            "RBIT intrinsic must have i32 type!");
3302     return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
3303   }
3304   case Intrinsic::thread_pointer: {
3305     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3306     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3307   }
3308   case Intrinsic::eh_sjlj_lsda: {
3309     MachineFunction &MF = DAG.getMachineFunction();
3310     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3311     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3312     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3313     SDValue CPAddr;
3314     bool IsPositionIndependent = isPositionIndependent();
3315     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3316     ARMConstantPoolValue *CPV =
3317       ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
3318                                       ARMCP::CPLSDA, PCAdj);
3319     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3320     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3321     SDValue Result = DAG.getLoad(
3322         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3323         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3324 
3325     if (IsPositionIndependent) {
3326       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3327       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3328     }
3329     return Result;
3330   }
3331   case Intrinsic::arm_neon_vmulls:
3332   case Intrinsic::arm_neon_vmullu: {
3333     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3334       ? ARMISD::VMULLs : ARMISD::VMULLu;
3335     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3336                        Op.getOperand(1), Op.getOperand(2));
3337   }
3338   case Intrinsic::arm_neon_vminnm:
3339   case Intrinsic::arm_neon_vmaxnm: {
3340     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3341       ? ISD::FMINNUM : ISD::FMAXNUM;
3342     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3343                        Op.getOperand(1), Op.getOperand(2));
3344   }
3345   case Intrinsic::arm_neon_vminu:
3346   case Intrinsic::arm_neon_vmaxu: {
3347     if (Op.getValueType().isFloatingPoint())
3348       return SDValue();
3349     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3350       ? ISD::UMIN : ISD::UMAX;
3351     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3352                          Op.getOperand(1), Op.getOperand(2));
3353   }
3354   case Intrinsic::arm_neon_vmins:
3355   case Intrinsic::arm_neon_vmaxs: {
3356     // v{min,max}s is overloaded between signed integers and floats.
3357     if (!Op.getValueType().isFloatingPoint()) {
3358       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3359         ? ISD::SMIN : ISD::SMAX;
3360       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3361                          Op.getOperand(1), Op.getOperand(2));
3362     }
3363     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3364       ? ISD::FMINNAN : ISD::FMAXNAN;
3365     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3366                        Op.getOperand(1), Op.getOperand(2));
3367   }
3368   }
3369 }
3370 
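// Lower ISD::ATOMIC_FENCE. Roughly: emit a "dmb ish" (a full-system "dmb sy"
// on M-class cores, or "dmb ishst" for release-only fences on subtargets that
// prefer ISHST), falling back to an MCR-based barrier on ARMv6 CPUs without a
// native data barrier instruction.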
3371 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3372                                  const ARMSubtarget *Subtarget) {
3373   // FIXME: handle "fence singlethread" more efficiently.
3374   SDLoc dl(Op);
3375   if (!Subtarget->hasDataBarrier()) {
3376     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3377     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3378     // here.
3379     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3380            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3381     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3382                        DAG.getConstant(0, dl, MVT::i32));
3383   }
3384 
3385   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3386   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3387   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3388   if (Subtarget->isMClass()) {
3389     // Only a full system barrier exists in the M-class architectures.
3390     Domain = ARM_MB::SY;
3391   } else if (Subtarget->preferISHSTBarriers() &&
3392              Ord == AtomicOrdering::Release) {
3393     // Swift happens to implement ISHST barriers in a way that's compatible with
3394     // Release semantics but weaker than ISH so we'd be fools not to use
3395     // it. Beware: other processors probably don't!
3396     Domain = ARM_MB::ISHST;
3397   }
3398 
3399   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3400                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3401                      DAG.getConstant(Domain, dl, MVT::i32));
3402 }
3403 
3404 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
3405                              const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 do not have preload instructions.
3407   if (!(Subtarget->isThumb2() ||
3408         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
3409     // Just preserve the chain.
3410     return Op.getOperand(0);
3411 
3412   SDLoc dl(Op);
3413   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
3414   if (!isRead &&
3415       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
3416     // ARMv7 with MP extension has PLDW.
3417     return Op.getOperand(0);
3418 
3419   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3420   if (Subtarget->isThumb()) {
3421     // Invert the bits.
3422     isRead = ~isRead & 1;
3423     isData = ~isData & 1;
3424   }
3425 
3426   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
3427                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
3428                      DAG.getConstant(isData, dl, MVT::i32));
3429 }
3430 
3431 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3432   MachineFunction &MF = DAG.getMachineFunction();
3433   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3434 
3435   // vastart just stores the address of the VarArgsFrameIndex slot into the
3436   // memory location argument.
3437   SDLoc dl(Op);
3438   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3439   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3440   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3441   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3442                       MachinePointerInfo(SV));
3443 }
3444 
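// Reassemble an f64 formal argument whose two halves were assigned to a pair
// of GPRs, or to a GPR plus a stack slot: copy/load the two i32 halves and
// recombine them with ARMISD::VMOVDRR, swapping the halves on big-endian
// targets.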
3445 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
3446                                                 CCValAssign &NextVA,
3447                                                 SDValue &Root,
3448                                                 SelectionDAG &DAG,
3449                                                 const SDLoc &dl) const {
3450   MachineFunction &MF = DAG.getMachineFunction();
3451   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3452 
3453   const TargetRegisterClass *RC;
3454   if (AFI->isThumb1OnlyFunction())
3455     RC = &ARM::tGPRRegClass;
3456   else
3457     RC = &ARM::GPRRegClass;
3458 
3459   // Transform the arguments stored in physical registers into virtual ones.
3460   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3461   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3462 
3463   SDValue ArgValue2;
3464   if (NextVA.isMemLoc()) {
3465     MachineFrameInfo &MFI = MF.getFrameInfo();
3466     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
3467 
3468     // Create load node to retrieve arguments from the stack.
3469     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3470     ArgValue2 = DAG.getLoad(
3471         MVT::i32, dl, Root, FIN,
3472         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3473   } else {
3474     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3475     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3476   }
3477   if (!Subtarget->isLittle())
3478     std::swap (ArgValue, ArgValue2);
3479   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
3480 }
3481 
3482 // The remaining GPRs hold either the beginning of variable-argument
3483 // data, or the beginning of an aggregate passed by value (usually
3484 // byval).  Either way, we allocate stack slots adjacent to the data
3485 // provided by our caller, and store the unallocated registers there.
3486 // If this is a variadic function, the va_list pointer will begin with
3487 // these values; otherwise, this reassembles a (byval) structure that
3488 // was split between registers and memory.
// Return: The frame index the registers were stored into.
3490 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
3491                                       const SDLoc &dl, SDValue &Chain,
3492                                       const Value *OrigArg,
3493                                       unsigned InRegsParamRecordIdx,
3494                                       int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases are possible:
  // Case #1. Non-var-args function, and we meet the first byval parameter.
  //          Set up the first unallocated register as the first byval
  //          register; eat all remaining registers
  //          (these two actions are performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers,
  //          initialize the stack frame.
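  //
  // For instance (a hypothetical layout): if r0 and r1 were consumed by named
  // arguments, then RBegin == R2 and REnd == R4, and the loop below spills r2
  // and r3 with two 4-byte stores into the fixed stack object created here.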
3505 
3506   MachineFunction &MF = DAG.getMachineFunction();
3507   MachineFrameInfo &MFI = MF.getFrameInfo();
3508   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3509   unsigned RBegin, REnd;
3510   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
3511     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
3512   } else {
3513     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3514     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
3515     REnd = ARM::R4;
3516   }
3517 
3518   if (REnd != RBegin)
3519     ArgOffset = -4 * (ARM::R4 - RBegin);
3520 
3521   auto PtrVT = getPointerTy(DAG.getDataLayout());
3522   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
3523   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
3524 
3525   SmallVector<SDValue, 4> MemOps;
3526   const TargetRegisterClass *RC =
3527       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
3528 
3529   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3530     unsigned VReg = MF.addLiveIn(Reg, RC);
3531     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3532     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
3533                                  MachinePointerInfo(OrigArg, 4 * i));
3534     MemOps.push_back(Store);
3535     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3536   }
3537 
3538   if (!MemOps.empty())
3539     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3540   return FrameIndex;
3541 }
3542 
// Set up the stack frame that the va_list pointer will start from.
3544 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3545                                              const SDLoc &dl, SDValue &Chain,
3546                                              unsigned ArgOffset,
3547                                              unsigned TotalArgRegsSaveSize,
3548                                              bool ForceMutable) const {
3549   MachineFunction &MF = DAG.getMachineFunction();
3550   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3551 
  // Try to store any remaining integer argument regs
  // to their spots on the stack so that they may be loaded by dereferencing
  // the result of va_next.
  // If there are no regs to be stored, just point the address after the last
  // argument passed via the stack.
3557   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3558                                   CCInfo.getInRegsParamsCount(),
3559                                   CCInfo.getNextStackOffset(), 4);
3560   AFI->setVarArgsFrameIndex(FrameIndex);
3561 }
3562 
3563 SDValue ARMTargetLowering::LowerFormalArguments(
3564     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3565     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3566     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3567   MachineFunction &MF = DAG.getMachineFunction();
3568   MachineFrameInfo &MFI = MF.getFrameInfo();
3569 
3570   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3571 
3572   // Assign locations to all of the incoming arguments.
3573   SmallVector<CCValAssign, 16> ArgLocs;
3574   ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3575                     *DAG.getContext(), Prologue);
3576   CCInfo.AnalyzeFormalArguments(Ins,
3577                                 CCAssignFnForNode(CallConv, /* Return*/ false,
3578                                                   isVarArg));
3579 
3580   SmallVector<SDValue, 16> ArgValues;
3581   SDValue ArgValue;
3582   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
3583   unsigned CurArgIdx = 0;
3584 
  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet a byval parameter.
  // We also increase this value in the case of a varargs function.
3588   AFI->setArgRegsSaveSize(0);
3589 
3590   // Calculate the amount of stack space that we need to allocate to store
3591   // byval and variadic arguments that are passed in registers.
3592   // We need to know this before we allocate the first byval or variadic
3593   // argument, as they will be allocated a stack slot below the CFA (Canonical
3594   // Frame Address, the stack pointer at entry to the function).
3595   unsigned ArgRegBegin = ARM::R4;
3596   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3597     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3598       break;
3599 
3600     CCValAssign &VA = ArgLocs[i];
3601     unsigned Index = VA.getValNo();
3602     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3603     if (!Flags.isByVal())
3604       continue;
3605 
3606     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3607     unsigned RBegin, REnd;
3608     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3609     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3610 
3611     CCInfo.nextInRegsParam();
3612   }
3613   CCInfo.rewindByValRegsInfo();
3614 
3615   int lastInsIndex = -1;
3616   if (isVarArg && MFI.hasVAStart()) {
3617     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3618     if (RegIdx != array_lengthof(GPRArgRegs))
3619       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3620   }
3621 
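  // r0-r3 are the only integer argument registers, and ArgRegBegin is the
  // lowest of them that has to be saved, so the save area covers every
  // register from ArgRegBegin up to (but not including) r4, 4 bytes each.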
3622   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3623   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3624   auto PtrVT = getPointerTy(DAG.getDataLayout());
3625 
3626   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3627     CCValAssign &VA = ArgLocs[i];
3628     if (Ins[VA.getValNo()].isOrigArg()) {
3629       std::advance(CurOrigArg,
3630                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3631       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3632     }
3633     // Arguments stored in registers.
3634     if (VA.isRegLoc()) {
3635       EVT RegVT = VA.getLocVT();
3636 
3637       if (VA.needsCustom()) {
3638         // f64 and vector types are split up into multiple registers or
3639         // combinations of registers and stack slots.
3640         if (VA.getLocVT() == MVT::v2f64) {
3641           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3642                                                    Chain, DAG, dl);
3643           VA = ArgLocs[++i]; // skip ahead to next loc
3644           SDValue ArgValue2;
3645           if (VA.isMemLoc()) {
3646             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
3647             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3648             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
3649                                     MachinePointerInfo::getFixedStack(
3650                                         DAG.getMachineFunction(), FI));
3651           } else {
3652             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3653                                              Chain, DAG, dl);
3654           }
3655           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3656           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3657                                  ArgValue, ArgValue1,
3658                                  DAG.getIntPtrConstant(0, dl));
3659           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3660                                  ArgValue, ArgValue2,
3661                                  DAG.getIntPtrConstant(1, dl));
3662         } else
3663           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3664 
3665       } else {
3666         const TargetRegisterClass *RC;
3667 
3668         if (RegVT == MVT::f32)
3669           RC = &ARM::SPRRegClass;
3670         else if (RegVT == MVT::f64)
3671           RC = &ARM::DPRRegClass;
3672         else if (RegVT == MVT::v2f64)
3673           RC = &ARM::QPRRegClass;
3674         else if (RegVT == MVT::i32)
3675           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3676                                            : &ARM::GPRRegClass;
3677         else
3678           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3679 
3680         // Transform the arguments in physical registers into virtual ones.
3681         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3682         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3683       }
3684 
3685       // If this is an 8 or 16-bit value, it is really passed promoted
3686       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3687       // truncate to the right size.
3688       switch (VA.getLocInfo()) {
3689       default: llvm_unreachable("Unknown loc info!");
3690       case CCValAssign::Full: break;
3691       case CCValAssign::BCvt:
3692         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3693         break;
3694       case CCValAssign::SExt:
3695         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3696                                DAG.getValueType(VA.getValVT()));
3697         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3698         break;
3699       case CCValAssign::ZExt:
3700         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3701                                DAG.getValueType(VA.getValVT()));
3702         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3703         break;
3704       }
3705 
3706       InVals.push_back(ArgValue);
3707 
3708     } else { // VA.isRegLoc()
3709 
3710       // sanity check
3711       assert(VA.isMemLoc());
3712       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3713 
3714       int index = VA.getValNo();
3715 
3716       // Some Ins[] entries become multiple ArgLoc[] entries.
3717       // Process them only once.
3718       if (index != lastInsIndex)
3719         {
3720           ISD::ArgFlagsTy Flags = Ins[index].Flags;
3721           // FIXME: For now, all byval parameter objects are marked mutable.
3722           // This can be changed with more analysis.
          // In the case of tail call optimization, mark all arguments mutable,
          // since they could be overwritten by the lowering of arguments for a
          // tail call.
3726           if (Flags.isByVal()) {
3727             assert(Ins[index].isOrigArg() &&
3728                    "Byval arguments cannot be implicit");
3729             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3730 
3731             int FrameIndex = StoreByValRegs(
3732                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
3733                 VA.getLocMemOffset(), Flags.getByValSize());
3734             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3735             CCInfo.nextInRegsParam();
3736           } else {
3737             unsigned FIOffset = VA.getLocMemOffset();
3738             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3739                                            FIOffset, true);
3740 
3741             // Create load nodes to retrieve arguments from the stack.
3742             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3743             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3744                                          MachinePointerInfo::getFixedStack(
3745                                              DAG.getMachineFunction(), FI)));
3746           }
3747           lastInsIndex = index;
3748         }
3749     }
3750   }
3751 
3752   // varargs
3753   if (isVarArg && MFI.hasVAStart())
3754     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3755                          CCInfo.getNextStackOffset(),
3756                          TotalArgRegsSaveSize);
3757 
3758   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
3759 
3760   return Chain;
3761 }
3762 
3763 /// isFloatingPointZero - Return true if this is +0.0.
3764 static bool isFloatingPointZero(SDValue Op) {
3765   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3766     return CFP->getValueAPF().isPosZero();
3767   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3768     // Maybe this has already been legalized into the constant pool?
3769     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3770       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3771       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3772         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3773           return CFP->getValueAPF().isPosZero();
3774     }
3775   } else if (Op->getOpcode() == ISD::BITCAST &&
3776              Op->getValueType(0) == MVT::f64) {
3777     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
3778     // created by LowerConstantFP().
3779     SDValue BitcastOp = Op->getOperand(0);
3780     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
3781         isNullConstant(BitcastOp->getOperand(0)))
3782       return true;
3783   }
3784   return false;
3785 }
3786 
/// Returns an appropriate ARM CMP (cmp) and the corresponding condition code
/// for the given operands.
3789 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3790                                      SDValue &ARMcc, SelectionDAG &DAG,
3791                                      const SDLoc &dl) const {
3792   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3793     unsigned C = RHSC->getZExtValue();
3794     if (!isLegalICmpImmediate(C)) {
3795       // Constant does not fit, try adjusting it by one?
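      // For example, (x < C) can be rewritten as (x <= C-1) and (x >= C) as
      // (x > C-1); if the adjusted constant happens to be encodable as an
      // immediate while the original one is not, the rewritten compare avoids
      // materializing the constant in a register.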
3796       switch (CC) {
3797       default: break;
3798       case ISD::SETLT:
3799       case ISD::SETGE:
3800         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3801           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3802           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3803         }
3804         break;
3805       case ISD::SETULT:
3806       case ISD::SETUGE:
3807         if (C != 0 && isLegalICmpImmediate(C-1)) {
3808           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3809           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
3810         }
3811         break;
3812       case ISD::SETLE:
3813       case ISD::SETGT:
3814         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3815           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3816           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3817         }
3818         break;
3819       case ISD::SETULE:
3820       case ISD::SETUGT:
3821         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3822           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3823           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
3824         }
3825         break;
3826       }
3827     }
3828   }
3829 
3830   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3831   ARMISD::NodeType CompareType;
3832   switch (CondCode) {
3833   default:
3834     CompareType = ARMISD::CMP;
3835     break;
3836   case ARMCC::EQ:
3837   case ARMCC::NE:
3838     // Uses only Z Flag
3839     CompareType = ARMISD::CMPZ;
3840     break;
3841   }
3842   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
3843   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3844 }
3845 
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3847 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
3848                                      SelectionDAG &DAG, const SDLoc &dl) const {
3849   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3850   SDValue Cmp;
3851   if (!isFloatingPointZero(RHS))
3852     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3853   else
3854     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3855   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3856 }
3857 
3858 /// duplicateCmp - Glue values can have only one use, so this function
3859 /// duplicates a comparison node.
3860 SDValue
3861 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3862   unsigned Opc = Cmp.getOpcode();
3863   SDLoc DL(Cmp);
3864   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3865     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3866 
3867   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3868   Cmp = Cmp.getOperand(0);
3869   Opc = Cmp.getOpcode();
3870   if (Opc == ARMISD::CMPFP)
3871     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3872   else {
3873     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3874     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3875   }
3876   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3877 }
3878 
3879 std::pair<SDValue, SDValue>
3880 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
3881                                  SDValue &ARMcc) const {
3882   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
3883 
3884   SDValue Value, OverflowCmp;
3885   SDValue LHS = Op.getOperand(0);
3886   SDValue RHS = Op.getOperand(1);
3887   SDLoc dl(Op);
3888 
3889   // FIXME: We are currently always generating CMPs because we don't support
3890   // generating CMN through the backend. This is not as good as the natural
3891   // CMP case because it causes a register dependency and cannot be folded
3892   // later.
3893 
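  // In each case below, ARMcc is set to the condition under which the
  // operation does NOT overflow (VC for the signed cases, HS for the unsigned
  // ones), and OverflowCmp recomputes the flags needed to test it: for the
  // additions the sum is compared back against LHS, and for the subtractions
  // the subtraction itself is redone as a compare.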
3894   switch (Op.getOpcode()) {
3895   default:
3896     llvm_unreachable("Unknown overflow instruction!");
3897   case ISD::SADDO:
3898     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3899     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3900     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3901     break;
3902   case ISD::UADDO:
3903     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3904     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
3905     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
3906     break;
3907   case ISD::SSUBO:
3908     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
3909     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3910     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3911     break;
3912   case ISD::USUBO:
3913     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
3914     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
3915     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
3916     break;
3917   } // switch (...)
3918 
3919   return std::make_pair(Value, OverflowCmp);
3920 }
3921 
3922 
3923 SDValue
3924 ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
3925   // Let legalize expand this if it isn't a legal type yet.
3926   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3927     return SDValue();
3928 
3929   SDValue Value, OverflowCmp;
3930   SDValue ARMcc;
3931   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
3932   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3933   SDLoc dl(Op);
3934   // We use 0 and 1 as false and true values.
3935   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3936   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3937   EVT VT = Op.getValueType();
3938 
3939   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
3940                                  ARMcc, CCR, OverflowCmp);
3941 
3942   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3943   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3944 }
3945 
3946 
3947 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3948   SDValue Cond = Op.getOperand(0);
3949   SDValue SelectTrue = Op.getOperand(1);
3950   SDValue SelectFalse = Op.getOperand(2);
3951   SDLoc dl(Op);
3952   unsigned Opc = Cond.getOpcode();
3953 
3954   if (Cond.getResNo() == 1 &&
3955       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3956        Opc == ISD::USUBO)) {
3957     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
3958       return SDValue();
3959 
3960     SDValue Value, OverflowCmp;
3961     SDValue ARMcc;
3962     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
3963     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3964     EVT VT = Op.getValueType();
3965 
3966     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
3967                    OverflowCmp, DAG);
3968   }
3969 
3970   // Convert:
3971   //
3972   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3973   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3974   //
3975   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3976     const ConstantSDNode *CMOVTrue =
3977       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3978     const ConstantSDNode *CMOVFalse =
3979       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3980 
3981     if (CMOVTrue && CMOVFalse) {
3982       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3983       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3984 
3985       SDValue True;
3986       SDValue False;
3987       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3988         True = SelectTrue;
3989         False = SelectFalse;
3990       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3991         True = SelectFalse;
3992         False = SelectTrue;
3993       }
3994 
3995       if (True.getNode() && False.getNode()) {
3996         EVT VT = Op.getValueType();
3997         SDValue ARMcc = Cond.getOperand(2);
3998         SDValue CCR = Cond.getOperand(3);
3999         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
4000         assert(True.getValueType() == VT);
4001         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
4002       }
4003     }
4004   }
4005 
4006   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4007   // undefined bits before doing a full-word comparison with zero.
4008   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4009                      DAG.getConstant(1, dl, Cond.getValueType()));
4010 
4011   return DAG.getSelectCC(dl, Cond,
4012                          DAG.getConstant(0, dl, Cond.getValueType()),
4013                          SelectTrue, SelectFalse, ISD::SETNE);
4014 }
4015 
4016 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4017                                  bool &swpCmpOps, bool &swpVselOps) {
4018   // Start by selecting the GE condition code for opcodes that return true for
4019   // 'equality'
4020   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4021       CC == ISD::SETULE)
4022     CondCode = ARMCC::GE;
4023 
4024   // and GT for opcodes that return false for 'equality'.
4025   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4026            CC == ISD::SETULT)
4027     CondCode = ARMCC::GT;
4028 
4029   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4030   // to swap the compare operands.
4031   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4032       CC == ISD::SETULT)
4033     swpCmpOps = true;
4034 
4035   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4036   // If we have an unordered opcode, we need to swap the operands to the VSEL
4037   // instruction (effectively negating the condition).
4038   //
4039   // This also has the effect of swapping which one of 'less' or 'greater'
4040   // returns true, so we also swap the compare operands. It also switches
4041   // whether we return true for 'equality', so we compensate by picking the
4042   // opposite condition code to our original choice.
4043   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4044       CC == ISD::SETUGT) {
4045     swpCmpOps = !swpCmpOps;
4046     swpVselOps = !swpVselOps;
4047     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4048   }
4049 
4050   // 'ordered' is 'anything but unordered', so use the VS condition code and
4051   // swap the VSEL operands.
4052   if (CC == ISD::SETO) {
4053     CondCode = ARMCC::VS;
4054     swpVselOps = true;
4055   }
4056 
4057   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4058   // code and swap the VSEL operands.
4059   if (CC == ISD::SETUNE) {
4060     CondCode = ARMCC::EQ;
4061     swpVselOps = true;
4062   }
4063 }
4064 
4065 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4066                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4067                                    SDValue Cmp, SelectionDAG &DAG) const {
4068   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
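    // f64 selects cannot be performed in the FP register file when the FPU
    // only supports single precision, so split both operands into i32 halves
    // with VMOVRRD, select each half with an integer CMOV, and rebuild the
    // f64 result with VMOVDRR. The glue produced by Cmp can only be consumed
    // once, so the second CMOV uses a duplicated compare.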
4069     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4070                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4071     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4072                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4073 
4074     SDValue TrueLow = TrueVal.getValue(0);
4075     SDValue TrueHigh = TrueVal.getValue(1);
4076     SDValue FalseLow = FalseVal.getValue(0);
4077     SDValue FalseHigh = FalseVal.getValue(1);
4078 
4079     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4080                               ARMcc, CCR, Cmp);
4081     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4082                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4083 
4084     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4085   } else {
4086     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4087                        Cmp);
4088   }
4089 }
4090 
4091 static bool isGTorGE(ISD::CondCode CC) {
4092   return CC == ISD::SETGT || CC == ISD::SETGE;
4093 }
4094 
4095 static bool isLTorLE(ISD::CondCode CC) {
4096   return CC == ISD::SETLT || CC == ISD::SETLE;
4097 }
4098 
4099 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4100 // All of these conditions (and their <= and >= counterparts) will do:
4101 //          x < k ? k : x
4102 //          x > k ? x : k
4103 //          k < x ? x : k
4104 //          k > x ? k : x
4105 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4106                             const SDValue TrueVal, const SDValue FalseVal,
4107                             const ISD::CondCode CC, const SDValue K) {
4108   return (isGTorGE(CC) &&
4109           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4110          (isLTorLE(CC) &&
4111           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4112 }
4113 
4114 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4115 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4116                             const SDValue TrueVal, const SDValue FalseVal,
4117                             const ISD::CondCode CC, const SDValue K) {
4118   return (isGTorGE(CC) &&
4119           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4120          (isLTorLE(CC) &&
4121           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4122 }
4123 
4124 // Check if two chained conditionals could be converted into SSAT.
4125 //
// SSAT can replace a set of two conditional selectors that bound a number to
// an interval of the form [~k, k] when k + 1 is a power of 2. Here are some
// examples:
4128 //
4129 //     x < -k ? -k : (x > k ? k : x)
4130 //     x < -k ? -k : (x < k ? x : k)
4131 //     x > -k ? (x > k ? k : x) : -k
4132 //     x < k ? (x < -k ? -k : x) : k
4133 //     etc.
4134 //
4135 // It returns true if the conversion can be done, false otherwise.
4136 // Additionally, the variable is returned in parameter V and the constant in K.
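// For example, clamping x to the interval [-128, 127] (k = 127, k + 1 = 2^7)
// can be done with a single ssat saturating to 8 bits.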
4137 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
4138                                     uint64_t &K) {
4139 
4140   SDValue LHS1 = Op.getOperand(0);
4141   SDValue RHS1 = Op.getOperand(1);
4142   SDValue TrueVal1 = Op.getOperand(2);
4143   SDValue FalseVal1 = Op.getOperand(3);
4144   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4145 
4146   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
4147   if (Op2.getOpcode() != ISD::SELECT_CC)
4148     return false;
4149 
4150   SDValue LHS2 = Op2.getOperand(0);
4151   SDValue RHS2 = Op2.getOperand(1);
4152   SDValue TrueVal2 = Op2.getOperand(2);
4153   SDValue FalseVal2 = Op2.getOperand(3);
4154   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
4155 
4156   // Find out which are the constants and which are the variables
4157   // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1)
                    ? &LHS1
                    : isa<ConstantSDNode>(RHS1) ? &RHS1 : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2)
                    ? &LHS2
                    : isa<ConstantSDNode>(RHS2) ? &RHS2 : nullptr;
4164   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
4165   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
4166   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
4167   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
4168 
  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such cases, V2Tmp != V2 because the comparison operations
  // must work with sign-extended values but the select operations return
  // the original non-extended value.
4173   SDValue V2TmpReg = V2Tmp;
4174   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
4175     V2TmpReg = V2Tmp->getOperand(0);
4176 
4177   // Check that the registers and the constants have the correct values
4178   // in both conditionals
4179   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
4180       V2TmpReg != V2)
4181     return false;
4182 
4183   // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
4194 
4195   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
4196     return false;
4197 
  // Check that the constant in the lower-bound check is
  // the one's complement of the constant in the upper-bound check.
4201   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
4202   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
4203   int64_t PosVal = std::max(Val1, Val2);
4204 
4205   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
4206        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
4207       Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
4208 
4209     V = V2;
4210     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
4211     return true;
4212   }
4213 
4214   return false;
4215 }
4216 
4217 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
4218 
4219   EVT VT = Op.getValueType();
4220   SDLoc dl(Op);
4221 
4222   // Try to convert two saturating conditional selects into a single SSAT
4223   SDValue SatValue;
4224   uint64_t SatConstant;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
       Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant))
4227     return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
4228                        DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
4229 
4230   SDValue LHS = Op.getOperand(0);
4231   SDValue RHS = Op.getOperand(1);
4232   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4233   SDValue TrueVal = Op.getOperand(2);
4234   SDValue FalseVal = Op.getOperand(3);
4235 
4236   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4237     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4238                                                     dl);
4239 
4240     // If softenSetCCOperands only returned one value, we should compare it to
4241     // zero.
4242     if (!RHS.getNode()) {
4243       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4244       CC = ISD::SETNE;
4245     }
4246   }
4247 
4248   if (LHS.getValueType() == MVT::i32) {
4249     // Try to generate VSEL on ARMv8.
4250     // The VSEL instruction can't use all the usual ARM condition
4251     // codes: it only has two bits to select the condition code, so it's
4252     // constrained to use only GE, GT, VS and EQ.
4253     //
4254     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
4255     // swap the operands of the previous compare instruction (effectively
4256     // inverting the compare condition, swapping 'less' and 'greater') and
4257     // sometimes need to swap the operands to the VSEL (which inverts the
4258     // condition in the sense of firing whenever the previous condition didn't)
4259     if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4260                                     TrueVal.getValueType() == MVT::f64)) {
4261       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4262       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
4263           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
4264         CC = ISD::getSetCCInverse(CC, true);
4265         std::swap(TrueVal, FalseVal);
4266       }
4267     }
4268 
4269     SDValue ARMcc;
4270     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4271     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4272     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4273   }
4274 
4275   ARMCC::CondCodes CondCode, CondCode2;
4276   FPCCToARMCC(CC, CondCode, CondCode2);
4277 
4278   // Try to generate VMAXNM/VMINNM on ARMv8.
4279   if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
4280                                   TrueVal.getValueType() == MVT::f64)) {
4281     bool swpCmpOps = false;
4282     bool swpVselOps = false;
4283     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
4284 
4285     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
4286         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
4287       if (swpCmpOps)
4288         std::swap(LHS, RHS);
4289       if (swpVselOps)
4290         std::swap(TrueVal, FalseVal);
4291     }
4292   }
4293 
4294   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4295   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4296   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4297   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4298   if (CondCode2 != ARMCC::AL) {
4299     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
4300     // FIXME: Needs another CMP because flag can have but one use.
4301     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
4302     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
4303   }
4304   return Result;
4305 }
4306 
4307 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4308 /// to morph to an integer compare sequence.
4309 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4310                            const ARMSubtarget *Subtarget) {
4311   SDNode *N = Op.getNode();
4312   if (!N->hasOneUse())
4313     // Otherwise it requires moving the value from fp to integer registers.
4314     return false;
4315   if (!N->getNumValues())
4316     return false;
4317   EVT VT = Op.getValueType();
4318   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4319     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4320     // vmrs are very slow, e.g. cortex-a8.
4321     return false;
4322 
4323   if (isFloatingPointZero(Op)) {
4324     SeenZero = true;
4325     return true;
4326   }
4327   return ISD::isNormalLoad(N);
4328 }
4329 
4330 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4331   if (isFloatingPointZero(Op))
4332     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4333 
4334   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4335     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4336                        Ld->getPointerInfo(), Ld->getAlignment(),
4337                        Ld->getMemOperand()->getFlags());
4338 
4339   llvm_unreachable("Unknown VFP cmp argument!");
4340 }
4341 
4342 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
4343                            SDValue &RetVal1, SDValue &RetVal2) {
4344   SDLoc dl(Op);
4345 
4346   if (isFloatingPointZero(Op)) {
4347     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
4348     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
4349     return;
4350   }
4351 
4352   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
4353     SDValue Ptr = Ld->getBasePtr();
4354     RetVal1 =
4355         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
4356                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
4357 
4358     EVT PtrType = Ptr.getValueType();
4359     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
4360     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
4361                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
4362     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
4363                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
4364                           Ld->getMemOperand()->getFlags());
4365     return;
4366   }
4367 
4368   llvm_unreachable("Unknown VFP cmp argument!");
4369 }
4370 
4371 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
4372 /// f32 and even f64 comparisons to integer ones.
4373 SDValue
4374 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
4375   SDValue Chain = Op.getOperand(0);
4376   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4377   SDValue LHS = Op.getOperand(2);
4378   SDValue RHS = Op.getOperand(3);
4379   SDValue Dest = Op.getOperand(4);
4380   SDLoc dl(Op);
4381 
4382   bool LHSSeenZero = false;
4383   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
4384   bool RHSSeenZero = false;
4385   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
4386   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
4387     // If unsafe fp math optimization is enabled and there are no other uses of
4388     // the CMP operands, and the condition code is EQ or NE, we can optimize it
4389     // to an integer comparison.
4390     if (CC == ISD::SETOEQ)
4391       CC = ISD::SETEQ;
4392     else if (CC == ISD::SETUNE)
4393       CC = ISD::SETNE;
4394 
4395     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
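    // Masking off the sign bit makes -0.0 compare equal to +0.0 in the
    // integer comparison, matching the floating-point equality being
    // replaced.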
4396     SDValue ARMcc;
4397     if (LHS.getValueType() == MVT::f32) {
4398       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4399                         bitcastf32Toi32(LHS, DAG), Mask);
4400       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4401                         bitcastf32Toi32(RHS, DAG), Mask);
4402       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4403       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4404       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4405                          Chain, Dest, ARMcc, CCR, Cmp);
4406     }
4407 
4408     SDValue LHS1, LHS2;
4409     SDValue RHS1, RHS2;
4410     expandf64Toi32(LHS, DAG, LHS1, LHS2);
4411     expandf64Toi32(RHS, DAG, RHS1, RHS2);
4412     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
4413     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
4414     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4415     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4416     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4417     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
4418     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
4419   }
4420 
4421   return SDValue();
4422 }
4423 
4424 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
4425   SDValue Chain = Op.getOperand(0);
4426   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4427   SDValue LHS = Op.getOperand(2);
4428   SDValue RHS = Op.getOperand(3);
4429   SDValue Dest = Op.getOperand(4);
4430   SDLoc dl(Op);
4431 
4432   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
4433     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
4434                                                     dl);
4435 
4436     // If softenSetCCOperands only returned one value, we should compare it to
4437     // zero.
4438     if (!RHS.getNode()) {
4439       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4440       CC = ISD::SETNE;
4441     }
4442   }
4443 
4444   if (LHS.getValueType() == MVT::i32) {
4445     SDValue ARMcc;
4446     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4447     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4448     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4449                        Chain, Dest, ARMcc, CCR, Cmp);
4450   }
4451 
4452   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
4453 
4454   if (getTargetMachine().Options.UnsafeFPMath &&
4455       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
4456        CC == ISD::SETNE || CC == ISD::SETUNE)) {
4457     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
4458       return Result;
4459   }
4460 
4461   ARMCC::CondCodes CondCode, CondCode2;
4462   FPCCToARMCC(CC, CondCode, CondCode2);
4463 
4464   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4465   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
4466   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4467   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4468   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
4469   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4470   if (CondCode2 != ARMCC::AL) {
4471     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
4472     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
4473     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
4474   }
4475   return Res;
4476 }
4477 
4478 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
4479   SDValue Chain = Op.getOperand(0);
4480   SDValue Table = Op.getOperand(1);
4481   SDValue Index = Op.getOperand(2);
4482   SDLoc dl(Op);
4483 
4484   EVT PTy = getPointerTy(DAG.getDataLayout());
4485   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
4486   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
4487   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
4488   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
4489   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
4490   if (Subtarget->isThumb2()) {
4491     // Thumb2 uses a two-level jump. That is, it jumps into the jump table
4492     // which does another jump to the destination. This also makes it easier
4493     // to translate it to TBB / TBH later.
4494     // FIXME: This might not work if the function is extremely large.
4495     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
4496                        Addr, Op.getOperand(2), JTI);
4497   }
4498   if (isPositionIndependent() || Subtarget->isROPI()) {
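    // In PIC and ROPI modes the jump table entries are offsets relative to
    // the table itself, so load the entry and add the table address to form
    // the destination.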
4499     Addr =
4500         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
4501                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4502     Chain = Addr.getValue(1);
4503     Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
4504     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4505   } else {
4506     Addr =
4507         DAG.getLoad(PTy, dl, Chain, Addr,
4508                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
4509     Chain = Addr.getValue(1);
4510     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
4511   }
4512 }
4513 
4514 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
4515   EVT VT = Op.getValueType();
4516   SDLoc dl(Op);
4517 
4518   if (Op.getValueType().getVectorElementType() == MVT::i32) {
4519     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
4520       return Op;
4521     return DAG.UnrollVectorOp(Op.getNode());
4522   }
4523 
4524   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
4525          "Invalid type for custom lowering!");
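  // The remaining case, v4f32 -> v4i16, is lowered by converting to v4i32
  // first and then truncating; anything else is unrolled.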
4526   if (VT != MVT::v4i16)
4527     return DAG.UnrollVectorOp(Op.getNode());
4528 
4529   Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
4530   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
4531 }
4532 
4533 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
4534   EVT VT = Op.getValueType();
4535   if (VT.isVector())
4536     return LowerVectorFP_TO_INT(Op, DAG);
4537   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4538     RTLIB::Libcall LC;
4539     if (Op.getOpcode() == ISD::FP_TO_SINT)
4540       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4541                               Op.getValueType());
4542     else
4543       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4544                               Op.getValueType());
4545     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4546                        /*isSigned*/ false, SDLoc(Op)).first;
4547   }
4548 
4549   return Op;
4550 }
4551 
4552 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
4553   EVT VT = Op.getValueType();
4554   SDLoc dl(Op);
4555 
4556   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
4557     if (VT.getVectorElementType() == MVT::f32)
4558       return Op;
4559     return DAG.UnrollVectorOp(Op.getNode());
4560   }
4561 
4562   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
4563          "Invalid type for custom lowering!");
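  // The remaining case, v4i16 -> v4f32, is lowered by sign- or zero-extending
  // to v4i32 and then converting; anything else is unrolled.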
4564   if (VT != MVT::v4f32)
4565     return DAG.UnrollVectorOp(Op.getNode());
4566 
4567   unsigned CastOpc;
4568   unsigned Opc;
4569   switch (Op.getOpcode()) {
4570   default: llvm_unreachable("Invalid opcode!");
4571   case ISD::SINT_TO_FP:
4572     CastOpc = ISD::SIGN_EXTEND;
4573     Opc = ISD::SINT_TO_FP;
4574     break;
4575   case ISD::UINT_TO_FP:
4576     CastOpc = ISD::ZERO_EXTEND;
4577     Opc = ISD::UINT_TO_FP;
4578     break;
4579   }
4580 
4581   Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
4582   return DAG.getNode(Opc, dl, VT, Op);
4583 }
4584 
4585 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4586   EVT VT = Op.getValueType();
4587   if (VT.isVector())
4588     return LowerVectorINT_TO_FP(Op, DAG);
4589   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4590     RTLIB::Libcall LC;
4591     if (Op.getOpcode() == ISD::SINT_TO_FP)
4592       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4593                               Op.getValueType());
4594     else
4595       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4596                               Op.getValueType());
4597     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4598                        /*isSigned*/ false, SDLoc(Op)).first;
4599   }
4600 
4601   return Op;
4602 }
4603 
4604 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
4605   // Implement fcopysign with a fabs and a conditional fneg.
4606   SDValue Tmp0 = Op.getOperand(0);
4607   SDValue Tmp1 = Op.getOperand(1);
4608   SDLoc dl(Op);
4609   EVT VT = Op.getValueType();
4610   EVT SrcVT = Tmp1.getValueType();
4611   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
4612     Tmp0.getOpcode() == ARMISD::VMOVDRR;
4613   bool UseNEON = !InGPR && Subtarget->hasNEON();
4614 
4615   if (UseNEON) {
4616     // Use VBSL to copy the sign bit.
4617     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
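    // This is the NEON modified-immediate encoding of 0x80 << 24, i.e. a
    // vector whose 32-bit elements contain only the f32 sign bit
    // (0x80000000); for f64 the mask is shifted up to bit 63 below.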
4618     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
4619                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
4620     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
4621     if (VT == MVT::f64)
4622       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4623                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
4624                          DAG.getConstant(32, dl, MVT::i32));
4625     else /*if (VT == MVT::f32)*/
4626       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
4627     if (SrcVT == MVT::f32) {
4628       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
4629       if (VT == MVT::f64)
4630         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
4631                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
4632                            DAG.getConstant(32, dl, MVT::i32));
4633     } else if (VT == MVT::f32)
4634       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
4635                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
4636                          DAG.getConstant(32, dl, MVT::i32));
4637     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
4638     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
4639 
4640     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
4641                                             dl, MVT::i32);
4642     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
4643     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
4644                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
4645 
4646     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
4647                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
4648                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
4649     if (VT == MVT::f32) {
4650       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
4651       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
4652                         DAG.getConstant(0, dl, MVT::i32));
4653     } else {
4654       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
4655     }
4656 
4657     return Res;
4658   }
4659 
4660   // Bitcast operand 1 to i32.
4661   if (SrcVT == MVT::f64)
4662     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4663                        Tmp1).getValue(1);
4664   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
4665 
4666   // Or in the signbit with integer operations.
4667   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
4668   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4669   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
4670   if (VT == MVT::f32) {
4671     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
4672                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
4673     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
4674                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
4675   }
4676 
4677   // f64: Or the high part with signbit and then combine two parts.
4678   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
4679                      Tmp0);
4680   SDValue Lo = Tmp0.getValue(0);
4681   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
4682   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
4683   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
4684 }
4685 
4686 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
4687   MachineFunction &MF = DAG.getMachineFunction();
4688   MachineFrameInfo &MFI = MF.getFrameInfo();
4689   MFI.setReturnAddressIsTaken(true);
4690 
4691   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
4692     return SDValue();
4693 
4694   EVT VT = Op.getValueType();
4695   SDLoc dl(Op);
4696   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4697   if (Depth) {
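    // For a non-zero depth, load the return address out of the parent frame;
    // with the frame layout assumed here, the saved LR lives 4 bytes above
    // the frame address computed by LowerFRAMEADDR.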
4698     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4699     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
4700     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
4701                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
4702                        MachinePointerInfo());
4703   }
4704 
4705   // Return LR, which contains the return address. Mark it an implicit live-in.
4706   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4707   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
4708 }
4709 
4710 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
4711   const ARMBaseRegisterInfo &ARI =
4712     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
4713   MachineFunction &MF = DAG.getMachineFunction();
4714   MachineFrameInfo &MFI = MF.getFrameInfo();
4715   MFI.setFrameAddressIsTaken(true);
4716 
4717   EVT VT = Op.getValueType();
4718   SDLoc dl(Op);  // FIXME probably not meaningful
4719   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4720   unsigned FrameReg = ARI.getFrameRegister(MF);
4721   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
4722   while (Depth--)
4723     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
4724                             MachinePointerInfo());
4725   return FrameAddr;
4726 }
4727 
4728 // FIXME? Maybe this could be a TableGen attribute on some registers and
4729 // this table could be generated automatically from RegInfo.
4730 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
4731                                               SelectionDAG &DAG) const {
4732   unsigned Reg = StringSwitch<unsigned>(RegName)
4733                        .Case("sp", ARM::SP)
4734                        .Default(0);
4735   if (Reg)
4736     return Reg;
4737   report_fatal_error(Twine("Invalid register name \""
4738                               + StringRef(RegName)  + "\"."));
4739 }
4740 
// The result is a 64-bit value, so split it into two 32-bit values and
// return them as a pair of values.
4743 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
4744                                 SelectionDAG &DAG) {
4745   SDLoc DL(N);
4746 
  // This function is only supposed to be called for an i64 destination type.
  assert(N->getValueType(0) == MVT::i64 &&
         "ExpandREAD_REGISTER called for non-i64 type result.");
4750 
4751   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
4752                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
4753                              N->getOperand(0),
4754                              N->getOperand(1));
4755 
4756   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
4757                     Read.getValue(1)));
4758   Results.push_back(Read.getOperand(0));
4759 }
4760 
4761 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
4762 /// When \p DstVT, the destination type of \p BC, is on the vector
4763 /// register bank and the source of bitcast, \p Op, operates on the same bank,
4764 /// it might be possible to combine them, such that everything stays on the
4765 /// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
4768 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
4769                                                 SelectionDAG &DAG) {
4770   SDValue Op = BC->getOperand(0);
4771   EVT DstVT = BC->getValueType(0);
4772 
4773   // The only vector instruction that can produce a scalar (remember,
4774   // since the bitcast was about to be turned into VMOVDRR, the source
4775   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
4776   // Moreover, we can do this combine only if there is one use.
4777   // Finally, if the destination type is not a vector, there is not
4778   // much point on forcing everything on the vector bank.
4779   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
4780       !Op.hasOneUse())
4781     return SDValue();
4782 
4783   // If the index is not constant, we will introduce an additional
4784   // multiply that will stick.
4785   // Give up in that case.
4786   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
4787   if (!Index)
4788     return SDValue();
4789   unsigned DstNumElt = DstVT.getVectorNumElements();
4790 
4791   // Compute the new index.
4792   const APInt &APIntIndex = Index->getAPIntValue();
4793   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
4794   NewIndex *= APIntIndex;
4795   // Check if the new constant index fits into i32.
4796   if (NewIndex.getBitWidth() > 32)
4797     return SDValue();
4798 
4799   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
4800   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
4801   SDLoc dl(Op);
4802   SDValue ExtractSrc = Op.getOperand(0);
4803   EVT VecVT = EVT::getVectorVT(
4804       *DAG.getContext(), DstVT.getScalarType(),
4805       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
4806   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
4807   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
4808                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
4809 }
4810 
4811 /// ExpandBITCAST - If the target supports VFP, this function is called to
4812 /// expand a bit convert where either the source or destination type is i64 to
4813 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
4814 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
4815 /// vectors), since the legalizer won't know what to do with that.
4816 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
4817   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4818   SDLoc dl(N);
4819   SDValue Op = N->getOperand(0);
4820 
4821   // This function is only supposed to be called for i64 types, either as the
4822   // source or destination of the bit convert.
4823   EVT SrcVT = Op.getValueType();
4824   EVT DstVT = N->getValueType(0);
4825   assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
4826          "ExpandBITCAST called for non-i64 type");
4827 
4828   // Turn i64->f64 into VMOVDRR.
4829   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
4830     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
4831     // if we can combine the bitcast with its source.
4832     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
4833       return Val;
4834 
4835     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4836                              DAG.getConstant(0, dl, MVT::i32));
4837     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
4838                              DAG.getConstant(1, dl, MVT::i32));
4839     return DAG.getNode(ISD::BITCAST, dl, DstVT,
4840                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
4841   }
4842 
4843   // Turn f64->i64 into VMOVRRD.
4844   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
4845     SDValue Cvt;
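    // On big-endian targets the lane order of the source vector is the
    // reverse of the byte order of the i64 result, so element-reverse the
    // 64-bit value with VREV64 before moving the two halves out.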
4846     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
4847         SrcVT.getVectorNumElements() > 1)
4848       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4849                         DAG.getVTList(MVT::i32, MVT::i32),
4850                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
4851     else
4852       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
4853                         DAG.getVTList(MVT::i32, MVT::i32), Op);
4854     // Merge the pieces into a single i64 value.
4855     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
4856   }
4857 
4858   return SDValue();
4859 }
4860 
4861 /// getZeroVector - Returns a vector of specified type with all zero elements.
4862 /// Zero vectors are used to represent vector negation and in those cases
4863 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
4864 /// not support i64 elements, so sometimes the zero vectors will need to be
4865 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
4866 /// zero vector.
4867 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4868   assert(VT.isVector() && "Expected a vector type");
4869   // The canonical modified immediate encoding of a zero vector is....0!
4870   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
4871   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
4872   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
4873   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4874 }
4875 
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
4878 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
4879                                                 SelectionDAG &DAG) const {
4880   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4881   EVT VT = Op.getValueType();
4882   unsigned VTBits = VT.getSizeInBits();
4883   SDLoc dl(Op);
4884   SDValue ShOpLo = Op.getOperand(0);
4885   SDValue ShOpHi = Op.getOperand(1);
4886   SDValue ShAmt  = Op.getOperand(2);
4887   SDValue ARMcc;
4888   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4889 
4890   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4891 
4892   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4893                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4894   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4895   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4896                                    DAG.getConstant(VTBits, dl, MVT::i32));
4897   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4898   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4899   SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4900 
4901   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4902   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4903                           ISD::SETGE, ARMcc, DAG, dl);
4904   SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4905   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
4906                            CCR, Cmp);
4907 
4908   SDValue Ops[2] = { Lo, Hi };
4909   return DAG.getMergeValues(Ops, dl);
4910 }
4911 
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
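///
/// Roughly, for a 32-bit VT and shift amount Amt:
///   Lo = ShOpLo << Amt
///   Hi = (Amt >= 32) ? (ShOpLo << (Amt - 32))
///                    : (ShOpLo >>u (32 - Amt)) | (ShOpHi << Amt)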
4914 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
4915                                                SelectionDAG &DAG) const {
4916   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4917   EVT VT = Op.getValueType();
4918   unsigned VTBits = VT.getSizeInBits();
4919   SDLoc dl(Op);
4920   SDValue ShOpLo = Op.getOperand(0);
4921   SDValue ShOpHi = Op.getOperand(1);
4922   SDValue ShAmt  = Op.getOperand(2);
4923   SDValue ARMcc;
4924 
4925   assert(Op.getOpcode() == ISD::SHL_PARTS);
4926   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
4927                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
4928   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4929   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
4930                                    DAG.getConstant(VTBits, dl, MVT::i32));
4931   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4932   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4933 
4934   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
4935   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4936   SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
4937                           ISD::SETGE, ARMcc, DAG, dl);
4938   SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4939   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
4940                            CCR, Cmp);
4941 
4942   SDValue Ops[2] = { Lo, Hi };
4943   return DAG.getMergeValues(Ops, dl);
4944 }
4945 
4946 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4947                                             SelectionDAG &DAG) const {
4948   // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
  // so that the shift and the AND get folded into a bitfield extract.
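  // For example, FPSCR[23:22] == 0b01 (round towards plus infinity) gives
  // ((1 + 1) & 3) == 2, which is the FLT_ROUNDS value for upward rounding.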
4952   SDLoc dl(Op);
4953   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
4954                               DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
4955                                               MVT::i32));
4956   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
4957                                   DAG.getConstant(1U << 22, dl, MVT::i32));
4958   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4959                               DAG.getConstant(22, dl, MVT::i32));
4960   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4961                      DAG.getConstant(3, dl, MVT::i32));
4962 }
4963 
4964 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
4965                          const ARMSubtarget *ST) {
4966   SDLoc dl(N);
4967   EVT VT = N->getValueType(0);
4968   if (VT.isVector()) {
4969     assert(ST->hasNEON());
4970 
4971     // Compute the least significant set bit: LSB = X & -X
4972     SDValue X = N->getOperand(0);
4973     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
4974     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
4975 
4976     EVT ElemTy = VT.getVectorElementType();
4977 
4978     if (ElemTy == MVT::i8) {
4979       // Compute with: cttz(x) = ctpop(lsb - 1)
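      // For example, x = 0b0110100: lsb = 0b0000100, lsb - 1 = 0b0000011,
      // and ctpop(0b0000011) == 2 == cttz(x).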
4980       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4981                                 DAG.getTargetConstant(1, dl, ElemTy));
4982       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
4983       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
4984     }
4985 
4986     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
4987         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
4988       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
4989       unsigned NumBits = ElemTy.getSizeInBits();
4990       SDValue WidthMinus1 =
4991           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
4992                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
4993       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
4994       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
4995     }
4996 
4997     // Compute with: cttz(x) = ctpop(lsb - 1)
4998 
4999     // Since we can only compute the number of bits in a byte with vcnt.8, we
5000     // have to gather the result with pairwise addition (vpaddl) for i16, i32,
5001     // and i64.
5002 
5003     // Compute LSB - 1.
5004     SDValue Bits;
5005     if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register; 0x1eff is the NEON
      // modified-immediate encoding (Op=1, Cmode=1110, Imm=0xff) of that
      // all-ones value, so adding it computes LSB - 1.
5007       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5008                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
5009       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
5010     } else {
5011       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5012                                 DAG.getTargetConstant(1, dl, ElemTy));
5013       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5014     }
5015 
5016     // Count #bits with vcnt.8.
5017     EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5018     SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
5019     SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
5020 
5021     // Gather the #bits with vpaddl (pairwise add.)
5022     EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5023     SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
5024         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5025         Cnt8);
5026     if (ElemTy == MVT::i16)
5027       return Cnt16;
5028 
5029     EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
5030     SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
5031         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5032         Cnt16);
5033     if (ElemTy == MVT::i32)
5034       return Cnt32;
5035 
5036     assert(ElemTy == MVT::i64);
5037     SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5038         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
5039         Cnt32);
5040     return Cnt64;
5041   }
5042 
5043   if (!ST->hasV6T2Ops())
5044     return SDValue();
5045 
5046   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
5047   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
5048 }
5049 
/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
/// for each 16-bit element of the operand, repeated.  The basic idea is to
5052 /// leverage vcnt to get the 8-bit counts, gather and add the results.
5053 ///
5054 /// Trace for v4i16:
5055 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
5056 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
5057 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
5058 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
5059 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
5060 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
5061 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
5062 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
5063 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
5064   EVT VT = N->getValueType(0);
5065   SDLoc DL(N);
5066 
5067   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5068   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
5069   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
5070   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
5071   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
5072   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
5073 }
5074 
5075 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
5076 /// bit-count for each 16-bit element from the operand.  We need slightly
5077 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
5078 /// 64/128-bit registers.
5079 ///
5080 /// Trace for v4i16:
5081 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
5082 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
5083 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
5084 /// v4i16:Extracted = [k0    k1    k2    k3    ]
5085 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
5086   EVT VT = N->getValueType(0);
5087   SDLoc DL(N);
5088 
5089   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
5090   if (VT.is64BitVector()) {
5091     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
5092     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
5093                        DAG.getIntPtrConstant(0, DL));
5094   } else {
5095     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
5096                                     BitCounts, DAG.getIntPtrConstant(0, DL));
5097     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
5098   }
5099 }
5100 
5101 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
5102 /// bit-count for each 32-bit element from the operand.  The idea here is
5103 /// to split the vector into 16-bit elements, leverage the 16-bit count
5104 /// routine, and then combine the results.
5105 ///
5106 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
5107 /// input    = [v0    v1    ] (vi: 32-bit elements)
5108 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
5109 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
5110 /// vrev: N0 = [k1 k0 k3 k2 ]
5111 ///            [k0 k1 k2 k3 ]
5112 ///       N1 =+[k1 k0 k3 k2 ]
5113 ///            [k0 k2 k1 k3 ]
5114 ///       N2 =+[k1 k3 k0 k2 ]
5115 ///            [k0    k2    k1    k3    ]
5116 /// Extended =+[k1    k3    k0    k2    ]
5117 ///            [k0    k2    ]
5118 /// Extracted=+[k1    k3    ]
5119 ///
5120 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
5121   EVT VT = N->getValueType(0);
5122   SDLoc DL(N);
5123 
5124   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5125 
5126   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
5127   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
5128   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
5129   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
5130   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
5131 
5132   if (VT.is64BitVector()) {
5133     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
5134     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
5135                        DAG.getIntPtrConstant(0, DL));
5136   } else {
5137     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
5138                                     DAG.getIntPtrConstant(0, DL));
5139     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
5140   }
5141 }
5142 
5143 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5144                           const ARMSubtarget *ST) {
5145   EVT VT = N->getValueType(0);
5146 
5147   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5148   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
5149           VT == MVT::v4i16 || VT == MVT::v8i16) &&
5150          "Unexpected type for custom ctpop lowering");
5151 
5152   if (VT.getVectorElementType() == MVT::i32)
5153     return lowerCTPOP32BitElements(N, DAG);
5154   else
5155     return lowerCTPOP16BitElements(N, DAG);
5156 }
5157 
5158 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5159                           const ARMSubtarget *ST) {
5160   EVT VT = N->getValueType(0);
5161   SDLoc dl(N);
5162 
5163   if (!VT.isVector())
5164     return SDValue();
5165 
5166   // Lower vector shifts on NEON to use VSHL.
5167   assert(ST->hasNEON() && "unexpected vector shift");
5168 
5169   // Left shifts translate directly to the vshiftu intrinsic.
5170   if (N->getOpcode() == ISD::SHL)
5171     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5172                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
5173                                        MVT::i32),
5174                        N->getOperand(0), N->getOperand(1));
5175 
5176   assert((N->getOpcode() == ISD::SRA ||
5177           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
5178 
5179   // NEON uses the same intrinsics for both left and right shifts.  For
5180   // right shifts, the shift amounts are negative, so negate the vector of
5181   // shift amounts.
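  // For example, (srl X, splat(3)) becomes a vshiftu of X by splat(-3).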
5182   EVT ShiftVT = N->getOperand(1).getValueType();
5183   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
5184                                      getZeroVector(ShiftVT, DAG, dl),
5185                                      N->getOperand(1));
5186   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
5187                              Intrinsic::arm_neon_vshifts :
5188                              Intrinsic::arm_neon_vshiftu);
5189   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5190                      DAG.getConstant(vshiftInt, dl, MVT::i32),
5191                      N->getOperand(0), NegatedCount);
5192 }
5193 
5194 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5195                                 const ARMSubtarget *ST) {
5196   EVT VT = N->getValueType(0);
5197   SDLoc dl(N);
5198 
5199   // We can get here for a node like i32 = ISD::SHL i32, i64
5200   if (VT != MVT::i64)
5201     return SDValue();
5202 
5203   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
5204          "Unknown shift to lower!");
5205 
  // We only lower SRA/SRL by 1 here; all others use generic lowering.
5207   if (!isOneConstant(N->getOperand(1)))
5208     return SDValue();
5209 
5210   // If we are in thumb mode, we don't have RRX.
5211   if (ST->isThumb1Only()) return SDValue();
5212 
5213   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
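  // For example, an SRL-by-1 of 0x00000001'00000000 yields Hi = 0x00000000
  // with a carry-out of 1, and Lo = RRX(0x00000000) = 0x80000000.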
5214   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5215                            DAG.getConstant(0, dl, MVT::i32));
5216   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5217                            DAG.getConstant(1, dl, MVT::i32));
5218 
5219   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
5220   // captures the result into a carry flag.
5221   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
5222   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
5223 
5224   // The low part is an ARMISD::RRX operand, which shifts the carry in.
5225   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
5226 
5227   // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5229 }
5230 
5231 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5232   SDValue TmpOp0, TmpOp1;
5233   bool Invert = false;
5234   bool Swap = false;
5235   unsigned Opc = 0;
5236 
5237   SDValue Op0 = Op.getOperand(0);
5238   SDValue Op1 = Op.getOperand(1);
5239   SDValue CC = Op.getOperand(2);
5240   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
5241   EVT VT = Op.getValueType();
5242   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5243   SDLoc dl(Op);
5244 
5245   if (CmpVT.getVectorElementType() == MVT::i64)
5246     // 64-bit comparisons are not legal. We've marked SETCC as non-Custom,
5247     // but it's possible that our operands are 64-bit but our result is 32-bit.
5248     // Bail in this case.
5249     return SDValue();
5250 
5251   if (Op1.getValueType().isFloatingPoint()) {
5252     switch (SetCCOpcode) {
5253     default: llvm_unreachable("Illegal FP comparison");
5254     case ISD::SETUNE:
5255     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
5256     case ISD::SETOEQ:
5257     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
5258     case ISD::SETOLT:
5259     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
5260     case ISD::SETOGT:
5261     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
5262     case ISD::SETOLE:
5263     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
5264     case ISD::SETOGE:
5265     case ISD::SETGE: Opc = ARMISD::VCGE; break;
5266     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
5267     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
5268     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
5269     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
5270     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
5271     case ISD::SETONE:
5272       // Expand this to (OLT | OGT).
5273       TmpOp0 = Op0;
5274       TmpOp1 = Op1;
5275       Opc = ISD::OR;
5276       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5277       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
5278       break;
5279     case ISD::SETUO:
5280       Invert = true;
5281       LLVM_FALLTHROUGH;
5282     case ISD::SETO:
5283       // Expand this to (OLT | OGE).
5284       TmpOp0 = Op0;
5285       TmpOp1 = Op1;
5286       Opc = ISD::OR;
5287       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
5288       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
5289       break;
5290     }
5291   } else {
5292     // Integer comparisons.
5293     switch (SetCCOpcode) {
5294     default: llvm_unreachable("Illegal integer comparison");
    case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
    case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
    case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
5305     }
5306 
5307     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
5308     if (Opc == ARMISD::VCEQ) {
5309 
5310       SDValue AndOp;
5311       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5312         AndOp = Op0;
5313       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
5314         AndOp = Op1;
5315 
5316       // Ignore bitconvert.
5317       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
5318         AndOp = AndOp.getOperand(0);
5319 
5320       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
5321         Opc = ARMISD::VTST;
5322         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
5323         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
5324         Invert = !Invert;
5325       }
5326     }
5327   }
5328 
5329   if (Swap)
5330     std::swap(Op0, Op1);
5331 
5332   // If one of the operands is a constant vector zero, attempt to fold the
5333   // comparison to a specialized compare-against-zero form.
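  // For example, (setge X, 0) becomes VCGEZ(X), while (setge 0, X) becomes
  // VCLEZ(X).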
5334   SDValue SingleOp;
5335   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
5336     SingleOp = Op0;
5337   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
5338     if (Opc == ARMISD::VCGE)
5339       Opc = ARMISD::VCLEZ;
5340     else if (Opc == ARMISD::VCGT)
5341       Opc = ARMISD::VCLTZ;
5342     SingleOp = Op1;
5343   }
5344 
5345   SDValue Result;
5346   if (SingleOp.getNode()) {
5347     switch (Opc) {
5348     case ARMISD::VCEQ:
5349       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
5350     case ARMISD::VCGE:
5351       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
5352     case ARMISD::VCLEZ:
5353       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
5354     case ARMISD::VCGT:
5355       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
5356     case ARMISD::VCLTZ:
5357       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
5358     default:
5359       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5360     }
5361   } else {
    Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
5363   }
5364 
5365   Result = DAG.getSExtOrTrunc(Result, dl, VT);
5366 
5367   if (Invert)
5368     Result = DAG.getNOT(dl, Result, VT);
5369 
5370   return Result;
5371 }
5372 
5373 static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
5374   SDValue LHS = Op.getOperand(0);
5375   SDValue RHS = Op.getOperand(1);
5376   SDValue Carry = Op.getOperand(2);
5377   SDValue Cond = Op.getOperand(3);
5378   SDLoc DL(Op);
5379 
5380   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
5381 
5382   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
5383   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
5384   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
5385 
5386   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
5387   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
5388   SDValue ARMcc = DAG.getConstant(
5389       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
5390   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5391   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
5392                                    Cmp.getValue(1), SDValue());
5393   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
5394                      CCR, Chain.getValue(1));
5395 }
5396 
5397 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
5398 /// valid vector constant for a NEON instruction with a "modified immediate"
5399 /// operand (e.g., VMOV).  If so, return the encoded value.
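///
/// For example, a 32-bit splat of 0x0000ab00 is representable with Op=0,
/// Cmode=001x and Imm=0xab, so the returned encoding is 0x2ab.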
5400 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
5401                                  unsigned SplatBitSize, SelectionDAG &DAG,
5402                                  const SDLoc &dl, EVT &VT, bool is128Bits,
5403                                  NEONModImmType type) {
5404   unsigned OpCmode, Imm;
5405 
5406   // SplatBitSize is set to the smallest size that splats the vector, so a
5407   // zero vector will always have SplatBitSize == 8.  However, NEON modified
  // immediate instructions other than VMOV do not support the 8-bit encoding
5409   // of a zero vector, and the default encoding of zero is supposed to be the
5410   // 32-bit version.
5411   if (SplatBits == 0)
5412     SplatBitSize = 32;
5413 
5414   switch (SplatBitSize) {
5415   case 8:
5416     if (type != VMOVModImm)
5417       return SDValue();
5418     // Any 1-byte value is OK.  Op=0, Cmode=1110.
5419     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
5420     OpCmode = 0xe;
5421     Imm = SplatBits;
5422     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
5423     break;
5424 
5425   case 16:
5426     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
5427     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
5428     if ((SplatBits & ~0xff) == 0) {
5429       // Value = 0x00nn: Op=x, Cmode=100x.
5430       OpCmode = 0x8;
5431       Imm = SplatBits;
5432       break;
5433     }
5434     if ((SplatBits & ~0xff00) == 0) {
5435       // Value = 0xnn00: Op=x, Cmode=101x.
5436       OpCmode = 0xa;
5437       Imm = SplatBits >> 8;
5438       break;
5439     }
5440     return SDValue();
5441 
5442   case 32:
5443     // NEON's 32-bit VMOV supports splat values where:
5444     // * only one byte is nonzero, or
5445     // * the least significant byte is 0xff and the second byte is nonzero, or
5446     // * the least significant 2 bytes are 0xff and the third is nonzero.
5447     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
5448     if ((SplatBits & ~0xff) == 0) {
5449       // Value = 0x000000nn: Op=x, Cmode=000x.
5450       OpCmode = 0;
5451       Imm = SplatBits;
5452       break;
5453     }
5454     if ((SplatBits & ~0xff00) == 0) {
5455       // Value = 0x0000nn00: Op=x, Cmode=001x.
5456       OpCmode = 0x2;
5457       Imm = SplatBits >> 8;
5458       break;
5459     }
5460     if ((SplatBits & ~0xff0000) == 0) {
5461       // Value = 0x00nn0000: Op=x, Cmode=010x.
5462       OpCmode = 0x4;
5463       Imm = SplatBits >> 16;
5464       break;
5465     }
5466     if ((SplatBits & ~0xff000000) == 0) {
5467       // Value = 0xnn000000: Op=x, Cmode=011x.
5468       OpCmode = 0x6;
5469       Imm = SplatBits >> 24;
5470       break;
5471     }
5472 
5473     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
5474     if (type == OtherModImm) return SDValue();
5475 
5476     if ((SplatBits & ~0xffff) == 0 &&
5477         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
5478       // Value = 0x0000nnff: Op=x, Cmode=1100.
5479       OpCmode = 0xc;
5480       Imm = SplatBits >> 8;
5481       break;
5482     }
5483 
5484     if ((SplatBits & ~0xffffff) == 0 &&
5485         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
5486       // Value = 0x00nnffff: Op=x, Cmode=1101.
5487       OpCmode = 0xd;
5488       Imm = SplatBits >> 16;
5489       break;
5490     }
5491 
5492     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
5493     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
5494     // VMOV.I32.  A (very) minor optimization would be to replicate the value
5495     // and fall through here to test for a valid 64-bit splat.  But, then the
5496     // caller would also need to check and handle the change in size.
5497     return SDValue();
5498 
5499   case 64: {
5500     if (type != VMOVModImm)
5501       return SDValue();
5502     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
5503     uint64_t BitMask = 0xff;
5504     uint64_t Val = 0;
5505     unsigned ImmMask = 1;
5506     Imm = 0;
5507     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
5508       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
5509         Val |= BitMask;
5510         Imm |= ImmMask;
5511       } else if ((SplatBits & BitMask) != 0) {
5512         return SDValue();
5513       }
5514       BitMask <<= 8;
5515       ImmMask <<= 1;
5516     }
5517 
5518     if (DAG.getDataLayout().isBigEndian())
      // Swap the higher and lower 32-bit words.
5520       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
5521 
5522     // Op=1, Cmode=1110.
5523     OpCmode = 0x1e;
5524     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
5525     break;
5526   }
5527 
5528   default:
5529     llvm_unreachable("unexpected size for isNEONModifiedImm");
5530   }
5531 
5532   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
5533   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
5534 }
5535 
5536 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
5537                                            const ARMSubtarget *ST) const {
5538   if (!ST->hasVFP3())
5539     return SDValue();
5540 
5541   bool IsDouble = Op.getValueType() == MVT::f64;
5542   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
5543 
5544   // Use the default (constant pool) lowering for double constants when we have
5545   // an SP-only FPU
5546   if (IsDouble && Subtarget->isFPOnlySP())
5547     return SDValue();
5548 
5549   // Try splatting with a VMOV.f32...
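  // For example, 1.0f and -2.0f have valid VMOV.f32 immediate encodings,
  // whereas 0.1f does not (getFP32Imm returns -1) and is left to the checks
  // below.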
5550   const APFloat &FPVal = CFP->getValueAPF();
5551   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
5552 
5553   if (ImmVal != -1) {
5554     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
5555       // We have code in place to select a valid ConstantFP already, no need to
5556       // do any mangling.
5557       return Op;
5558     }
5559 
5560     // It's a float and we are trying to use NEON operations where
5561     // possible. Lower it to a splat followed by an extract.
5562     SDLoc DL(Op);
5563     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
5564     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
5565                                       NewVal);
5566     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
5567                        DAG.getConstant(0, DL, MVT::i32));
5568   }
5569 
  // The rest of our options are NEON-only; make sure that's allowed before
  // proceeding.
5572   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
5573     return SDValue();
5574 
5575   EVT VMovVT;
5576   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
5577 
5578   // It wouldn't really be worth bothering for doubles except for one very
5579   // important value, which does happen to match: 0.0. So make sure we don't do
5580   // anything stupid.
5581   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
5582     return SDValue();
5583 
5584   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
5585   SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
5586                                      VMovVT, false, VMOVModImm);
5587   if (NewVal != SDValue()) {
5588     SDLoc DL(Op);
5589     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
5590                                       NewVal);
5591     if (IsDouble)
5592       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5593 
5594     // It's a float: cast and extract a vector element.
5595     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5596                                        VecConstant);
5597     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5598                        DAG.getConstant(0, DL, MVT::i32));
5599   }
5600 
5601   // Finally, try a VMVN.i32
5602   NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
5603                              false, VMVNModImm);
5604   if (NewVal != SDValue()) {
5605     SDLoc DL(Op);
5606     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
5607 
5608     if (IsDouble)
5609       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
5610 
5611     // It's a float: cast and extract a vector element.
5612     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
5613                                        VecConstant);
5614     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
5615                        DAG.getConstant(0, DL, MVT::i32));
5616   }
5617 
5618   return SDValue();
5619 }
5620 
// Check if a VEXT instruction can handle the shuffle mask when the two
// vector sources of the shuffle are the same.
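// For example, for v8i8 the mask <3,4,5,6,7,0,1,2> (with both sources equal)
// is a VEXT with Imm = 3.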
5623 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5624   unsigned NumElts = VT.getVectorNumElements();
5625 
5626   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5627   if (M[0] < 0)
5628     return false;
5629 
5630   Imm = M[0];
5631 
5632   // If this is a VEXT shuffle, the immediate value is the index of the first
5633   // element.  The other shuffle indices must be the successive elements after
5634   // the first one.
5635   unsigned ExpectedElt = Imm;
5636   for (unsigned i = 1; i < NumElts; ++i) {
5637     // Increment the expected index.  If it wraps around, just follow it
5638     // back to index zero and keep going.
5639     ++ExpectedElt;
5640     if (ExpectedElt == NumElts)
5641       ExpectedElt = 0;
5642 
5643     if (M[i] < 0) continue; // ignore UNDEF indices
5644     if (ExpectedElt != static_cast<unsigned>(M[i]))
5645       return false;
5646   }
5647 
5648   return true;
5649 }
5650 
5651 
5652 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
5653                        bool &ReverseVEXT, unsigned &Imm) {
5654   unsigned NumElts = VT.getVectorNumElements();
5655   ReverseVEXT = false;
5656 
5657   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5658   if (M[0] < 0)
5659     return false;
5660 
5661   Imm = M[0];
5662 
5663   // If this is a VEXT shuffle, the immediate value is the index of the first
5664   // element.  The other shuffle indices must be the successive elements after
5665   // the first one.
5666   unsigned ExpectedElt = Imm;
5667   for (unsigned i = 1; i < NumElts; ++i) {
5668     // Increment the expected index.  If it wraps around, it may still be
5669     // a VEXT but the source vectors must be swapped.
5670     ExpectedElt += 1;
5671     if (ExpectedElt == NumElts * 2) {
5672       ExpectedElt = 0;
5673       ReverseVEXT = true;
5674     }
5675 
5676     if (M[i] < 0) continue; // ignore UNDEF indices
5677     if (ExpectedElt != static_cast<unsigned>(M[i]))
5678       return false;
5679   }
5680 
5681   // Adjust the index value if the source operands will be swapped.
5682   if (ReverseVEXT)
5683     Imm -= NumElts;
5684 
5685   return true;
5686 }
5687 
5688 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
5689 /// instruction with the specified blocksize.  (The order of the elements
5690 /// within each block of the vector is reversed.)
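///
/// For example, for v8i8 with BlockSize == 32 the mask <3,2,1,0,7,6,5,4>
/// is a VREV32 mask.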
5691 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5692   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
5693          "Only possible block sizes for VREV are: 16, 32, 64");
5694 
5695   unsigned EltSz = VT.getScalarSizeInBits();
5696   if (EltSz == 64)
5697     return false;
5698 
5699   unsigned NumElts = VT.getVectorNumElements();
5700   unsigned BlockElts = M[0] + 1;
5701   // If the first shuffle index is UNDEF, be optimistic.
5702   if (M[0] < 0)
5703     BlockElts = BlockSize / EltSz;
5704 
5705   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5706     return false;
5707 
5708   for (unsigned i = 0; i < NumElts; ++i) {
5709     if (M[i] < 0) continue; // ignore UNDEF indices
5710     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
5711       return false;
5712   }
5713 
5714   return true;
5715 }
5716 
5717 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
5718   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
5719   // range, then 0 is placed into the resulting vector. So pretty much any mask
5720   // of 8 elements can work here.
5721   return VT == MVT::v8i8 && M.size() == 8;
5722 }
5723 
5724 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
5725 // checking that pairs of elements in the shuffle mask represent the same index
5726 // in each vector, incrementing the expected index by 2 at each step.
5727 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
5728 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
5729 //  v2={e,f,g,h}
5730 // WhichResult gives the offset for each element in the mask based on which
5731 // of the two results it belongs to.
5732 //
5733 // The transpose can be represented either as:
5734 // result1 = shufflevector v1, v2, result1_shuffle_mask
5735 // result2 = shufflevector v1, v2, result2_shuffle_mask
5736 // where v1/v2 and the shuffle masks have the same number of elements
5737 // (here WhichResult (see below) indicates which result is being checked)
5738 //
5739 // or as:
5740 // results = shufflevector v1, v2, shuffle_mask
5741 // where both results are returned in one vector and the shuffle mask has twice
// as many elements as v1/v2 (here WhichResult will always be 0 if true); in
// this case we check the low half and the high half of the shuffle mask as if
// it were the other case.
5745 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5746   unsigned EltSz = VT.getScalarSizeInBits();
5747   if (EltSz == 64)
5748     return false;
5749 
5750   unsigned NumElts = VT.getVectorNumElements();
5751   if (M.size() != NumElts && M.size() != NumElts*2)
5752     return false;
5753 
5754   // If the mask is twice as long as the input vector then we need to check the
5755   // upper and lower parts of the mask with a matching value for WhichResult
5756   // FIXME: A mask with only even values will be rejected in case the first
5757   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
5758   // M[0] is used to determine WhichResult
5759   for (unsigned i = 0; i < M.size(); i += NumElts) {
5760     if (M.size() == NumElts * 2)
5761       WhichResult = i / NumElts;
5762     else
5763       WhichResult = M[i] == 0 ? 0 : 1;
5764     for (unsigned j = 0; j < NumElts; j += 2) {
5765       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5766           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
5767         return false;
5768     }
5769   }
5770 
5771   if (M.size() == NumElts*2)
5772     WhichResult = 0;
5773 
5774   return true;
5775 }
5776 
5777 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
5778 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5779 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
5780 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5781   unsigned EltSz = VT.getScalarSizeInBits();
5782   if (EltSz == 64)
5783     return false;
5784 
5785   unsigned NumElts = VT.getVectorNumElements();
5786   if (M.size() != NumElts && M.size() != NumElts*2)
5787     return false;
5788 
5789   for (unsigned i = 0; i < M.size(); i += NumElts) {
5790     if (M.size() == NumElts * 2)
5791       WhichResult = i / NumElts;
5792     else
5793       WhichResult = M[i] == 0 ? 0 : 1;
5794     for (unsigned j = 0; j < NumElts; j += 2) {
5795       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
5796           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
5797         return false;
5798     }
5799   }
5800 
5801   if (M.size() == NumElts*2)
5802     WhichResult = 0;
5803 
5804   return true;
5805 }
5806 
5807 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
5808 // that the mask elements are either all even and in steps of size 2 or all odd
5809 // and in steps of size 2.
5810 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
5811 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
5812 //  v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
5815 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5816   unsigned EltSz = VT.getScalarSizeInBits();
5817   if (EltSz == 64)
5818     return false;
5819 
5820   unsigned NumElts = VT.getVectorNumElements();
5821   if (M.size() != NumElts && M.size() != NumElts*2)
5822     return false;
5823 
5824   for (unsigned i = 0; i < M.size(); i += NumElts) {
5825     WhichResult = M[i] == 0 ? 0 : 1;
5826     for (unsigned j = 0; j < NumElts; ++j) {
5827       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
5828         return false;
5829     }
5830   }
5831 
5832   if (M.size() == NumElts*2)
5833     WhichResult = 0;
5834 
5835   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5836   if (VT.is64BitVector() && EltSz == 32)
5837     return false;
5838 
5839   return true;
5840 }
5841 
5842 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
5843 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
5845 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5846   unsigned EltSz = VT.getScalarSizeInBits();
5847   if (EltSz == 64)
5848     return false;
5849 
5850   unsigned NumElts = VT.getVectorNumElements();
5851   if (M.size() != NumElts && M.size() != NumElts*2)
5852     return false;
5853 
5854   unsigned Half = NumElts / 2;
5855   for (unsigned i = 0; i < M.size(); i += NumElts) {
5856     WhichResult = M[i] == 0 ? 0 : 1;
5857     for (unsigned j = 0; j < NumElts; j += Half) {
5858       unsigned Idx = WhichResult;
5859       for (unsigned k = 0; k < Half; ++k) {
5860         int MIdx = M[i + j + k];
5861         if (MIdx >= 0 && (unsigned) MIdx != Idx)
5862           return false;
5863         Idx += 2;
5864       }
5865     }
5866   }
5867 
5868   if (M.size() == NumElts*2)
5869     WhichResult = 0;
5870 
5871   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5872   if (VT.is64BitVector() && EltSz == 32)
5873     return false;
5874 
5875   return true;
5876 }
5877 
5878 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
5879 // that pairs of elements of the shufflemask represent the same index in each
5880 // vector incrementing sequentially through the vectors.
5881 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
5882 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
5883 //  v2={e,f,g,h}
// Requires checks similar to those of isVTRNMask with respect to how the
// results are returned.
5886 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5887   unsigned EltSz = VT.getScalarSizeInBits();
5888   if (EltSz == 64)
5889     return false;
5890 
5891   unsigned NumElts = VT.getVectorNumElements();
5892   if (M.size() != NumElts && M.size() != NumElts*2)
5893     return false;
5894 
5895   for (unsigned i = 0; i < M.size(); i += NumElts) {
5896     WhichResult = M[i] == 0 ? 0 : 1;
5897     unsigned Idx = WhichResult * NumElts / 2;
5898     for (unsigned j = 0; j < NumElts; j += 2) {
5899       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5900           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
5901         return false;
5902       Idx += 1;
5903     }
5904   }
5905 
5906   if (M.size() == NumElts*2)
5907     WhichResult = 0;
5908 
5909   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5910   if (VT.is64BitVector() && EltSz == 32)
5911     return false;
5912 
5913   return true;
5914 }
5915 
5916 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
5917 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5918 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5919 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
5920   unsigned EltSz = VT.getScalarSizeInBits();
5921   if (EltSz == 64)
5922     return false;
5923 
5924   unsigned NumElts = VT.getVectorNumElements();
5925   if (M.size() != NumElts && M.size() != NumElts*2)
5926     return false;
5927 
5928   for (unsigned i = 0; i < M.size(); i += NumElts) {
5929     WhichResult = M[i] == 0 ? 0 : 1;
5930     unsigned Idx = WhichResult * NumElts / 2;
5931     for (unsigned j = 0; j < NumElts; j += 2) {
5932       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
5933           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
5934         return false;
5935       Idx += 1;
5936     }
5937   }
5938 
5939   if (M.size() == NumElts*2)
5940     WhichResult = 0;
5941 
5942   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
5943   if (VT.is64BitVector() && EltSz == 32)
5944     return false;
5945 
5946   return true;
5947 }
5948 
5949 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
5950 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
5951 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
5952                                            unsigned &WhichResult,
5953                                            bool &isV_UNDEF) {
5954   isV_UNDEF = false;
5955   if (isVTRNMask(ShuffleMask, VT, WhichResult))
5956     return ARMISD::VTRN;
5957   if (isVUZPMask(ShuffleMask, VT, WhichResult))
5958     return ARMISD::VUZP;
5959   if (isVZIPMask(ShuffleMask, VT, WhichResult))
5960     return ARMISD::VZIP;
5961 
5962   isV_UNDEF = true;
5963   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5964     return ARMISD::VTRN;
5965   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5966     return ARMISD::VUZP;
5967   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5968     return ARMISD::VZIP;
5969 
5970   return 0;
5971 }
5972 
/// \return true if this is a reverse operation on a vector.
5974 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
5975   unsigned NumElts = VT.getVectorNumElements();
5976   // Make sure the mask has the right size.
5977   if (NumElts != M.size())
5978       return false;
5979 
5980   // Look for <15, ..., 3, -1, 1, 0>.
5981   for (unsigned i = 0; i != NumElts; ++i)
5982     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
5983       return false;
5984 
5985   return true;
5986 }
5987 
5988 // If N is an integer constant that can be moved into a register in one
5989 // instruction, return an SDValue of such a constant (will become a MOV
5990 // instruction).  Otherwise return null.
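// For example, in ARM mode any so_imm (a rotated 8-bit value such as
// 0x00ff0000) or the complement of one qualifies.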
5991 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
5992                                      const ARMSubtarget *ST, const SDLoc &dl) {
5993   uint64_t Val;
5994   if (!isa<ConstantSDNode>(N))
5995     return SDValue();
5996   Val = cast<ConstantSDNode>(N)->getZExtValue();
5997 
5998   if (ST->isThumb1Only()) {
5999     if (Val <= 255 || ~Val <= 255)
6000       return DAG.getConstant(Val, dl, MVT::i32);
6001   } else {
6002     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
6003       return DAG.getConstant(Val, dl, MVT::i32);
6004   }
6005   return SDValue();
6006 }
6007 
6008 // If this is a case we can't handle, return null and let the default
6009 // expansion code take care of it.
6010 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
6011                                              const ARMSubtarget *ST) const {
6012   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
6013   SDLoc dl(Op);
6014   EVT VT = Op.getValueType();
6015 
6016   APInt SplatBits, SplatUndef;
6017   unsigned SplatBitSize;
6018   bool HasAnyUndefs;
6019   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
6020     if (SplatBitSize <= 64) {
6021       // Check if an immediate VMOV works.
6022       EVT VmovVT;
6023       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
6024                                       SplatUndef.getZExtValue(), SplatBitSize,
6025                                       DAG, dl, VmovVT, VT.is128BitVector(),
6026                                       VMOVModImm);
6027       if (Val.getNode()) {
6028         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
6029         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6030       }
6031 
6032       // Try an immediate VMVN.
6033       uint64_t NegatedImm = (~SplatBits).getZExtValue();
6034       Val = isNEONModifiedImm(NegatedImm,
6035                                       SplatUndef.getZExtValue(), SplatBitSize,
6036                                       DAG, dl, VmovVT, VT.is128BitVector(),
6037                                       VMVNModImm);
6038       if (Val.getNode()) {
6039         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
6040         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6041       }
6042 
6043       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
6044       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
6045         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
6046         if (ImmVal != -1) {
6047           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
6048           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
6049         }
6050       }
6051     }
6052   }
6053 
6054   // Scan through the operands to see if only one value is used.
6055   //
6056   // As an optimisation, even if more than one value is used it may be more
6057   // profitable to splat with one value then change some lanes.
6058   //
6059   // Heuristically we decide to do this if the vector has a "dominant" value,
6060   // defined as splatted to more than half of the lanes.
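  // For example, a non-constant v4i32 <a, a, a, b> has dominant value 'a' and
  // is lowered below as a splat of 'a' (VDUP or VDUPLANE) followed by an
  // INSERT_VECTOR_ELT of 'b' into lane 3.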
6061   unsigned NumElts = VT.getVectorNumElements();
6062   bool isOnlyLowElement = true;
6063   bool usesOnlyOneValue = true;
6064   bool hasDominantValue = false;
6065   bool isConstant = true;
6066 
6067   // Map of the number of times a particular SDValue appears in the
6068   // element list.
6069   DenseMap<SDValue, unsigned> ValueCounts;
6070   SDValue Value;
6071   for (unsigned i = 0; i < NumElts; ++i) {
6072     SDValue V = Op.getOperand(i);
6073     if (V.isUndef())
6074       continue;
6075     if (i > 0)
6076       isOnlyLowElement = false;
6077     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
6078       isConstant = false;
6079 
6080     ValueCounts.insert(std::make_pair(V, 0));
6081     unsigned &Count = ValueCounts[V];
6082 
6083     // Is this value dominant? (takes up more than half of the lanes)
6084     if (++Count > (NumElts / 2)) {
6085       hasDominantValue = true;
6086       Value = V;
6087     }
6088   }
6089   if (ValueCounts.size() != 1)
6090     usesOnlyOneValue = false;
6091   if (!Value.getNode() && ValueCounts.size() > 0)
6092     Value = ValueCounts.begin()->first;
6093 
6094   if (ValueCounts.size() == 0)
6095     return DAG.getUNDEF(VT);
6096 
6097   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we hit this case.
6099   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
6100     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
6101 
6102   unsigned EltSize = VT.getScalarSizeInBits();
6103 
6104   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
6105   // i32 and try again.
6106   if (hasDominantValue && EltSize <= 32) {
6107     if (!isConstant) {
6108       SDValue N;
6109 
6110       // If we are VDUPing a value that comes directly from a vector, that will
6111       // cause an unnecessary move to and from a GPR, where instead we could
6112       // just use VDUPLANE. We can only do this if the lane being extracted
6113       // is at a constant index, as the VDUP from lane instructions only have
6114       // constant-index forms.
6115       ConstantSDNode *constIndex;
6116       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6117           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
6118         // We need to create a new undef vector to use for the VDUPLANE if the
6119         // size of the vector from which we get the value is different than the
6120         // size of the vector that we need to create. We will insert the element
6121         // such that the register coalescer will remove unnecessary copies.
6122         if (VT != Value->getOperand(0).getValueType()) {
6123           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
6124                              VT.getVectorNumElements();
6125           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6126                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
6127                         Value, DAG.getConstant(index, dl, MVT::i32)),
6128                            DAG.getConstant(index, dl, MVT::i32));
6129         } else
6130           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6131                         Value->getOperand(0), Value->getOperand(1));
6132       } else
6133         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
6134 
6135       if (!usesOnlyOneValue) {
6136         // The dominant value was splatted as 'N', but we now have to insert
6137         // all differing elements.
6138         for (unsigned I = 0; I < NumElts; ++I) {
6139           if (Op.getOperand(I) == Value)
6140             continue;
6141           SmallVector<SDValue, 3> Ops;
6142           Ops.push_back(N);
6143           Ops.push_back(Op.getOperand(I));
6144           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
6145           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
6146         }
6147       }
6148       return N;
6149     }
6150     if (VT.getVectorElementType().isFloatingPoint()) {
6151       SmallVector<SDValue, 8> Ops;
6152       for (unsigned i = 0; i < NumElts; ++i)
6153         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
6154                                   Op.getOperand(i)));
6155       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
6156       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
6157       Val = LowerBUILD_VECTOR(Val, DAG, ST);
6158       if (Val.getNode())
6159         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6160     }
6161     if (usesOnlyOneValue) {
6162       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
6163       if (isConstant && Val.getNode())
6164         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
6165     }
6166   }
6167 
6168   // If all elements are constants and the case above didn't get hit, fall back
6169   // to the default expansion, which will generate a load from the constant
6170   // pool.
6171   if (isConstant)
6172     return SDValue();
6173 
6174   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
6175   if (NumElts >= 4) {
6176     SDValue shuffle = ReconstructShuffle(Op, DAG);
6177     if (shuffle != SDValue())
6178       return shuffle;
6179   }
6180 
6181   // Vectors with 32- or 64-bit elements can be built by directly assigning
6182   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
6183   // will be legalized.
6184   if (EltSize >= 32) {
6185     // Do the expansion with floating-point types, since that is what the VFP
6186     // registers are defined to use, and since i64 is not legal.
6187     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6188     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6189     SmallVector<SDValue, 8> Ops;
6190     for (unsigned i = 0; i < NumElts; ++i)
6191       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
6192     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6193     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6194   }
6195 
6196   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
6197   // know the default expansion would otherwise fall back on something even
6198   // worse. For a vector with one or two non-undef values, that's
6199   // scalar_to_vector for the elements followed by a shuffle (provided the
6200   // shuffle is valid for the target) and materialization element by element
6201   // on the stack followed by a load for everything else.
6202   if (!isConstant && !usesOnlyOneValue) {
6203     SDValue Vec = DAG.getUNDEF(VT);
6204     for (unsigned i = 0 ; i < NumElts; ++i) {
6205       SDValue V = Op.getOperand(i);
6206       if (V.isUndef())
6207         continue;
6208       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
6209       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
6210     }
6211     return Vec;
6212   }
6213 
6214   return SDValue();
6215 }
6216 
6217 // Gather data to see if the operation can be modelled as a
6218 // shuffle in combination with VEXTs.
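// For example, a BUILD_VECTOR taking elements <2, 3> of one v4i32 source and
// then elements <0, 1> of another may be reconstructed as a single two-source
// shuffle, which can later be matched to a VEXT.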
6219 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
6220                                               SelectionDAG &DAG) const {
6221   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
6222   SDLoc dl(Op);
6223   EVT VT = Op.getValueType();
6224   unsigned NumElts = VT.getVectorNumElements();
6225 
6226   struct ShuffleSourceInfo {
6227     SDValue Vec;
6228     unsigned MinElt;
6229     unsigned MaxElt;
6230 
6231     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
6232     // be compatible with the shuffle we intend to construct. As a result
6233     // ShuffleVec will be some sliding window into the original Vec.
6234     SDValue ShuffleVec;
6235 
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
6238     int WindowBase;
6239     int WindowScale;
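
    // For example, if Vec is a v4i32 source that is later bitcast to v8i16
    // to match the shuffle type, WindowScale becomes 2 and element i of Vec
    // occupies ShuffleVec lanes 2*i and 2*i+1 (WindowBase stays 0 unless a
    // VEXT below shifts the window).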
6240 
6241     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
6242     ShuffleSourceInfo(SDValue Vec)
6243         : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
6244           WindowScale(1) {}
6245   };
6246 
6247   // First gather all vectors used as an immediate source for this BUILD_VECTOR
6248   // node.
6249   SmallVector<ShuffleSourceInfo, 2> Sources;
6250   for (unsigned i = 0; i < NumElts; ++i) {
6251     SDValue V = Op.getOperand(i);
6252     if (V.isUndef())
6253       continue;
6254     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
6255       // A shuffle can only come from building a vector from various
6256       // elements of other vectors.
6257       return SDValue();
6258     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
6259       // Furthermore, shuffles require a constant mask, whereas extractelts
6260       // accept variable indices.
6261       return SDValue();
6262     }
6263 
6264     // Add this element source to the list if it's not already there.
6265     SDValue SourceVec = V.getOperand(0);
6266     auto Source = find(Sources, SourceVec);
6267     if (Source == Sources.end())
6268       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
6269 
6270     // Update the minimum and maximum lane number seen.
6271     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
6272     Source->MinElt = std::min(Source->MinElt, EltNo);
6273     Source->MaxElt = std::max(Source->MaxElt, EltNo);
6274   }
6275 
6276   // Currently only do something sane when at most two source vectors
6277   // are involved.
6278   if (Sources.size() > 2)
6279     return SDValue();
6280 
6281   // Find out the smallest element size among result and two sources, and use
6282   // it as element size to build the shuffle_vector.
6283   EVT SmallestEltTy = VT.getVectorElementType();
6284   for (auto &Source : Sources) {
6285     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
6286     if (SrcEltTy.bitsLT(SmallestEltTy))
6287       SmallestEltTy = SrcEltTy;
6288   }
6289   unsigned ResMultiplier =
6290       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
6291   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
6292   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
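  // For example, building a v4i16 result from one v4i16 source and one v8i8
  // source gives SmallestEltTy == i8, ResMultiplier == 2 and an 8-lane v8i8
  // ShuffleVT.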
6293 
6294   // If the source vector is too wide or too narrow, we may nevertheless be able
6295   // to construct a compatible shuffle either by concatenating it with UNDEF or
6296   // extracting a suitable range of elements.
6297   for (auto &Src : Sources) {
6298     EVT SrcVT = Src.ShuffleVec.getValueType();
6299 
6300     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
6301       continue;
6302 
6303     // This stage of the search produces a source with the same element type as
6304     // the original, but with a total width matching the BUILD_VECTOR output.
6305     EVT EltVT = SrcVT.getVectorElementType();
6306     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
6307     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
6308 
6309     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
6310       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
6311         return SDValue();
      // We can pad out the smaller vector for free by concatenating it with
      // UNDEF, so just widen it here and carry on.
6314       Src.ShuffleVec =
6315           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
6316                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
6317       continue;
6318     }
6319 
6320     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
6321       return SDValue();
6322 
6323     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a single VEXT to cope with.
6325       return SDValue();
6326     }
6327 
6328     if (Src.MinElt >= NumSrcElts) {
6329       // The extraction can just take the second half
6330       Src.ShuffleVec =
6331           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6332                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6333       Src.WindowBase = -NumSrcElts;
6334     } else if (Src.MaxElt < NumSrcElts) {
6335       // The extraction can just take the first half
6336       Src.ShuffleVec =
6337           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6338                       DAG.getConstant(0, dl, MVT::i32));
6339     } else {
6340       // An actual VEXT is needed
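      // For example, with a v8i16 source feeding a v4i16-sized window and
      // MinElt == 3, the VEXT of the two extracted halves selects lanes
      // <3,4,5,6>; WindowBase is then -3 so that source lane MinElt maps
      // back to shuffle lane 0.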
6341       SDValue VEXTSrc1 =
6342           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6343                       DAG.getConstant(0, dl, MVT::i32));
6344       SDValue VEXTSrc2 =
6345           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
6346                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
6347 
6348       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
6349                                    VEXTSrc2,
6350                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
6351       Src.WindowBase = -Src.MinElt;
6352     }
6353   }
6354 
6355   // Another possible incompatibility occurs from the vector element types. We
6356   // can fix this by bitcasting the source vectors to the same type we intend
6357   // for the shuffle.
6358   for (auto &Src : Sources) {
6359     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
6360     if (SrcEltTy == SmallestEltTy)
6361       continue;
6362     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
6363     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
6364     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
6365     Src.WindowBase *= Src.WindowScale;
6366   }
6367 
6368   // Final sanity check before we try to actually produce a shuffle.
6369   DEBUG(
6370     for (auto Src : Sources)
6371       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
6372   );
6373 
6374   // The stars all align, our next step is to produce the mask for the shuffle.
6375   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
6376   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
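  // For example, an operand that extracts element 2 of the second source
  // (WindowScale 1, WindowBase 0) fills its lanes with indices starting at
  // NumElts + 2, i.e. it selects from the second shuffle operand.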
6377   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
6378     SDValue Entry = Op.getOperand(i);
6379     if (Entry.isUndef())
6380       continue;
6381 
6382     auto Src = find(Sources, Entry.getOperand(0));
6383     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
6384 
    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an
    // implicit trunc. So only the low std::min(SrcBits, DestBits) bits of the
    // element actually get defined in this segment.
6388     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
6389     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
6390                                VT.getScalarSizeInBits());
6391     int LanesDefined = BitsDefined / BitsPerShuffleLane;
6392 
6393     // This source is expected to fill ResMultiplier lanes of the final shuffle,
6394     // starting at the appropriate offset.
6395     int *LaneMask = &Mask[i * ResMultiplier];
6396 
6397     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
6398     ExtractBase += NumElts * (Src - Sources.begin());
6399     for (int j = 0; j < LanesDefined; ++j)
6400       LaneMask[j] = ExtractBase + j;
6401   }
6402 
6403   // Final check before we try to produce nonsense...
6404   if (!isShuffleMaskLegal(Mask, ShuffleVT))
6405     return SDValue();
6406 
6407   // We can't handle more than two sources. This should have already
6408   // been checked before this point.
6409   assert(Sources.size() <= 2 && "Too many sources!");
6410 
6411   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
6412   for (unsigned i = 0; i < Sources.size(); ++i)
6413     ShuffleOps[i] = Sources[i].ShuffleVec;
6414 
6415   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
6416                                          ShuffleOps[1], Mask);
6417   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
6418 }
6419 
6420 /// isShuffleMaskLegal - Targets can use this to indicate that they only
6421 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6422 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6423 /// are assumed to be legal.
6424 bool
6425 ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
6426                                       EVT VT) const {
6427   if (VT.getVectorNumElements() == 4 &&
6428       (VT.is128BitVector() || VT.is64BitVector())) {
6429     unsigned PFIndexes[4];
6430     for (unsigned i = 0; i != 4; ++i) {
6431       if (M[i] < 0)
6432         PFIndexes[i] = 8;
6433       else
6434         PFIndexes[i] = M[i];
6435     }
6436 
6437     // Compute the index in the perfect shuffle table.
6438     unsigned PFTableIndex =
6439       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
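    // Each mask entry is a base-9 digit (0-7 for a lane, 8 for undef), so
    // the four digits form an index into the 9^4-entry table; the top two
    // bits of each entry encode the cost of the best known expansion.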
6440     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6441     unsigned Cost = (PFEntry >> 30);
6442 
6443     if (Cost <= 4)
6444       return true;
6445   }
6446 
6447   bool ReverseVEXT, isV_UNDEF;
6448   unsigned Imm, WhichResult;
6449 
6450   unsigned EltSize = VT.getScalarSizeInBits();
6451   return (EltSize >= 32 ||
6452           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
6453           isVREVMask(M, VT, 64) ||
6454           isVREVMask(M, VT, 32) ||
6455           isVREVMask(M, VT, 16) ||
6456           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
6457           isVTBLMask(M, VT) ||
6458           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
6459           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
6460 }
6461 
6462 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
6463 /// the specified operations to build the shuffle.
6464 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
6465                                       SDValue RHS, SelectionDAG &DAG,
6466                                       const SDLoc &dl) {
6467   unsigned OpNum = (PFEntry >> 26) & 0x0F;
6468   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
6469   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
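  // Decoded entry layout: bits [31:30] hold the cost, [29:26] the opcode
  // from the enum below, and [25:13]/[12:0] the base-9 encoded masks that
  // describe how to build the left and right operands recursively.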
6470 
6471   enum {
6472     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
6473     OP_VREV,
6474     OP_VDUP0,
6475     OP_VDUP1,
6476     OP_VDUP2,
6477     OP_VDUP3,
6478     OP_VEXT1,
6479     OP_VEXT2,
6480     OP_VEXT3,
6481     OP_VUZPL, // VUZP, left result
6482     OP_VUZPR, // VUZP, right result
6483     OP_VZIPL, // VZIP, left result
6484     OP_VZIPR, // VZIP, right result
6485     OP_VTRNL, // VTRN, left result
6486     OP_VTRNR  // VTRN, right result
6487   };
6488 
6489   if (OpNum == OP_COPY) {
6490     if (LHSID == (1*9+2)*9+3) return LHS;
6491     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
6492     return RHS;
6493   }
6494 
6495   SDValue OpLHS, OpRHS;
6496   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
6497   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
6498   EVT VT = OpLHS.getValueType();
6499 
6500   switch (OpNum) {
6501   default: llvm_unreachable("Unknown shuffle opcode!");
6502   case OP_VREV:
6503     // VREV divides the vector in half and swaps within the half.
6504     if (VT.getVectorElementType() == MVT::i32 ||
6505         VT.getVectorElementType() == MVT::f32)
6506       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
6507     // vrev <4 x i16> -> VREV32
6508     if (VT.getVectorElementType() == MVT::i16)
6509       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
6510     // vrev <4 x i8> -> VREV16
6511     assert(VT.getVectorElementType() == MVT::i8);
6512     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
6513   case OP_VDUP0:
6514   case OP_VDUP1:
6515   case OP_VDUP2:
6516   case OP_VDUP3:
6517     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6518                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
6519   case OP_VEXT1:
6520   case OP_VEXT2:
6521   case OP_VEXT3:
6522     return DAG.getNode(ARMISD::VEXT, dl, VT,
6523                        OpLHS, OpRHS,
6524                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
6525   case OP_VUZPL:
6526   case OP_VUZPR:
6527     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
6528                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
6529   case OP_VZIPL:
6530   case OP_VZIPR:
6531     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
6532                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
6533   case OP_VTRNL:
6534   case OP_VTRNR:
6535     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
6536                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
6537   }
6538 }
6539 
6540 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
6541                                        ArrayRef<int> ShuffleMask,
6542                                        SelectionDAG &DAG) {
6543   // Check to see if we can use the VTBL instruction.
6544   SDValue V1 = Op.getOperand(0);
6545   SDValue V2 = Op.getOperand(1);
6546   SDLoc DL(Op);
6547 
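  // The shuffle indices already address the concatenation of V1 and V2
  // (0..15), which matches how VTBL interprets its table: VTBL1 selects
  // bytes from the single 8-byte register V1, VTBL2 from the 16-byte pair
  // V1:V2.  Undef lanes become out-of-range indices, for which VTBL yields
  // zero.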
6548   SmallVector<SDValue, 8> VTBLMask;
  for (int I : ShuffleMask)
    VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));
6552 
  if (V2.isUndef())
6554     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
6555                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6556 
6557   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
6558                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
6559 }
6560 
6561 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
6562                                                       SelectionDAG &DAG) {
6563   SDLoc DL(Op);
6564   SDValue OpLHS = Op.getOperand(0);
6565   EVT VT = OpLHS.getValueType();
6566 
6567   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
6568          "Expect an v8i16/v16i8 type");
6569   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: after the VREV64 we have <7, 6, ..., 0, 15, 14, ..., 8>.
  // Now extract the first 8 bytes into the top double word and the last 8
  // bytes into the bottom double word to get the fully reversed vector.  The
  // v8i16 case is similar.
6573   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
6574   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
6575                      DAG.getConstant(ExtractNum, DL, MVT::i32));
6576 }
6577 
6578 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
6579   SDValue V1 = Op.getOperand(0);
6580   SDValue V2 = Op.getOperand(1);
6581   SDLoc dl(Op);
6582   EVT VT = Op.getValueType();
6583   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
6584 
6585   // Convert shuffles that are directly supported on NEON to target-specific
6586   // DAG nodes, instead of keeping them as shuffles and matching them again
6587   // during code selection.  This is more efficient and avoids the possibility
6588   // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same size so that they get CSEd properly.
6591   ArrayRef<int> ShuffleMask = SVN->getMask();
6592 
6593   unsigned EltSize = VT.getScalarSizeInBits();
6594   if (EltSize <= 32) {
6595     if (SVN->isSplat()) {
6596       int Lane = SVN->getSplatIndex();
      // If the splat lane is undef, treat it as a splat of lane 0 so it can
      // still be generated via VDUP/VDUPLANE below.
6598       if (Lane == -1) Lane = 0;
6599 
6600       // Test if V1 is a SCALAR_TO_VECTOR.
6601       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
6602         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
6603       }
6604       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
6605       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
6606       // reaches it).
6607       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
6608           !isa<ConstantSDNode>(V1.getOperand(0))) {
6609         bool IsScalarToVector = true;
6610         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
6611           if (!V1.getOperand(i).isUndef()) {
6612             IsScalarToVector = false;
6613             break;
6614           }
6615         if (IsScalarToVector)
6616           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
6617       }
6618       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
6619                          DAG.getConstant(Lane, dl, MVT::i32));
6620     }
6621 
6622     bool ReverseVEXT;
6623     unsigned Imm;
6624     if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
6625       if (ReverseVEXT)
6626         std::swap(V1, V2);
6627       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
6628                          DAG.getConstant(Imm, dl, MVT::i32));
6629     }
6630 
6631     if (isVREVMask(ShuffleMask, VT, 64))
6632       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
6633     if (isVREVMask(ShuffleMask, VT, 32))
6634       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
6635     if (isVREVMask(ShuffleMask, VT, 16))
6636       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
6637 
6638     if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
6639       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
6640                          DAG.getConstant(Imm, dl, MVT::i32));
6641     }
6642 
6643     // Check for Neon shuffles that modify both input vectors in place.
6644     // If both results are used, i.e., if there are two shuffles with the same
6645     // source operands and with masks corresponding to both results of one of
6646     // these operations, DAG memoization will ensure that a single node is
6647     // used for both shuffles.
6648     unsigned WhichResult;
6649     bool isV_UNDEF;
6650     if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6651             ShuffleMask, VT, WhichResult, isV_UNDEF)) {
6652       if (isV_UNDEF)
6653         V2 = V1;
6654       return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
6655           .getValue(WhichResult);
6656     }
6657 
6658     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
6659     // shuffles that produce a result larger than their operands with:
6660     //   shuffle(concat(v1, undef), concat(v2, undef))
6661     // ->
6662     //   shuffle(concat(v1, v2), undef)
6663     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
6664     //
6665     // This is useful in the general case, but there are special cases where
6666     // native shuffles produce larger results: the two-result ops.
6667     //
6668     // Look through the concat when lowering them:
6669     //   shuffle(concat(v1, v2), undef)
6670     // ->
6671     //   concat(VZIP(v1, v2):0, :1)
6672     //
6673     if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
6674       SDValue SubV1 = V1->getOperand(0);
6675       SDValue SubV2 = V1->getOperand(1);
6676       EVT SubVT = SubV1.getValueType();
6677 
6678       // We expect these to have been canonicalized to -1.
6679       assert(all_of(ShuffleMask, [&](int i) {
6680         return i < (int)VT.getVectorNumElements();
6681       }) && "Unexpected shuffle index into UNDEF operand!");
6682 
6683       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
6684               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
6685         if (isV_UNDEF)
6686           SubV2 = SubV1;
6687         assert((WhichResult == 0) &&
6688                "In-place shuffle of concat can only have one result!");
6689         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
6690                                   SubV1, SubV2);
6691         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
6692                            Res.getValue(1));
6693       }
6694     }
6695   }
6696 
6697   // If the shuffle is not directly supported and it has 4 elements, use
6698   // the PerfectShuffle-generated table to synthesize it from other shuffles.
6699   unsigned NumElts = VT.getVectorNumElements();
6700   if (NumElts == 4) {
6701     unsigned PFIndexes[4];
6702     for (unsigned i = 0; i != 4; ++i) {
6703       if (ShuffleMask[i] < 0)
6704         PFIndexes[i] = 8;
6705       else
6706         PFIndexes[i] = ShuffleMask[i];
6707     }
6708 
6709     // Compute the index in the perfect shuffle table.
6710     unsigned PFTableIndex =
6711       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6712     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6713     unsigned Cost = (PFEntry >> 30);
6714 
6715     if (Cost <= 4)
6716       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
6717   }
6718 
6719   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
6720   if (EltSize >= 32) {
6721     // Do the expansion with floating-point types, since that is what the VFP
6722     // registers are defined to use, and since i64 is not legal.
6723     EVT EltVT = EVT::getFloatingPointVT(EltSize);
6724     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
6725     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
6726     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
6727     SmallVector<SDValue, 8> Ops;
6728     for (unsigned i = 0; i < NumElts; ++i) {
6729       if (ShuffleMask[i] < 0)
6730         Ops.push_back(DAG.getUNDEF(EltVT));
6731       else
6732         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
6733                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
6734                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
6735                                                   dl, MVT::i32)));
6736     }
6737     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
6738     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6739   }
6740 
6741   if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
6742     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
6743 
6744   if (VT == MVT::v8i8)
6745     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
6746       return NewOp;
6747 
6748   return SDValue();
6749 }
6750 
6751 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6752   // INSERT_VECTOR_ELT is legal only for immediate indexes.
6753   SDValue Lane = Op.getOperand(2);
6754   if (!isa<ConstantSDNode>(Lane))
6755     return SDValue();
6756 
6757   return Op;
6758 }
6759 
6760 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
6761   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
6762   SDValue Lane = Op.getOperand(1);
6763   if (!isa<ConstantSDNode>(Lane))
6764     return SDValue();
6765 
6766   SDValue Vec = Op.getOperand(0);
6767   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
6768     SDLoc dl(Op);
6769     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
6770   }
6771 
6772   return Op;
6773 }
6774 
6775 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6776   // The only time a CONCAT_VECTORS operation can have legal types is when
6777   // two 64-bit vectors are concatenated to a 128-bit vector.
6778   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
6779          "unexpected CONCAT_VECTORS");
6780   SDLoc dl(Op);
6781   SDValue Val = DAG.getUNDEF(MVT::v2f64);
6782   SDValue Op0 = Op.getOperand(0);
6783   SDValue Op1 = Op.getOperand(1);
6784   if (!Op0.isUndef())
6785     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6786                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
6787                       DAG.getIntPtrConstant(0, dl));
6788   if (!Op1.isUndef())
6789     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
6790                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
6791                       DAG.getIntPtrConstant(1, dl));
6792   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
6793 }
6794 
6795 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
6796 /// element has been zero/sign-extended, depending on the isSigned parameter,
6797 /// from an integer type half its size.
6798 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
6799                                    bool isSigned) {
6800   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
6801   EVT VT = N->getValueType(0);
6802   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
6803     SDNode *BVN = N->getOperand(0).getNode();
6804     if (BVN->getValueType(0) != MVT::v4i32 ||
6805         BVN->getOpcode() != ISD::BUILD_VECTOR)
6806       return false;
6807     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6808     unsigned HiElt = 1 - LoElt;
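    // Lanes {LoElt, HiElt} form the first i64 and {LoElt+2, HiElt+2} the
    // second.  The constant was sign-extended iff each high word equals the
    // sign-extension of its low word, and zero-extended iff both high words
    // are zero.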
6809     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
6810     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
6811     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
6812     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
6813     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
6814       return false;
6815     if (isSigned) {
6816       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
6817           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
6818         return true;
6819     } else {
6820       if (Hi0->isNullValue() && Hi1->isNullValue())
6821         return true;
6822     }
6823     return false;
6824   }
6825 
6826   if (N->getOpcode() != ISD::BUILD_VECTOR)
6827     return false;
6828 
6829   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
6830     SDNode *Elt = N->getOperand(i).getNode();
6831     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
6832       unsigned EltSize = VT.getScalarSizeInBits();
6833       unsigned HalfSize = EltSize / 2;
6834       if (isSigned) {
6835         if (!isIntN(HalfSize, C->getSExtValue()))
6836           return false;
6837       } else {
6838         if (!isUIntN(HalfSize, C->getZExtValue()))
6839           return false;
6840       }
6841       continue;
6842     }
6843     return false;
6844   }
6845 
6846   return true;
6847 }
6848 
6849 /// isSignExtended - Check if a node is a vector value that is sign-extended
6850 /// or a constant BUILD_VECTOR with sign-extended elements.
6851 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
6852   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
6853     return true;
6854   if (isExtendedBUILD_VECTOR(N, DAG, true))
6855     return true;
6856   return false;
6857 }
6858 
6859 /// isZeroExtended - Check if a node is a vector value that is zero-extended
6860 /// or a constant BUILD_VECTOR with zero-extended elements.
6861 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
6862   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
6863     return true;
6864   if (isExtendedBUILD_VECTOR(N, DAG, false))
6865     return true;
6866   return false;
6867 }
6868 
6869 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
6870   if (OrigVT.getSizeInBits() >= 64)
6871     return OrigVT;
6872 
6873   assert(OrigVT.isSimple() && "Expecting a simple value type");
6874 
6875   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
6876   switch (OrigSimpleTy) {
6877   default: llvm_unreachable("Unexpected Vector Type");
6878   case MVT::v2i8:
6879   case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
6883   }
6884 }
6885 
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the
/// total value size to 64 bits. We need a 64-bit D register as an operand to
/// VMULL. We insert the required extension here to get the vector to fill a
/// D register.
6889 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
6890                                             const EVT &OrigTy,
6891                                             const EVT &ExtTy,
6892                                             unsigned ExtOpcode) {
6893   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
6894   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
6895   // 64-bits we need to insert a new extension so that it will be 64-bits.
6896   assert(ExtTy.is128BitVector() && "Unexpected extension size");
6897   if (OrigTy.getSizeInBits() >= 64)
6898     return N;
6899 
6900   // Must extend size to at least 64 bits to be used as an operand for VMULL.
6901   EVT NewVT = getExtensionTo64Bits(OrigTy);
6902 
6903   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
6904 }
6905 
6906 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
6907 /// does not do any sign/zero extension. If the original vector is less
6908 /// than 64 bits, an appropriate extension will be added after the load to
6909 /// reach a total size of 64 bits. We have to add the extension separately
6910 /// because ARM does not have a sign/zero extending load for vectors.
6911 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
6912   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
6913 
6914   // The load already has the right type.
6915   if (ExtendedTy == LD->getMemoryVT())
6916     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
6917                        LD->getBasePtr(), LD->getPointerInfo(),
6918                        LD->getAlignment(), LD->getMemOperand()->getFlags());
6919 
  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
6923   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
6924                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
6925                         LD->getMemoryVT(), LD->getAlignment(),
6926                         LD->getMemOperand()->getFlags());
6927 }
6928 
6929 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
6930 /// extending load, or BUILD_VECTOR with extended elements, return the
6931 /// unextended value. The unextended vector should be 64 bits so that it can
6932 /// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
6934 /// the vector to 64 bits.
6935 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
6936   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
6937     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
6938                                         N->getOperand(0)->getValueType(0),
6939                                         N->getValueType(0),
6940                                         N->getOpcode());
6941 
6942   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
6943     return SkipLoadExtensionForVMULL(LD, DAG);
6944 
6945   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
6946   // have been legalized as a BITCAST from v4i32.
6947   if (N->getOpcode() == ISD::BITCAST) {
6948     SDNode *BVN = N->getOperand(0).getNode();
6949     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
6950            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
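    // Only the low i32 word of each i64 lane carries the unextended value,
    // so rebuild a v2i32 from those words (big-endian picks the other word
    // of each pair).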
6951     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
6952     return DAG.getBuildVector(
6953         MVT::v2i32, SDLoc(N),
6954         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
6955   }
6956   // Construct a new BUILD_VECTOR with elements truncated to half the size.
6957   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
6958   EVT VT = N->getValueType(0);
6959   unsigned EltSize = VT.getScalarSizeInBits() / 2;
6960   unsigned NumElts = VT.getVectorNumElements();
6961   MVT TruncVT = MVT::getIntegerVT(EltSize);
6962   SmallVector<SDValue, 8> Ops;
6963   SDLoc dl(N);
6964   for (unsigned i = 0; i != NumElts; ++i) {
6965     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
6966     const APInt &CInt = C->getAPIntValue();
6967     // Element types smaller than 32 bits are not legal, so use i32 elements.
6968     // The values are implicitly truncated so sext vs. zext doesn't matter.
6969     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
6970   }
6971   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
6972 }
6973 
6974 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
6975   unsigned Opcode = N->getOpcode();
6976   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6977     SDNode *N0 = N->getOperand(0).getNode();
6978     SDNode *N1 = N->getOperand(1).getNode();
6979     return N0->hasOneUse() && N1->hasOneUse() &&
6980       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
6981   }
6982   return false;
6983 }
6984 
6985 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
6986   unsigned Opcode = N->getOpcode();
6987   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
6988     SDNode *N0 = N->getOperand(0).getNode();
6989     SDNode *N1 = N->getOperand(1).getNode();
6990     return N0->hasOneUse() && N1->hasOneUse() &&
6991       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
6992   }
6993   return false;
6994 }
6995 
6996 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
6997   // Multiplications are only custom-lowered for 128-bit vectors so that
6998   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
6999   EVT VT = Op.getValueType();
7000   assert(VT.is128BitVector() && VT.isInteger() &&
7001          "unexpected type for custom-lowering ISD::MUL");
7002   SDNode *N0 = Op.getOperand(0).getNode();
7003   SDNode *N1 = Op.getOperand(1).getNode();
7004   unsigned NewOpc = 0;
7005   bool isMLA = false;
7006   bool isN0SExt = isSignExtended(N0, DAG);
7007   bool isN1SExt = isSignExtended(N1, DAG);
7008   if (isN0SExt && isN1SExt)
7009     NewOpc = ARMISD::VMULLs;
7010   else {
7011     bool isN0ZExt = isZeroExtended(N0, DAG);
7012     bool isN1ZExt = isZeroExtended(N1, DAG);
7013     if (isN0ZExt && isN1ZExt)
7014       NewOpc = ARMISD::VMULLu;
7015     else if (isN1SExt || isN1ZExt) {
7016       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
7017       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
7018       if (isN1SExt && isAddSubSExt(N0, DAG)) {
7019         NewOpc = ARMISD::VMULLs;
7020         isMLA = true;
7021       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
7022         NewOpc = ARMISD::VMULLu;
7023         isMLA = true;
7024       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
7025         std::swap(N0, N1);
7026         NewOpc = ARMISD::VMULLu;
7027         isMLA = true;
7028       }
7029     }
7030 
7031     if (!NewOpc) {
7032       if (VT == MVT::v2i64)
7033         // Fall through to expand this.  It is not legal.
7034         return SDValue();
7035       else
7036         // Other vector multiplications are legal.
7037         return Op;
7038     }
7039   }
7040 
7041   // Legalize to a VMULL instruction.
7042   SDLoc DL(Op);
7043   SDValue Op0;
7044   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
7045   if (!isMLA) {
7046     Op0 = SkipExtensionForVMULL(N0, DAG);
7047     assert(Op0.getValueType().is64BitVector() &&
7048            Op1.getValueType().is64BitVector() &&
7049            "unexpected types for extended operands to VMULL");
7050     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
7051   }
7052 
  // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
7055   //   vmull q0, d4, d6
7056   //   vmlal q0, d5, d6
7057   // is faster than
7058   //   vaddl q0, d4, d5
7059   //   vmovl q1, d6
7060   //   vmul  q0, q0, q1
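  // N0 is the extended add/sub: strip the extensions from its two operands
  // and from N1, emit one VMULL per operand, and recombine the products with
  // N0's original add/sub opcode.  The bitcasts below only reconcile the
  // 64-bit operand types.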
7061   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
7062   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
7063   EVT Op1VT = Op1.getValueType();
7064   return DAG.getNode(N0->getOpcode(), DL, VT,
7065                      DAG.getNode(NewOpc, DL, VT,
7066                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
7067                      DAG.getNode(NewOpc, DL, VT,
7068                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
7069 }
7070 
7071 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
7072                               SelectionDAG &DAG) {
7073   // TODO: Should this propagate fast-math-flags?
7074 
7075   // Convert to float
7076   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
7077   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
7078   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
7079   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
7080   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
7081   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
7082   // Get reciprocal estimate.
7083   // float4 recip = vrecpeq_f32(yf);
7084   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7085                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7086                    Y);
7087   // Because char has a smaller range than uchar, we can actually get away
7088   // without any newton steps.  This requires that we use a weird bias
7089   // of 0xb000, however (again, this has been exhaustively tested).
7090   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
7091   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
7092   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
7093   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
7094   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
7095   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
7096   // Convert back to short.
7097   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
7098   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
7099   return X;
7100 }
7101 
7102 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
7103                                SelectionDAG &DAG) {
7104   // TODO: Should this propagate fast-math-flags?
7105 
7106   SDValue N2;
7107   // Convert to float.
7108   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
7109   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
7110   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
7111   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
7112   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7113   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7114 
7115   // Use reciprocal estimate and one refinement step.
7116   // float4 recip = vrecpeq_f32(yf);
7117   // recip *= vrecpsq_f32(yf, recip);
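  // VRECPS computes 2.0 - a*b, so multiplying the estimate by it performs
  // one Newton-Raphson step x' = x * (2 - y*x) towards 1/y.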
7118   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7119                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7120                    N1);
7121   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7122                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7123                    N1, N2);
7124   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7125   // Because short has a smaller range than ushort, we can actually get away
7126   // with only a single newton step.  This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
7128   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
7129   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7130   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7131   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
7132   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7133   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7134   // Convert back to integer and return.
7135   // return vmovn_s32(vcvt_s32_f32(result));
7136   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7137   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7138   return N0;
7139 }
7140 
7141 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
7142   EVT VT = Op.getValueType();
7143   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7144          "unexpected type for custom-lowering ISD::SDIV");
7145 
7146   SDLoc dl(Op);
7147   SDValue N0 = Op.getOperand(0);
7148   SDValue N1 = Op.getOperand(1);
7149   SDValue N2, N3;
7150 
7151   if (VT == MVT::v8i8) {
7152     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
7153     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
7154 
7155     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7156                      DAG.getIntPtrConstant(4, dl));
7157     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7158                      DAG.getIntPtrConstant(4, dl));
7159     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7160                      DAG.getIntPtrConstant(0, dl));
7161     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7162                      DAG.getIntPtrConstant(0, dl));
7163 
7164     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
7165     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
7166 
7167     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7168     N0 = LowerCONCAT_VECTORS(N0, DAG);
7169 
7170     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
7171     return N0;
7172   }
7173   return LowerSDIV_v4i16(N0, N1, dl, DAG);
7174 }
7175 
7176 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
7177   // TODO: Should this propagate fast-math-flags?
7178   EVT VT = Op.getValueType();
7179   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
7180          "unexpected type for custom-lowering ISD::UDIV");
7181 
7182   SDLoc dl(Op);
7183   SDValue N0 = Op.getOperand(0);
7184   SDValue N1 = Op.getOperand(1);
7185   SDValue N2, N3;
7186 
7187   if (VT == MVT::v8i8) {
7188     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
7189     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
7190 
7191     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7192                      DAG.getIntPtrConstant(4, dl));
7193     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7194                      DAG.getIntPtrConstant(4, dl));
7195     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
7196                      DAG.getIntPtrConstant(0, dl));
7197     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
7198                      DAG.getIntPtrConstant(0, dl));
7199 
7200     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
7201     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
7202 
7203     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
7204     N0 = LowerCONCAT_VECTORS(N0, DAG);
7205 
7206     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
7207                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
7208                                      MVT::i32),
7209                      N0);
7210     return N0;
7211   }
7212 
  // v4i16 udiv ... Convert to float.
7214   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
7215   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
7216   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
7217   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
7218   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
7219   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
7220 
7221   // Use reciprocal estimate and two refinement steps.
7222   // float4 recip = vrecpeq_f32(yf);
7223   // recip *= vrecpsq_f32(yf, recip);
7224   // recip *= vrecpsq_f32(yf, recip);
7225   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7226                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
7227                    BN1);
7228   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7229                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7230                    BN1, N2);
7231   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7232   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
7233                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
7234                    BN1, N2);
7235   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
7236   // Simply multiplying by the reciprocal estimate can leave us a few ulps
7237   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
7238   // and that it will never cause us to return an answer too large).
7239   // float4 result = as_float4(as_int4(xf*recip) + 2);
7240   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
7241   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
7242   N1 = DAG.getConstant(2, dl, MVT::v4i32);
7243   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
7244   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
7245   // Convert back to integer and return.
7246   // return vmovn_u32(vcvt_s32_f32(result));
7247   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
7248   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
7249   return N0;
7250 }
7251 
7252 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
7253   EVT VT = Op.getNode()->getValueType(0);
7254   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
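  // The second i32 result models the carry/borrow flag; ADDE/SUBE also
  // consume an incoming flag as their third operand.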
7255 
7256   unsigned Opc;
7257   bool ExtraOp = false;
7258   switch (Op.getOpcode()) {
7259   default: llvm_unreachable("Invalid code");
7260   case ISD::ADDC: Opc = ARMISD::ADDC; break;
7261   case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
7262   case ISD::SUBC: Opc = ARMISD::SUBC; break;
7263   case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
7264   }
7265 
7266   if (!ExtraOp)
7267     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7268                        Op.getOperand(1));
7269   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
7270                      Op.getOperand(1), Op.getOperand(2));
7271 }
7272 
7273 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
7274   assert(Subtarget->isTargetDarwin());
7275 
  // For iOS, we want to call an alternative entry point: __sincos_stret;
  // return values are passed via sret.
7278   SDLoc dl(Op);
7279   SDValue Arg = Op.getOperand(0);
7280   EVT ArgVT = Arg.getValueType();
7281   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
7282   auto PtrVT = getPointerTy(DAG.getDataLayout());
7283 
7284   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7285   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7286 
7287   // Pair of floats / doubles used to pass the result.
7288   Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
7289   auto &DL = DAG.getDataLayout();
7290 
7291   ArgListTy Args;
7292   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
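  // Under APCS the {sin, cos} pair is returned indirectly: allocate a stack
  // slot, pass its address as an sret argument, and reload both fields after
  // the call.  Otherwise the call's own return value is used directly.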
7293   SDValue SRet;
7294   if (ShouldUseSRet) {
7295     // Create stack object for sret.
7296     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
7297     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
7298     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
7299     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
7300 
7301     ArgListEntry Entry;
7302     Entry.Node = SRet;
7303     Entry.Ty = RetTy->getPointerTo();
7304     Entry.isSExt = false;
7305     Entry.isZExt = false;
7306     Entry.isSRet = true;
7307     Args.push_back(Entry);
7308     RetTy = Type::getVoidTy(*DAG.getContext());
7309   }
7310 
7311   ArgListEntry Entry;
7312   Entry.Node = Arg;
7313   Entry.Ty = ArgTy;
7314   Entry.isSExt = false;
7315   Entry.isZExt = false;
7316   Args.push_back(Entry);
7317 
7318   const char *LibcallName =
7319       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
7320   RTLIB::Libcall LC =
7321       (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
7322   CallingConv::ID CC = getLibcallCallingConv(LC);
7323   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
7324 
7325   TargetLowering::CallLoweringInfo CLI(DAG);
7326   CLI.setDebugLoc(dl)
7327       .setChain(DAG.getEntryNode())
7328       .setCallee(CC, RetTy, Callee, std::move(Args))
7329       .setDiscardResult(ShouldUseSRet);
7330   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
7331 
7332   if (!ShouldUseSRet)
7333     return CallResult.first;
7334 
7335   SDValue LoadSin =
7336       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
7337 
7338   // Address of cos field.
7339   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
7340                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
7341   SDValue LoadCos =
7342       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
7343 
7344   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
7345   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
7346                      LoadSin.getValue(0), LoadCos.getValue(0));
7347 }
7348 
7349 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
7350                                                   bool Signed,
7351                                                   SDValue &Chain) const {
7352   EVT VT = Op.getValueType();
7353   assert((VT == MVT::i32 || VT == MVT::i64) &&
7354          "unexpected type for custom lowering DIV");
7355   SDLoc dl(Op);
7356 
7357   const auto &DL = DAG.getDataLayout();
7358   const auto &TLI = DAG.getTargetLoweringInfo();
7359 
7360   const char *Name = nullptr;
7361   if (Signed)
7362     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
7363   else
7364     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
7365 
7366   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
7367 
7368   ARMTargetLowering::ArgListTy Args;
7369 
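  // Push operand 1 (the divisor) before operand 0 (the dividend); the
  // Windows __rt_*div helpers take the divisor as their first argument.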
7370   for (auto AI : {1, 0}) {
7371     ArgListEntry Arg;
7372     Arg.Node = Op.getOperand(AI);
7373     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
7374     Args.push_back(Arg);
7375   }
7376 
7377   CallLoweringInfo CLI(DAG);
7378   CLI.setDebugLoc(dl)
7379     .setChain(Chain)
7380     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
7381                ES, std::move(Args));
7382 
7383   return LowerCallTo(CLI).first;
7384 }
7385 
7386 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
7387                                             bool Signed) const {
7388   assert(Op.getValueType() == MVT::i32 &&
7389          "unexpected type for custom lowering DIV");
7390   SDLoc dl(Op);
7391 
7392   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
7393                                DAG.getEntryNode(), Op.getOperand(1));
7394 
7395   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7396 }
7397 
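// Emit an ARMISD::WIN__DBZCHK on the denominator of N.  For an i64 divide
// the two halves are ORed together first, so the zero check fires only when
// the full 64-bit value is zero.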
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
                                      SDValue InChain) {
7399   SDLoc DL(N);
7400   SDValue Op = N->getOperand(1);
7401   if (N->getValueType(0) == MVT::i32)
7402     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
7403   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7404                            DAG.getConstant(0, DL, MVT::i32));
7405   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7406                            DAG.getConstant(1, DL, MVT::i32));
7407   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
7408                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
7409 }
7410 
7411 void ARMTargetLowering::ExpandDIV_Windows(
7412     SDValue Op, SelectionDAG &DAG, bool Signed,
7413     SmallVectorImpl<SDValue> &Results) const {
7414   const auto &DL = DAG.getDataLayout();
7415   const auto &TLI = DAG.getTargetLoweringInfo();
7416 
7417   assert(Op.getValueType() == MVT::i64 &&
7418          "unexpected type for custom lowering DIV");
7419   SDLoc dl(Op);
7420 
  SDValue DBZCHK =
      WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
7422 
7423   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7424 
7425   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
7426   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
7427                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
7428   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
7429 
7430   Results.push_back(Lower);
7431   Results.push_back(Upper);
7432 }
7433 
7434 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
7435   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
7436     // Acquire/Release load/store is not legal for targets without a dmb or
7437     // equivalent available.
7438     return SDValue();
7439 
7440   // Monotonic load/store is legal for all targets.
7441   return Op;
7442 }
7443 
7444 static void ReplaceREADCYCLECOUNTER(SDNode *N,
7445                                     SmallVectorImpl<SDValue> &Results,
7446                                     SelectionDAG &DAG,
7447                                     const ARMSubtarget *Subtarget) {
7448   SDLoc DL(N);
7449   // Under Power Management extensions, the cycle-count is:
7450   //    mrc p15, #0, <Rt>, c9, c13, #0
7451   SDValue Ops[] = { N->getOperand(0), // Chain
7452                     DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
7453                     DAG.getConstant(15, DL, MVT::i32),
7454                     DAG.getConstant(0, DL, MVT::i32),
7455                     DAG.getConstant(9, DL, MVT::i32),
7456                     DAG.getConstant(13, DL, MVT::i32),
7457                     DAG.getConstant(0, DL, MVT::i32)
7458   };
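  // Read the 32-bit cycle counter via the MRC intrinsic and widen it to the
  // i64 READCYCLECOUNTER is defined to produce by pairing it with a zero
  // high word.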
7459 
7460   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
7461                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
7462   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
7463                                 DAG.getConstant(0, DL, MVT::i32)));
7464   Results.push_back(Cycles32.getValue(1));
7465 }
7466 
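/// createGPRPairNode - Build an untyped REG_SEQUENCE that places the low and
/// high halves of the i64 value V into the gsub_0 and gsub_1 subregisters of
/// a GPRPair, the operand form expected by the CMP_SWAP_64 pseudo below.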
7467 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
7468   SDLoc dl(V.getNode());
7469   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
7470   SDValue VHi = DAG.getAnyExtOrTrunc(
7471       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
7472       dl, MVT::i32);
7473   SDValue RegClass =
7474       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
7475   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
7476   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
7477   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
7478   return SDValue(
7479       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
7480 }
7481 
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> &Results,
                                      SelectionDAG &DAG) {
7485   assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types smaller than 64 bits should be legal");
7487   SDValue Ops[] = {N->getOperand(1),
7488                    createGPRPairNode(DAG, N->getOperand(2)),
7489                    createGPRPairNode(DAG, N->getOperand(3)),
7490                    N->getOperand(0)};
7491   SDNode *CmpSwap = DAG.getMachineNode(
7492       ARM::CMP_SWAP_64, SDLoc(N),
7493       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
7494 
7495   MachineFunction &MF = DAG.getMachineFunction();
7496   MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
7497   MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
7498   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
7499 
7500   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
7501                                                SDValue(CmpSwap, 0)));
7502   Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
7503                                                SDValue(CmpSwap, 0)));
7504   Results.push_back(SDValue(CmpSwap, 2));
7505 }
7506 
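/// LowerOperation - Dispatch to the custom lowering routine for each
/// operation this target has marked as requiring custom handling.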
7507 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7508   switch (Op.getOpcode()) {
7509   default: llvm_unreachable("Don't know how to custom lower this!");
7510   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
7511   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
7512   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
7513   case ISD::GlobalAddress:
7514     switch (Subtarget->getTargetTriple().getObjectFormat()) {
7515     default: llvm_unreachable("unknown object format");
7516     case Triple::COFF:
7517       return LowerGlobalAddressWindows(Op, DAG);
7518     case Triple::ELF:
7519       return LowerGlobalAddressELF(Op, DAG);
7520     case Triple::MachO:
7521       return LowerGlobalAddressDarwin(Op, DAG);
7522     }
7523   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
7524   case ISD::SELECT:        return LowerSELECT(Op, DAG);
7525   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
7526   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
7527   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
7528   case ISD::VASTART:       return LowerVASTART(Op, DAG);
7529   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
7530   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
7531   case ISD::SINT_TO_FP:
7532   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
7533   case ISD::FP_TO_SINT:
7534   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
7535   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
7536   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
7537   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
7538   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
7539   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
7540   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
7541   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
7542                                                                Subtarget);
7543   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
7544   case ISD::SHL:
7545   case ISD::SRL:
7546   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
7547   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
7548   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
7549   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
7550   case ISD::SRL_PARTS:
7551   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
7552   case ISD::CTTZ:
7553   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
7554   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
7555   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
7556   case ISD::SETCCE:        return LowerSETCCE(Op, DAG);
7557   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
7558   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
7559   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
7560   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
7561   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7562   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
7563   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
7564   case ISD::MUL:           return LowerMUL(Op, DAG);
7565   case ISD::SDIV:
7566     if (Subtarget->isTargetWindows())
7567       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
7568     return LowerSDIV(Op, DAG);
7569   case ISD::UDIV:
7570     if (Subtarget->isTargetWindows())
7571       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
7572     return LowerUDIV(Op, DAG);
7573   case ISD::ADDC:
7574   case ISD::ADDE:
7575   case ISD::SUBC:
7576   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
7577   case ISD::SADDO:
7578   case ISD::UADDO:
7579   case ISD::SSUBO:
7580   case ISD::USUBO:
7581     return LowerXALUO(Op, DAG);
7582   case ISD::ATOMIC_LOAD:
7583   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
7584   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
7585   case ISD::SDIVREM:
7586   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
7587   case ISD::DYNAMIC_STACKALLOC:
7588     if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
7589       return LowerDYNAMIC_STACKALLOC(Op, DAG);
7590     llvm_unreachable("Don't know how to custom lower this!");
7591   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
7592   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
7593   case ARMISD::WIN__DBZCHK: return SDValue();
7594   }
7595 }
7596 
7597 /// ReplaceNodeResults - Replace the results of a node with an illegal result
7598 /// type with new values built out of custom code.
7599 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
7600                                            SmallVectorImpl<SDValue> &Results,
7601                                            SelectionDAG &DAG) const {
7602   SDValue Res;
7603   switch (N->getOpcode()) {
7604   default:
7605     llvm_unreachable("Don't know how to custom expand this!");
7606   case ISD::READ_REGISTER:
7607     ExpandREAD_REGISTER(N, Results, DAG);
7608     break;
7609   case ISD::BITCAST:
7610     Res = ExpandBITCAST(N, DAG);
7611     break;
7612   case ISD::SRL:
7613   case ISD::SRA:
7614     Res = Expand64BitShift(N, DAG, Subtarget);
7615     break;
7616   case ISD::SREM:
7617   case ISD::UREM:
7618     Res = LowerREM(N, DAG);
7619     break;
7620   case ISD::SDIVREM:
7621   case ISD::UDIVREM:
7622     Res = LowerDivRem(SDValue(N, 0), DAG);
7623     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
7624     Results.push_back(Res.getValue(0));
7625     Results.push_back(Res.getValue(1));
7626     return;
7627   case ISD::READCYCLECOUNTER:
7628     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
7629     return;
7630   case ISD::UDIV:
7631   case ISD::SDIV:
7632     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
7633     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
7634                              Results);
7635   case ISD::ATOMIC_CMP_SWAP:
7636     ReplaceCMP_SWAP_64Results(N, Results, DAG);
7637     return;
7638   }
7639   if (Res.getNode())
7640     Results.push_back(Res);
7641 }
7642 
7643 //===----------------------------------------------------------------------===//
7644 //                           ARM Scheduler Hooks
7645 //===----------------------------------------------------------------------===//
7646 
7647 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
7648 /// registers the function context.
7649 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
7650                                                MachineBasicBlock *MBB,
7651                                                MachineBasicBlock *DispatchBB,
7652                                                int FI) const {
7653   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
7654          "ROPI/RWPI not currently supported with SjLj");
7655   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7656   DebugLoc dl = MI.getDebugLoc();
7657   MachineFunction *MF = MBB->getParent();
7658   MachineRegisterInfo *MRI = &MF->getRegInfo();
7659   MachineConstantPool *MCP = MF->getConstantPool();
7660   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
7661   const Function *F = MF->getFunction();
7662 
7663   bool isThumb = Subtarget->isThumb();
7664   bool isThumb2 = Subtarget->isThumb2();
7665 
7666   unsigned PCLabelId = AFI->createPICLabelUId();
7667   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
7668   ARMConstantPoolValue *CPV =
7669     ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
7670   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
7671 
7672   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
7673                                            : &ARM::GPRRegClass;
7674 
7675   // Grab constant pool and fixed stack memory operands.
7676   MachineMemOperand *CPMMO =
7677       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
7678                                MachineMemOperand::MOLoad, 4, 4);
7679 
7680   MachineMemOperand *FIMMOSt =
7681       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
7682                                MachineMemOperand::MOStore, 4, 4);
7683 
7684   // Load the address of the dispatch MBB into the jump buffer.
7685   if (isThumb2) {
7686     // Incoming value: jbuf
7687     //   ldr.n  r5, LCPI1_1
7688     //   orr    r5, r5, #1
7689     //   add    r5, pc
7690     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
7691     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7692     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
7693                    .addConstantPoolIndex(CPI)
7694                    .addMemOperand(CPMMO));
7695     // Set the low bit because of thumb mode.
7696     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7697     AddDefaultCC(
7698       AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
7699                      .addReg(NewVReg1, RegState::Kill)
7700                      .addImm(0x01)));
7701     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7702     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
7703       .addReg(NewVReg2, RegState::Kill)
7704       .addImm(PCLabelId);
7705     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
7706                    .addReg(NewVReg3, RegState::Kill)
7707                    .addFrameIndex(FI)
7708                    .addImm(36)  // &jbuf[1] :: pc
7709                    .addMemOperand(FIMMOSt));
7710   } else if (isThumb) {
7711     // Incoming value: jbuf
7712     //   ldr.n  r1, LCPI1_4
7713     //   add    r1, pc
7714     //   mov    r2, #1
7715     //   orrs   r1, r2
7716     //   add    r2, $jbuf, #+4 ; &jbuf[1]
7717     //   str    r1, [r2]
7718     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7719     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
7720                    .addConstantPoolIndex(CPI)
7721                    .addMemOperand(CPMMO));
7722     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7723     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
7724       .addReg(NewVReg1, RegState::Kill)
7725       .addImm(PCLabelId);
7726     // Set the low bit because of thumb mode.
7727     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7728     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
7729                    .addReg(ARM::CPSR, RegState::Define)
7730                    .addImm(1));
7731     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7732     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
7733                    .addReg(ARM::CPSR, RegState::Define)
7734                    .addReg(NewVReg2, RegState::Kill)
7735                    .addReg(NewVReg3, RegState::Kill));
7736     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7737     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
7738             .addFrameIndex(FI)
7739             .addImm(36); // &jbuf[1] :: pc
7740     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
7741                    .addReg(NewVReg4, RegState::Kill)
7742                    .addReg(NewVReg5, RegState::Kill)
7743                    .addImm(0)
7744                    .addMemOperand(FIMMOSt));
7745   } else {
7746     // Incoming value: jbuf
7747     //   ldr  r1, LCPI1_1
7748     //   add  r1, pc, r1
7749     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
7750     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7751     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
7752                    .addConstantPoolIndex(CPI)
7753                    .addImm(0)
7754                    .addMemOperand(CPMMO));
7755     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7756     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
7757                    .addReg(NewVReg1, RegState::Kill)
7758                    .addImm(PCLabelId));
7759     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
7760                    .addReg(NewVReg2, RegState::Kill)
7761                    .addFrameIndex(FI)
7762                    .addImm(36)  // &jbuf[1] :: pc
7763                    .addMemOperand(FIMMOSt));
7764   }
7765 }
7766 
7767 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
7768                                               MachineBasicBlock *MBB) const {
7769   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
7770   DebugLoc dl = MI.getDebugLoc();
7771   MachineFunction *MF = MBB->getParent();
7772   MachineRegisterInfo *MRI = &MF->getRegInfo();
7773   MachineFrameInfo &MFI = MF->getFrameInfo();
7774   int FI = MFI.getFunctionContextIndex();
7775 
7776   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
7777                                                         : &ARM::GPRnopcRegClass;
7778 
7779   // Get a mapping of the call site numbers to all of the landing pads they're
7780   // associated with.
7781   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
7782   unsigned MaxCSNum = 0;
7783   MachineModuleInfo &MMI = MF->getMMI();
7784   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
7785        ++BB) {
7786     if (!BB->isEHPad()) continue;
7787 
7788     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
7789     // pad.
7790     for (MachineBasicBlock::iterator
7791            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
7792       if (!II->isEHLabel()) continue;
7793 
7794       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
7795       if (!MMI.hasCallSiteLandingPad(Sym)) continue;
7796 
7797       SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
7798       for (SmallVectorImpl<unsigned>::iterator
7799              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
7800            CSI != CSE; ++CSI) {
7801         CallSiteNumToLPad[*CSI].push_back(&*BB);
7802         MaxCSNum = std::max(MaxCSNum, *CSI);
7803       }
7804       break;
7805     }
7806   }
7807 
7808   // Get an ordered list of the machine basic blocks for the jump table.
7809   std::vector<MachineBasicBlock*> LPadList;
7810   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
7811   LPadList.reserve(CallSiteNumToLPad.size());
7812   for (unsigned I = 1; I <= MaxCSNum; ++I) {
7813     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
7814     for (SmallVectorImpl<MachineBasicBlock*>::iterator
7815            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
7816       LPadList.push_back(*II);
7817       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
7818     }
7819   }
7820 
7821   assert(!LPadList.empty() &&
7822          "No landing pad destinations for the dispatch jump table!");
7823 
7824   // Create the jump table and associated information.
7825   MachineJumpTableInfo *JTI =
7826     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
7827   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
7828 
7829   // Create the MBBs for the dispatch code.
7830 
7831   // Shove the dispatch's address into the return slot in the function context.
7832   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
7833   DispatchBB->setIsEHPad();
7834 
7835   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7836   unsigned trap_opcode;
7837   if (Subtarget->isThumb())
7838     trap_opcode = ARM::tTRAP;
7839   else
7840     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
7841 
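  // The trap block is the dispatch target used when the call-site index read
  // from the function context is out of range for the jump table.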
7842   BuildMI(TrapBB, dl, TII->get(trap_opcode));
7843   DispatchBB->addSuccessor(TrapBB);
7844 
7845   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
7846   DispatchBB->addSuccessor(DispContBB);
7847 
7848   // Insert the new MBBs at the end of the function.
7849   MF->insert(MF->end(), DispatchBB);
7850   MF->insert(MF->end(), DispContBB);
7851   MF->insert(MF->end(), TrapBB);
7852 
7853   // Insert code into the entry block that creates and registers the function
7854   // context.
7855   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
7856 
7857   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
7858       MachinePointerInfo::getFixedStack(*MF, FI),
7859       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
7860 
7861   MachineInstrBuilder MIB;
7862   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
7863 
7864   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
7865   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
7866 
7867   // Add a register mask with no preserved registers.  This results in all
7868   // registers being marked as clobbered. This can't work if the dispatch block
7869   // is in a Thumb1 function and is linked with ARM code which uses the FP
7870   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
7871   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
7872 
7873   bool IsPositionIndependent = isPositionIndependent();
7874   unsigned NumLPads = LPadList.size();
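  // Emit the dispatch sequence: load the call-site index from the function
  // context, bounds-check it against the number of landing pads (branching to
  // the trap block if it is out of range), and jump through the jump table.
  // The exact instructions depend on whether we are in ARM, Thumb1 or Thumb2.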
7875   if (Subtarget->isThumb2()) {
7876     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7877     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
7878                    .addFrameIndex(FI)
7879                    .addImm(4)
7880                    .addMemOperand(FIMMOLd));
7881 
7882     if (NumLPads < 256) {
7883       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
7884                      .addReg(NewVReg1)
7885                      .addImm(LPadList.size()));
7886     } else {
7887       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7888       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
7889                      .addImm(NumLPads & 0xFFFF));
7890 
7891       unsigned VReg2 = VReg1;
7892       if ((NumLPads & 0xFFFF0000) != 0) {
7893         VReg2 = MRI->createVirtualRegister(TRC);
7894         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
7895                        .addReg(VReg1)
7896                        .addImm(NumLPads >> 16));
7897       }
7898 
7899       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
7900                      .addReg(NewVReg1)
7901                      .addReg(VReg2));
7902     }
7903 
7904     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
7905       .addMBB(TrapBB)
7906       .addImm(ARMCC::HI)
7907       .addReg(ARM::CPSR);
7908 
7909     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7910     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
7911                    .addJumpTableIndex(MJTI));
7912 
7913     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7914     AddDefaultCC(
7915       AddDefaultPred(
7916         BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
7917         .addReg(NewVReg3, RegState::Kill)
7918         .addReg(NewVReg1)
7919         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
7920 
7921     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
7922       .addReg(NewVReg4, RegState::Kill)
7923       .addReg(NewVReg1)
7924       .addJumpTableIndex(MJTI);
7925   } else if (Subtarget->isThumb()) {
7926     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
7927     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
7928                    .addFrameIndex(FI)
7929                    .addImm(1)
7930                    .addMemOperand(FIMMOLd));
7931 
7932     if (NumLPads < 256) {
7933       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
7934                      .addReg(NewVReg1)
7935                      .addImm(NumLPads));
7936     } else {
7937       MachineConstantPool *ConstantPool = MF->getConstantPool();
7938       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7939       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
7940 
7941       // MachineConstantPool wants an explicit alignment.
7942       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
7943       if (Align == 0)
7944         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
7945       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7946 
7947       unsigned VReg1 = MRI->createVirtualRegister(TRC);
7948       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
7949                      .addReg(VReg1, RegState::Define)
7950                      .addConstantPoolIndex(Idx));
7951       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
7952                      .addReg(NewVReg1)
7953                      .addReg(VReg1));
7954     }
7955 
7956     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
7957       .addMBB(TrapBB)
7958       .addImm(ARMCC::HI)
7959       .addReg(ARM::CPSR);
7960 
7961     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
7962     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
7963                    .addReg(ARM::CPSR, RegState::Define)
7964                    .addReg(NewVReg1)
7965                    .addImm(2));
7966 
7967     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
7968     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
7969                    .addJumpTableIndex(MJTI));
7970 
7971     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
7972     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
7973                    .addReg(ARM::CPSR, RegState::Define)
7974                    .addReg(NewVReg2, RegState::Kill)
7975                    .addReg(NewVReg3));
7976 
7977     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
7978         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
7979 
7980     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
7981     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
7982                    .addReg(NewVReg4, RegState::Kill)
7983                    .addImm(0)
7984                    .addMemOperand(JTMMOLd));
7985 
7986     unsigned NewVReg6 = NewVReg5;
7987     if (IsPositionIndependent) {
7988       NewVReg6 = MRI->createVirtualRegister(TRC);
7989       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
7990                      .addReg(ARM::CPSR, RegState::Define)
7991                      .addReg(NewVReg5, RegState::Kill)
7992                      .addReg(NewVReg3));
7993     }
7994 
7995     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
7996       .addReg(NewVReg6, RegState::Kill)
7997       .addJumpTableIndex(MJTI);
7998   } else {
7999     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
8000     AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
8001                    .addFrameIndex(FI)
8002                    .addImm(4)
8003                    .addMemOperand(FIMMOLd));
8004 
8005     if (NumLPads < 256) {
8006       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
8007                      .addReg(NewVReg1)
8008                      .addImm(NumLPads));
8009     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
8010       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8011       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
8012                      .addImm(NumLPads & 0xFFFF));
8013 
8014       unsigned VReg2 = VReg1;
8015       if ((NumLPads & 0xFFFF0000) != 0) {
8016         VReg2 = MRI->createVirtualRegister(TRC);
8017         AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
8018                        .addReg(VReg1)
8019                        .addImm(NumLPads >> 16));
8020       }
8021 
8022       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8023                      .addReg(NewVReg1)
8024                      .addReg(VReg2));
8025     } else {
8026       MachineConstantPool *ConstantPool = MF->getConstantPool();
8027       Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8028       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
8029 
8030       // MachineConstantPool wants an explicit alignment.
8031       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8032       if (Align == 0)
8033         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8034       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8035 
8036       unsigned VReg1 = MRI->createVirtualRegister(TRC);
8037       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
8038                      .addReg(VReg1, RegState::Define)
8039                      .addConstantPoolIndex(Idx)
8040                      .addImm(0));
8041       AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
8042                      .addReg(NewVReg1)
8043                      .addReg(VReg1, RegState::Kill));
8044     }
8045 
8046     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
8047       .addMBB(TrapBB)
8048       .addImm(ARMCC::HI)
8049       .addReg(ARM::CPSR);
8050 
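    // ARM mode: scale the call-site index by four, use it to index the jump
    // table addressed via LEApcrelJT, and branch to the loaded destination.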
8051     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
8052     AddDefaultCC(
8053       AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
8054                      .addReg(NewVReg1)
8055                      .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
8056     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
8057     AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
8058                    .addJumpTableIndex(MJTI));
8059 
8060     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
8061         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
8062     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
8063     AddDefaultPred(
8064       BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
8065       .addReg(NewVReg3, RegState::Kill)
8066       .addReg(NewVReg4)
8067       .addImm(0)
8068       .addMemOperand(JTMMOLd));
8069 
8070     if (IsPositionIndependent) {
8071       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
8072         .addReg(NewVReg5, RegState::Kill)
8073         .addReg(NewVReg4)
8074         .addJumpTableIndex(MJTI);
8075     } else {
8076       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
8077         .addReg(NewVReg5, RegState::Kill)
8078         .addJumpTableIndex(MJTI);
8079     }
8080   }
8081 
8082   // Add the jump table entries as successors to the MBB.
8083   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
8084   for (std::vector<MachineBasicBlock*>::iterator
8085          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
8086     MachineBasicBlock *CurMBB = *I;
8087     if (SeenMBBs.insert(CurMBB).second)
8088       DispContBB->addSuccessor(CurMBB);
8089   }
8090 
8091   // N.B. the order the invoke BBs are processed in doesn't matter here.
8092   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
8093   SmallVector<MachineBasicBlock*, 64> MBBLPads;
8094   for (MachineBasicBlock *BB : InvokeBBs) {
8095 
8096     // Remove the landing pad successor from the invoke block and replace it
8097     // with the new dispatch block.
8098     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
8099                                                   BB->succ_end());
8100     while (!Successors.empty()) {
8101       MachineBasicBlock *SMBB = Successors.pop_back_val();
8102       if (SMBB->isEHPad()) {
8103         BB->removeSuccessor(SMBB);
8104         MBBLPads.push_back(SMBB);
8105       }
8106     }
8107 
8108     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
8109     BB->normalizeSuccProbs();
8110 
8111     // Find the invoke call and mark all of the callee-saved registers as
8112     // 'implicit defined' so that they're spilled. This prevents code from
8113     // moving instructions to before the EH block, where they will never be
8114     // executed.
8115     for (MachineBasicBlock::reverse_iterator
8116            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
8117       if (!II->isCall()) continue;
8118 
8119       DenseMap<unsigned, bool> DefRegs;
8120       for (MachineInstr::mop_iterator
8121              OI = II->operands_begin(), OE = II->operands_end();
8122            OI != OE; ++OI) {
8123         if (!OI->isReg()) continue;
8124         DefRegs[OI->getReg()] = true;
8125       }
8126 
8127       MachineInstrBuilder MIB(*MF, &*II);
8128 
8129       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
8130         unsigned Reg = SavedRegs[i];
8131         if (Subtarget->isThumb2() &&
8132             !ARM::tGPRRegClass.contains(Reg) &&
8133             !ARM::hGPRRegClass.contains(Reg))
8134           continue;
8135         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
8136           continue;
8137         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
8138           continue;
8139         if (!DefRegs[Reg])
8140           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
8141       }
8142 
8143       break;
8144     }
8145   }
8146 
8147   // Mark all former landing pads as non-landing pads. The dispatch is the only
8148   // landing pad now.
8149   for (SmallVectorImpl<MachineBasicBlock*>::iterator
8150          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
8151     (*I)->setIsEHPad(false);
8152 
8153   // The instruction is gone now.
8154   MI.eraseFromParent();
8155 }
8156 
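/// OtherSucc - Return the successor of MBB that is not Succ; MBB is expected
/// to have exactly two successors.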
8157 static
8158 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
8159   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
8160        E = MBB->succ_end(); I != E; ++I)
8161     if (*I != Succ)
8162       return *I;
8163   llvm_unreachable("Expecting a BB with two successors!");
8164 }
8165 
8166 /// Return the load opcode for a given load size. If load size >= 8, a NEON
8167 /// opcode will be returned.
8168 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
8169   if (LdSize >= 8)
8170     return LdSize == 16 ? ARM::VLD1q32wb_fixed
8171                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
8172   if (IsThumb1)
8173     return LdSize == 4 ? ARM::tLDRi
8174                        : LdSize == 2 ? ARM::tLDRHi
8175                                      : LdSize == 1 ? ARM::tLDRBi : 0;
8176   if (IsThumb2)
8177     return LdSize == 4 ? ARM::t2LDR_POST
8178                        : LdSize == 2 ? ARM::t2LDRH_POST
8179                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
8180   return LdSize == 4 ? ARM::LDR_POST_IMM
8181                      : LdSize == 2 ? ARM::LDRH_POST
8182                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
8183 }
8184 
8185 /// Return the store opcode for a given store size. If store size >= 8, a NEON
8186 /// opcode will be returned.
8187 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
8188   if (StSize >= 8)
8189     return StSize == 16 ? ARM::VST1q32wb_fixed
8190                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
8191   if (IsThumb1)
8192     return StSize == 4 ? ARM::tSTRi
8193                        : StSize == 2 ? ARM::tSTRHi
8194                                      : StSize == 1 ? ARM::tSTRBi : 0;
8195   if (IsThumb2)
8196     return StSize == 4 ? ARM::t2STR_POST
8197                        : StSize == 2 ? ARM::t2STRH_POST
8198                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
8199   return StSize == 4 ? ARM::STR_POST_IMM
8200                      : StSize == 2 ? ARM::STRH_POST
8201                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
8202 }
8203 
8204 /// Emit a post-increment load operation with the given size. The instructions
8205 /// will be added to BB at Pos.
8206 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8207                        const TargetInstrInfo *TII, const DebugLoc &dl,
8208                        unsigned LdSize, unsigned Data, unsigned AddrIn,
8209                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8210   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
8211   assert(LdOpc != 0 && "Should have a load opcode");
8212   if (LdSize >= 8) {
8213     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8214                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8215                        .addImm(0));
8216   } else if (IsThumb1) {
8217     // load + update AddrIn
8218     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8219                        .addReg(AddrIn).addImm(0));
8220     MachineInstrBuilder MIB =
8221         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
8222     MIB = AddDefaultT1CC(MIB);
8223     MIB.addReg(AddrIn).addImm(LdSize);
8224     AddDefaultPred(MIB);
8225   } else if (IsThumb2) {
8226     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8227                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8228                        .addImm(LdSize));
8229   } else { // arm
8230     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
8231                        .addReg(AddrOut, RegState::Define).addReg(AddrIn)
8232                        .addReg(0).addImm(LdSize));
8233   }
8234 }
8235 
8236 /// Emit a post-increment store operation with the given size. The instructions
8237 /// will be added to BB at Pos.
8238 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
8239                        const TargetInstrInfo *TII, const DebugLoc &dl,
8240                        unsigned StSize, unsigned Data, unsigned AddrIn,
8241                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
8242   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
8243   assert(StOpc != 0 && "Should have a store opcode");
8244   if (StSize >= 8) {
8245     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8246                        .addReg(AddrIn).addImm(0).addReg(Data));
8247   } else if (IsThumb1) {
8248     // store + update AddrIn
8249     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
8250                        .addReg(AddrIn).addImm(0));
8251     MachineInstrBuilder MIB =
8252         BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
8253     MIB = AddDefaultT1CC(MIB);
8254     MIB.addReg(AddrIn).addImm(StSize);
8255     AddDefaultPred(MIB);
8256   } else if (IsThumb2) {
8257     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8258                        .addReg(Data).addReg(AddrIn).addImm(StSize));
8259   } else { // arm
8260     AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
8261                        .addReg(Data).addReg(AddrIn).addReg(0)
8262                        .addImm(StSize));
8263   }
8264 }
8265 
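/// EmitStructByval - Expand the byval-copy pseudo. Copies no larger than the
/// subtarget's inline threshold are unrolled into post-incrementing
/// load/store pairs; larger copies are emitted as a copy loop followed by a
/// byte-wise epilogue for any remainder.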
8266 MachineBasicBlock *
8267 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
8268                                    MachineBasicBlock *BB) const {
8269   // This pseudo instruction has 4 operands: dst, src, size, alignment
8270   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
8271   // Otherwise, we will generate unrolled scalar copies.
8272   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8273   const BasicBlock *LLVM_BB = BB->getBasicBlock();
8274   MachineFunction::iterator It = ++BB->getIterator();
8275 
8276   unsigned dest = MI.getOperand(0).getReg();
8277   unsigned src = MI.getOperand(1).getReg();
8278   unsigned SizeVal = MI.getOperand(2).getImm();
8279   unsigned Align = MI.getOperand(3).getImm();
8280   DebugLoc dl = MI.getDebugLoc();
8281 
8282   MachineFunction *MF = BB->getParent();
8283   MachineRegisterInfo &MRI = MF->getRegInfo();
8284   unsigned UnitSize = 0;
8285   const TargetRegisterClass *TRC = nullptr;
8286   const TargetRegisterClass *VecTRC = nullptr;
8287 
8288   bool IsThumb1 = Subtarget->isThumb1Only();
8289   bool IsThumb2 = Subtarget->isThumb2();
8290   bool IsThumb = Subtarget->isThumb();
8291 
8292   if (Align & 1) {
8293     UnitSize = 1;
8294   } else if (Align & 2) {
8295     UnitSize = 2;
8296   } else {
8297     // Check whether we can use NEON instructions.
8298     if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
8299         Subtarget->hasNEON()) {
8300       if ((Align % 16 == 0) && SizeVal >= 16)
8301         UnitSize = 16;
8302       else if ((Align % 8 == 0) && SizeVal >= 8)
8303         UnitSize = 8;
8304     }
8305     // Can't use NEON instructions.
8306     if (UnitSize == 0)
8307       UnitSize = 4;
8308   }
8309 
8310   // Select the correct opcode and register class for unit size load/store
8311   bool IsNeon = UnitSize >= 8;
8312   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
8313   if (IsNeon)
8314     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
8315                             : UnitSize == 8 ? &ARM::DPRRegClass
8316                                             : nullptr;
8317 
8318   unsigned BytesLeft = SizeVal % UnitSize;
8319   unsigned LoopSize = SizeVal - BytesLeft;
8320 
8321   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
8322     // Use LDR and STR to copy.
8323     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
8324     // [destOut] = STR_POST(scratch, destIn, UnitSize)
8325     unsigned srcIn = src;
8326     unsigned destIn = dest;
8327     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
8328       unsigned srcOut = MRI.createVirtualRegister(TRC);
8329       unsigned destOut = MRI.createVirtualRegister(TRC);
8330       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8331       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
8332                  IsThumb1, IsThumb2);
8333       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
8334                  IsThumb1, IsThumb2);
8335       srcIn = srcOut;
8336       destIn = destOut;
8337     }
8338 
8339     // Handle the leftover bytes with LDRB and STRB.
8340     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
8341     // [destOut] = STRB_POST(scratch, destIn, 1)
8342     for (unsigned i = 0; i < BytesLeft; i++) {
8343       unsigned srcOut = MRI.createVirtualRegister(TRC);
8344       unsigned destOut = MRI.createVirtualRegister(TRC);
8345       unsigned scratch = MRI.createVirtualRegister(TRC);
8346       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
8347                  IsThumb1, IsThumb2);
8348       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
8349                  IsThumb1, IsThumb2);
8350       srcIn = srcOut;
8351       destIn = destOut;
8352     }
8353     MI.eraseFromParent(); // The instruction is gone now.
8354     return BB;
8355   }
8356 
8357   // Expand the pseudo op to a loop.
8358   // thisMBB:
8359   //   ...
8360   //   movw varEnd, # --> with thumb2
8361   //   movt varEnd, #
8362   //   ldrcp varEnd, idx --> without thumb2
8363   //   fallthrough --> loopMBB
8364   // loopMBB:
8365   //   PHI varPhi, varEnd, varLoop
8366   //   PHI srcPhi, src, srcLoop
8367   //   PHI destPhi, dst, destLoop
8368   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
8369   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8370   //   subs varLoop, varPhi, #UnitSize
8371   //   bne loopMBB
8372   //   fallthrough --> exitMBB
8373   // exitMBB:
8374   //   epilogue to handle left-over bytes
8375   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8376   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8377   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8378   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
8379   MF->insert(It, loopMBB);
8380   MF->insert(It, exitMBB);
8381 
8382   // Transfer the remainder of BB and its successor edges to exitMBB.
8383   exitMBB->splice(exitMBB->begin(), BB,
8384                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
8385   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
8386 
8387   // Load an immediate to varEnd.
8388   unsigned varEnd = MRI.createVirtualRegister(TRC);
8389   if (Subtarget->useMovt(*MF)) {
8390     unsigned Vtmp = varEnd;
8391     if ((LoopSize & 0xFFFF0000) != 0)
8392       Vtmp = MRI.createVirtualRegister(TRC);
8393     AddDefaultPred(BuildMI(BB, dl,
8394                            TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
8395                            Vtmp).addImm(LoopSize & 0xFFFF));
8396 
8397     if ((LoopSize & 0xFFFF0000) != 0)
8398       AddDefaultPred(BuildMI(BB, dl,
8399                              TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
8400                              varEnd)
8401                          .addReg(Vtmp)
8402                          .addImm(LoopSize >> 16));
8403   } else {
8404     MachineConstantPool *ConstantPool = MF->getConstantPool();
8405     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
8406     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
8407 
8408     // MachineConstantPool wants an explicit alignment.
8409     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
8410     if (Align == 0)
8411       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
8412     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
8413 
8414     if (IsThumb)
8415       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci), varEnd)
8416                          .addConstantPoolIndex(Idx));
8417     else
8418       AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp), varEnd)
8419                          .addConstantPoolIndex(Idx).addImm(0));
8420   }
8421   BB->addSuccessor(loopMBB);
8422 
8423   // Generate the loop body:
8424   //   varPhi = PHI(varLoop, varEnd)
8425   //   srcPhi = PHI(srcLoop, src)
8426   //   destPhi = PHI(destLoop, dst)
8427   MachineBasicBlock *entryBB = BB;
8428   BB = loopMBB;
8429   unsigned varLoop = MRI.createVirtualRegister(TRC);
8430   unsigned varPhi = MRI.createVirtualRegister(TRC);
8431   unsigned srcLoop = MRI.createVirtualRegister(TRC);
8432   unsigned srcPhi = MRI.createVirtualRegister(TRC);
8433   unsigned destLoop = MRI.createVirtualRegister(TRC);
8434   unsigned destPhi = MRI.createVirtualRegister(TRC);
8435 
8436   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
8437     .addReg(varLoop).addMBB(loopMBB)
8438     .addReg(varEnd).addMBB(entryBB);
8439   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
8440     .addReg(srcLoop).addMBB(loopMBB)
8441     .addReg(src).addMBB(entryBB);
8442   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
8443     .addReg(destLoop).addMBB(loopMBB)
8444     .addReg(dest).addMBB(entryBB);
8445 
8446   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
8447   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
8448   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
8449   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
8450              IsThumb1, IsThumb2);
8451   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
8452              IsThumb1, IsThumb2);
8453 
8454   // Decrement loop variable by UnitSize.
8455   if (IsThumb1) {
8456     MachineInstrBuilder MIB =
8457         BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
8458     MIB = AddDefaultT1CC(MIB);
8459     MIB.addReg(varPhi).addImm(UnitSize);
8460     AddDefaultPred(MIB);
8461   } else {
8462     MachineInstrBuilder MIB =
8463         BuildMI(*BB, BB->end(), dl,
8464                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
8465     AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
8466     MIB->getOperand(5).setReg(ARM::CPSR);
8467     MIB->getOperand(5).setIsDef(true);
8468   }
8469   BuildMI(*BB, BB->end(), dl,
8470           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
8471       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
8472 
8473   // loopMBB can loop back to loopMBB or fall through to exitMBB.
8474   BB->addSuccessor(loopMBB);
8475   BB->addSuccessor(exitMBB);
8476 
8477   // Add epilogue to handle BytesLeft.
8478   BB = exitMBB;
8479   auto StartOfExit = exitMBB->begin();
8480 
8481   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
8482   //   [destOut] = STRB_POST(scratch, destLoop, 1)
8483   unsigned srcIn = srcLoop;
8484   unsigned destIn = destLoop;
8485   for (unsigned i = 0; i < BytesLeft; i++) {
8486     unsigned srcOut = MRI.createVirtualRegister(TRC);
8487     unsigned destOut = MRI.createVirtualRegister(TRC);
8488     unsigned scratch = MRI.createVirtualRegister(TRC);
8489     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
8490                IsThumb1, IsThumb2);
8491     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
8492                IsThumb1, IsThumb2);
8493     srcIn = srcOut;
8494     destIn = destOut;
8495   }
8496 
8497   MI.eraseFromParent(); // The instruction is gone now.
8498   return BB;
8499 }
8500 
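/// EmitLowered__chkstk - Emit the Windows stack probe: call __chkstk with the
/// number of words to allocate in R4, then subtract the returned byte count
/// from SP.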
8501 MachineBasicBlock *
8502 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
8503                                        MachineBasicBlock *MBB) const {
8504   const TargetMachine &TM = getTargetMachine();
8505   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
8506   DebugLoc DL = MI.getDebugLoc();
8507 
8508   assert(Subtarget->isTargetWindows() &&
8509          "__chkstk is only supported on Windows");
8510   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
8511 
8512   // __chkstk takes the number of words to allocate on the stack in R4, and
8513   // returns the stack adjustment in number of bytes in R4.  It will not
8514   // clobber any other registers (other than the obvious lr).
8515   //
8516   // Although, technically, IP should be considered a register which may be
8517   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
8518   // Thumb-2 environment, so there is no interworking required.  As a result, we
8519   // do not expect a veneer to be emitted by the linker, clobbering IP.
8520   //
8521   // Each module receives its own copy of __chkstk, so no import thunk is
8522   // required, again, ensuring that IP is not clobbered.
8523   //
8524   // Finally, although some linkers may theoretically provide a trampoline for
8525   // out of range calls (which is quite common due to a 32M range limitation of
8526   // branches for Thumb), we can generate the long-call version via
8527   // -mcmodel=large, alleviating the need for the trampoline which may clobber
8528   // IP.
8529 
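  // Code models that can reach __chkstk with a direct BL call it that way;
  // otherwise its address is materialized with MOVi32imm and called via BLX.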
8530   switch (TM.getCodeModel()) {
8531   case CodeModel::Small:
8532   case CodeModel::Medium:
8533   case CodeModel::Default:
8534   case CodeModel::Kernel:
8535     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
8536       .addImm((unsigned)ARMCC::AL).addReg(0)
8537       .addExternalSymbol("__chkstk")
8538       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8539       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8540       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
8541     break;
8542   case CodeModel::Large:
8543   case CodeModel::JITDefault: {
8544     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
8545     unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
8546 
8547     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
8548       .addExternalSymbol("__chkstk");
8549     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
8550       .addImm((unsigned)ARMCC::AL).addReg(0)
8551       .addReg(Reg, RegState::Kill)
8552       .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
8553       .addReg(ARM::R4, RegState::Implicit | RegState::Define)
8554       .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
8555     break;
8556   }
8557   }
8558 
8559   AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
8560                                       ARM::SP)
8561                          .addReg(ARM::SP, RegState::Kill)
8562                          .addReg(ARM::R4, RegState::Kill)
8563                          .setMIFlags(MachineInstr::FrameSetup)));
8564 
8565   MI.eraseFromParent();
8566   return MBB;
8567 }
8568 
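/// EmitLowered__dbzchk - Expand the WIN__DBZCHK pseudo: branch to a trap
/// block containing 'udf #249' when the checked register is zero, and
/// otherwise continue in a new block holding the remainder of MBB.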
8569 MachineBasicBlock *
8570 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
8571                                        MachineBasicBlock *MBB) const {
8572   DebugLoc DL = MI.getDebugLoc();
8573   MachineFunction *MF = MBB->getParent();
8574   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8575 
8576   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
8577   MF->insert(++MBB->getIterator(), ContBB);
8578   ContBB->splice(ContBB->begin(), MBB,
8579                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
8580   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
8581 
8582   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
8583   MF->push_back(TrapBB);
8584   BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249);
8585   MBB->addSuccessor(TrapBB);
8586 
8587   BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ))
8588       .addReg(MI.getOperand(0).getReg())
8589       .addMBB(TrapBB);
8590   AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::t2B)).addMBB(ContBB));
8591   MBB->addSuccessor(ContBB);
8592 
8593   MI.eraseFromParent();
8594   return ContBB;
8595 }
8596 
8597 MachineBasicBlock *
8598 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
8599                                                MachineBasicBlock *BB) const {
8600   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8601   DebugLoc dl = MI.getDebugLoc();
8602   bool isThumb2 = Subtarget->isThumb2();
8603   switch (MI.getOpcode()) {
8604   default: {
8605     MI.dump();
8606     llvm_unreachable("Unexpected instr type to insert");
8607   }
8608 
8609   // Thumb1 post-indexed loads are really just single-register LDMs.
8610   case ARM::tLDR_postidx: {
8611     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
8612       .addOperand(MI.getOperand(1)) // Rn_wb
8613       .addOperand(MI.getOperand(2)) // Rn
8614       .addOperand(MI.getOperand(3)) // PredImm
8615       .addOperand(MI.getOperand(4)) // PredReg
8616       .addOperand(MI.getOperand(0)); // Rt
8617     MI.eraseFromParent();
8618     return BB;
8619   }
8620 
8621   // The Thumb2 pre-indexed stores have the same MI operands; they are just
8622   // defined differently in the .td files from the isel patterns, so they
8623   // need pseudos.
8624   case ARM::t2STR_preidx:
8625     MI.setDesc(TII->get(ARM::t2STR_PRE));
8626     return BB;
8627   case ARM::t2STRB_preidx:
8628     MI.setDesc(TII->get(ARM::t2STRB_PRE));
8629     return BB;
8630   case ARM::t2STRH_preidx:
8631     MI.setDesc(TII->get(ARM::t2STRH_PRE));
8632     return BB;
8633 
8634   case ARM::STRi_preidx:
8635   case ARM::STRBi_preidx: {
8636     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
8637                                                          : ARM::STRB_PRE_IMM;
8638     // Decode the offset.
8639     unsigned Offset = MI.getOperand(4).getImm();
8640     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
8641     Offset = ARM_AM::getAM2Offset(Offset);
8642     if (isSub)
8643       Offset = -Offset;
8644 
8645     MachineMemOperand *MMO = *MI.memoperands_begin();
8646     BuildMI(*BB, MI, dl, TII->get(NewOpc))
8647         .addOperand(MI.getOperand(0)) // Rn_wb
8648         .addOperand(MI.getOperand(1)) // Rt
8649         .addOperand(MI.getOperand(2)) // Rn
8650         .addImm(Offset)               // offset (skip GPR==zero_reg)
8651         .addOperand(MI.getOperand(5)) // pred
8652         .addOperand(MI.getOperand(6))
8653         .addMemOperand(MMO);
8654     MI.eraseFromParent();
8655     return BB;
8656   }
8657   case ARM::STRr_preidx:
8658   case ARM::STRBr_preidx:
8659   case ARM::STRH_preidx: {
8660     unsigned NewOpc;
8661     switch (MI.getOpcode()) {
8662     default: llvm_unreachable("unexpected opcode!");
8663     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
8664     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
8665     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
8666     }
8667     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
8668     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
8669       MIB.addOperand(MI.getOperand(i));
8670     MI.eraseFromParent();
8671     return BB;
8672   }
8673 
8674   case ARM::tMOVCCr_pseudo: {
8675     // To "insert" a SELECT_CC instruction, we actually have to insert the
8676     // diamond control-flow pattern.  The incoming instruction knows the
8677     // destination vreg to set, the condition code register to branch on, the
8678     // true/false values to select between, and a branch opcode to use.
8679     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8680     MachineFunction::iterator It = ++BB->getIterator();
8681 
8682     //  thisMBB:
8683     //  ...
8684     //   TrueVal = ...
8685     //   cmpTY ccX, r1, r2
8686     //   bCC copy1MBB
8687     //   fallthrough --> copy0MBB
8688     MachineBasicBlock *thisMBB  = BB;
8689     MachineFunction *F = BB->getParent();
8690     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8691     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
8692     F->insert(It, copy0MBB);
8693     F->insert(It, sinkMBB);
8694 
8695     // Transfer the remainder of BB and its successor edges to sinkMBB.
8696     sinkMBB->splice(sinkMBB->begin(), BB,
8697                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
8698     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
8699 
8700     BB->addSuccessor(copy0MBB);
8701     BB->addSuccessor(sinkMBB);
8702 
8703     BuildMI(BB, dl, TII->get(ARM::tBcc))
8704         .addMBB(sinkMBB)
8705         .addImm(MI.getOperand(3).getImm())
8706         .addReg(MI.getOperand(4).getReg());
8707 
8708     //  copy0MBB:
8709     //   %FalseValue = ...
8710     //   # fallthrough to sinkMBB
8711     BB = copy0MBB;
8712 
8713     // Update machine-CFG edges
8714     BB->addSuccessor(sinkMBB);
8715 
8716     //  sinkMBB:
8717     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8718     //  ...
8719     BB = sinkMBB;
8720     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
8721         .addReg(MI.getOperand(1).getReg())
8722         .addMBB(copy0MBB)
8723         .addReg(MI.getOperand(2).getReg())
8724         .addMBB(thisMBB);
8725 
8726     MI.eraseFromParent(); // The pseudo instruction is gone now.
8727     return BB;
8728   }
8729 
8730   case ARM::BCCi64:
8731   case ARM::BCCZi64: {
8732     // If there is an unconditional branch to the other successor, remove it.
8733     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
8734 
8735     // Compare both parts that make up the double comparison separately for
8736     // equality.
8737     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
8738 
8739     unsigned LHS1 = MI.getOperand(1).getReg();
8740     unsigned LHS2 = MI.getOperand(2).getReg();
8741     if (RHSisZero) {
8742       AddDefaultPred(BuildMI(BB, dl,
8743                              TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8744                      .addReg(LHS1).addImm(0));
8745       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8746         .addReg(LHS2).addImm(0)
8747         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8748     } else {
8749       unsigned RHS1 = MI.getOperand(3).getReg();
8750       unsigned RHS2 = MI.getOperand(4).getReg();
8751       AddDefaultPred(BuildMI(BB, dl,
8752                              TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8753                      .addReg(LHS1).addReg(RHS1));
8754       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
8755         .addReg(LHS2).addReg(RHS2)
8756         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
8757     }
8758 
8759     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
8760     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
8761     if (MI.getOperand(0).getImm() == ARMCC::NE)
8762       std::swap(destMBB, exitMBB);
8763 
8764     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
8765       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
8766     if (isThumb2)
8767       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
8768     else
      BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);
8770 
8771     MI.eraseFromParent(); // The pseudo instruction is gone now.
8772     return BB;
8773   }
8774 
8775   case ARM::Int_eh_sjlj_setjmp:
8776   case ARM::Int_eh_sjlj_setjmp_nofp:
8777   case ARM::tInt_eh_sjlj_setjmp:
8778   case ARM::t2Int_eh_sjlj_setjmp:
8779   case ARM::t2Int_eh_sjlj_setjmp_nofp:
8780     return BB;
8781 
8782   case ARM::Int_eh_sjlj_setup_dispatch:
8783     EmitSjLjDispatchBlock(MI, BB);
8784     return BB;
8785 
8786   case ARM::ABS:
8787   case ARM::t2ABS: {
    // To insert an ABS instruction, we have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // source vreg to test against 0 and the destination vreg to set.
8793     // It transforms
8794     //     V1 = ABS V0
8795     // into
8796     //     V2 = MOVS V0
8797     //     BCC                      (branch to SinkBB if V0 >= 0)
8798     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
8799     //     SinkBB: V1 = PHI(V2, V3)
8800     const BasicBlock *LLVM_BB = BB->getBasicBlock();
8801     MachineFunction::iterator BBI = ++BB->getIterator();
8802     MachineFunction *Fn = BB->getParent();
8803     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
8804     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
8805     Fn->insert(BBI, RSBBB);
8806     Fn->insert(BBI, SinkBB);
8807 
8808     unsigned int ABSSrcReg = MI.getOperand(1).getReg();
8809     unsigned int ABSDstReg = MI.getOperand(0).getReg();
    bool ABSSrcKill = MI.getOperand(1).isKill();
8811     bool isThumb2 = Subtarget->isThumb2();
8812     MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if the source register is the SP
    // or PC, nor if the destination register is the SP, so restrict the
    // register class.
8815     unsigned NewRsbDstReg =
8816       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
8817 
    // Transfer the remainder of BB and its successor edges to SinkBB.
8819     SinkBB->splice(SinkBB->begin(), BB,
8820                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
8821     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
8822 
8823     BB->addSuccessor(RSBBB);
8824     BB->addSuccessor(SinkBB);
8825 
    // RSBBB falls through to SinkBB.
8827     RSBBB->addSuccessor(SinkBB);
8828 
8829     // insert a cmp at the end of BB
8830     AddDefaultPred(BuildMI(BB, dl,
8831                            TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
8832                    .addReg(ABSSrcReg).addImm(0));
8833 
8834     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
8835     BuildMI(BB, dl,
8836       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
8837       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
8838 
8839     // insert rsbri in RSBBB
    // Note: the BCC and RSBri will be converted into a predicated RSBMI
    // by the if-conversion pass.
8842     BuildMI(*RSBBB, RSBBB->begin(), dl,
8843       TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
      .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
8845       .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
8846 
8847     // insert PHI in SinkBB,
8848     // reuse ABSDstReg to not change uses of ABS instruction
8849     BuildMI(*SinkBB, SinkBB->begin(), dl,
8850       TII->get(ARM::PHI), ABSDstReg)
8851       .addReg(NewRsbDstReg).addMBB(RSBBB)
8852       .addReg(ABSSrcReg).addMBB(BB);
8853 
8854     // remove ABS instruction
8855     MI.eraseFromParent();
8856 
8857     // return last added BB
8858     return SinkBB;
8859   }
8860   case ARM::COPY_STRUCT_BYVAL_I32:
8861     ++NumLoopByVals;
8862     return EmitStructByval(MI, BB);
8863   case ARM::WIN__CHKSTK:
8864     return EmitLowered__chkstk(MI, BB);
8865   case ARM::WIN__DBZCHK:
8866     return EmitLowered__dbzchk(MI, BB);
8867   }
8868 }
8869 
8870 /// \brief Attaches vregs to MEMCPY that it will use as scratch registers
8871 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
8872 /// instead of as a custom inserter because we need the use list from the SDNode.
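/// For example (illustrative), a MEMCPY whose scratch-count immediate is four
/// gets four extra dead virtual-register defs appended here, so the later
/// LDM/STM expansion has registers to load into.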
8873 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
8874                                     MachineInstr &MI, const SDNode *Node) {
8875   bool isThumb1 = Subtarget->isThumb1Only();
8876 
8877   DebugLoc DL = MI.getDebugLoc();
8878   MachineFunction *MF = MI.getParent()->getParent();
8879   MachineRegisterInfo &MRI = MF->getRegInfo();
8880   MachineInstrBuilder MIB(*MF, MI);
8881 
8882   // If the new dst/src is unused mark it as dead.
8883   if (!Node->hasAnyUseOfValue(0)) {
8884     MI.getOperand(0).setIsDead(true);
8885   }
8886   if (!Node->hasAnyUseOfValue(1)) {
8887     MI.getOperand(1).setIsDead(true);
8888   }
8889 
8890   // The MEMCPY both defines and kills the scratch registers.
8891   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
8892     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
8893                                                          : &ARM::GPRRegClass);
8894     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
8895   }
8896 }
8897 
8898 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8899                                                       SDNode *Node) const {
8900   if (MI.getOpcode() == ARM::MEMCPY) {
8901     attachMEMCPYScratchRegs(Subtarget, MI, Node);
8902     return;
8903   }
8904 
8905   const MCInstrDesc *MCID = &MI.getDesc();
8906   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
8907   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
8908   // operand is still set to noreg. If needed, set the optional operand's
8909   // register to CPSR, and remove the redundant implicit def.
8910   //
8911   // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
8912 
8913   // Rename pseudo opcodes.
8914   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
8915   if (NewOpc) {
8916     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
8917     MCID = &TII->get(NewOpc);
8918 
8919     assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
8920            "converted opcode should be the same except for cc_out");
8921 
8922     MI.setDesc(*MCID);
8923 
8924     // Add the optional cc_out operand
8925     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
8926   }
8927   unsigned ccOutIdx = MCID->getNumOperands() - 1;
8928 
8929   // Any ARM instruction that sets the 's' bit should specify an optional
8930   // "cc_out" operand in the last operand position.
8931   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
8932     assert(!NewOpc && "Optional cc_out operand required");
8933     return;
8934   }
8935   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
8936   // since we already have an optional CPSR def.
8937   bool definesCPSR = false;
8938   bool deadCPSR = false;
8939   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
8940        ++i) {
8941     const MachineOperand &MO = MI.getOperand(i);
8942     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
8943       definesCPSR = true;
8944       if (MO.isDead())
8945         deadCPSR = true;
8946       MI.RemoveOperand(i);
8947       break;
8948     }
8949   }
8950   if (!definesCPSR) {
8951     assert(!NewOpc && "Optional cc_out operand required");
8952     return;
8953   }
8954   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
8955   if (deadCPSR) {
8956     assert(!MI.getOperand(ccOutIdx).getReg() &&
8957            "expect uninitialized optional cc_out operand");
8958     return;
8959   }
8960 
8961   // If this instruction was defined with an optional CPSR def and its dag node
8962   // had a live implicit CPSR def, then activate the optional CPSR def.
8963   MachineOperand &MO = MI.getOperand(ccOutIdx);
8964   MO.setReg(ARM::CPSR);
8965   MO.setIsDef(true);
8966 }
8967 
8968 //===----------------------------------------------------------------------===//
8969 //                           ARM Optimization Hooks
8970 //===----------------------------------------------------------------------===//
8971 
8972 // Helper function that checks if N is a null or all ones constant.
8973 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
8974   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
8975 }
8976 
8977 // Return true if N is conditionally 0 or all ones.
8978 // Detects these expressions where cc is an i1 value:
8979 //
8980 //   (select cc 0, y)   [AllOnes=0]
8981 //   (select cc y, 0)   [AllOnes=0]
8982 //   (zext cc)          [AllOnes=0]
8983 //   (sext cc)          [AllOnes=0/1]
8984 //   (select cc -1, y)  [AllOnes=1]
8985 //   (select cc y, -1)  [AllOnes=1]
8986 //
8987 // Invert is set when N is the null/all ones constant when CC is false.
8988 // OtherOp is set to the alternative value of N.
8989 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
8990                                        SDValue &CC, bool &Invert,
8991                                        SDValue &OtherOp,
8992                                        SelectionDAG &DAG) {
8993   switch (N->getOpcode()) {
8994   default: return false;
8995   case ISD::SELECT: {
8996     CC = N->getOperand(0);
8997     SDValue N1 = N->getOperand(1);
8998     SDValue N2 = N->getOperand(2);
8999     if (isZeroOrAllOnes(N1, AllOnes)) {
9000       Invert = false;
9001       OtherOp = N2;
9002       return true;
9003     }
9004     if (isZeroOrAllOnes(N2, AllOnes)) {
9005       Invert = true;
9006       OtherOp = N1;
9007       return true;
9008     }
9009     return false;
9010   }
9011   case ISD::ZERO_EXTEND:
9012     // (zext cc) can never be the all ones value.
9013     if (AllOnes)
9014       return false;
9015     LLVM_FALLTHROUGH;
9016   case ISD::SIGN_EXTEND: {
9017     SDLoc dl(N);
9018     EVT VT = N->getValueType(0);
9019     CC = N->getOperand(0);
9020     if (CC.getValueType() != MVT::i1)
9021       return false;
9022     Invert = !AllOnes;
9023     if (AllOnes)
9024       // When looking for an AllOnes constant, N is an sext, and the 'other'
9025       // value is 0.
9026       OtherOp = DAG.getConstant(0, dl, VT);
9027     else if (N->getOpcode() == ISD::ZERO_EXTEND)
9028       // When looking for a 0 constant, N can be zext or sext.
9029       OtherOp = DAG.getConstant(1, dl, VT);
9030     else
9031       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
9032                                 VT);
9033     return true;
9034   }
9035   }
9036 }
9037 
9038 // Combine a constant select operand into its use:
9039 //
9040 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
9041 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
9042 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
9043 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
9044 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
9045 //
9046 // The transform is rejected if the select doesn't have a constant operand that
9047 // is null, or all ones when AllOnes is set.
9048 //
9049 // Also recognize sext/zext from i1:
9050 //
9051 //   (add (zext cc), x) -> (select cc (add x, 1), x)
9052 //   (add (sext cc), x) -> (select cc (add x, -1), x)
9053 //
9054 // These transformations eventually create predicated instructions.
9055 //
9056 // @param N       The node to transform.
9057 // @param Slct    The N operand that is a select.
9058 // @param OtherOp The other N operand (x above).
9059 // @param DCI     Context.
9060 // @param AllOnes Require the select constant to be all ones instead of null.
9061 // @returns The new node, or SDValue() on failure.
9062 static
9063 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
9064                             TargetLowering::DAGCombinerInfo &DCI,
9065                             bool AllOnes = false) {
9066   SelectionDAG &DAG = DCI.DAG;
9067   EVT VT = N->getValueType(0);
9068   SDValue NonConstantVal;
9069   SDValue CCOp;
9070   bool SwapSelectOps;
9071   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
9072                                   NonConstantVal, DAG))
9073     return SDValue();
9074 
  // Slct is now known to be the desired identity constant when CC is true.
9076   SDValue TrueVal = OtherOp;
9077   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
9078                                  OtherOp, NonConstantVal);
9079   // Unless SwapSelectOps says CC should be false.
9080   if (SwapSelectOps)
9081     std::swap(TrueVal, FalseVal);
9082 
9083   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
9084                      CCOp, TrueVal, FalseVal);
9085 }
9086 
9087 // Attempt combineSelectAndUse on each operand of a commutative operator N.
9088 static
9089 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
9090                                        TargetLowering::DAGCombinerInfo &DCI) {
9091   SDValue N0 = N->getOperand(0);
9092   SDValue N1 = N->getOperand(1);
9093   if (N0.getNode()->hasOneUse())
9094     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
9095       return Result;
9096   if (N1.getNode()->hasOneUse())
9097     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
9098       return Result;
9099   return SDValue();
9100 }
9101 
// AddCombineToVPADDL - For pair-wise add on NEON, use the vpaddl instruction
// (only after legalization).
9104 static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9105                                  TargetLowering::DAGCombinerInfo &DCI,
9106                                  const ARMSubtarget *Subtarget) {
9107 
  // Only perform the optimization after legalization and if NEON is available.
  // We also expect both operands to be BUILD_VECTORs.
9110   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
9111       || N0.getOpcode() != ISD::BUILD_VECTOR
9112       || N1.getOpcode() != ISD::BUILD_VECTOR)
9113     return SDValue();
9114 
9115   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
9116   EVT VT = N->getValueType(0);
9117   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
9118     return SDValue();
9119 
  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT operands, where
  // N is the number of elements in the formed vector.
  // Each EXTRACT_VECTOR_ELT should reference the same input vector, with N0
  // taking the even indices and N1 the odd indices, so that together they
  // form a pair-wise add pattern.
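  // For example (illustrative), a pair-wise add of adjacent lanes of Vec would
  // look like:
  //   N0 = BUILD_VECTOR (extract_elt Vec, 0), (extract_elt Vec, 2), ...
  //   N1 = BUILD_VECTOR (extract_elt Vec, 1), (extract_elt Vec, 3), ...
  // so that N0 + N1 can become vpaddl(Vec).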
9125 
9126   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9127   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9128     return SDValue();
9129   SDValue Vec = N0->getOperand(0)->getOperand(0);
9130   SDNode *V = Vec.getNode();
9131   unsigned nextIndex = 0;
9132 
  // For each operand of the ADD (both are BUILD_VECTORs), check that each of
  // its operands is an EXTRACT_VECTOR_ELT of the same vector with the
  // appropriate index.
9136   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9137     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9138         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9139 
9140       SDValue ExtVec0 = N0->getOperand(i);
9141       SDValue ExtVec1 = N1->getOperand(i);
9142 
      // First operand is the vector; verify it's the same.
9144       if (V != ExtVec0->getOperand(0).getNode() ||
9145           V != ExtVec1->getOperand(0).getNode())
9146         return SDValue();
9147 
      // Second operand is the index constant; verify it's correct.
9149       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9150       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9151 
      // N0 must supply the next even index and N1 the odd index right after it.
9153       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9154           || C1->getZExtValue() != nextIndex+1)
9155         return SDValue();
9156 
9157       // Increment index.
9158       nextIndex+=2;
9159     } else
9160       return SDValue();
9161   }
9162 
9163   // Create VPADDL node.
9164   SelectionDAG &DAG = DCI.DAG;
9165   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9166 
9167   SDLoc dl(N);
9168 
9169   // Build operand list.
9170   SmallVector<SDValue, 8> Ops;
9171   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
9172                                 TLI.getPointerTy(DAG.getDataLayout())));
9173 
9174   // Input is the vector.
9175   Ops.push_back(Vec);
9176 
9177   // Get widened type and narrowed type.
9178   MVT widenType;
9179   unsigned numElem = VT.getVectorNumElements();
9180 
9181   EVT inputLaneType = Vec.getValueType().getVectorElementType();
9182   switch (inputLaneType.getSimpleVT().SimpleTy) {
9183     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
9184     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
9185     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
9186     default:
9187       llvm_unreachable("Invalid vector element type for padd optimization.");
9188   }
9189 
9190   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
9191   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
9192   return DAG.getNode(ExtOp, dl, VT, tmp);
9193 }
9194 
9195 static SDValue findMUL_LOHI(SDValue V) {
9196   if (V->getOpcode() == ISD::UMUL_LOHI ||
9197       V->getOpcode() == ISD::SMUL_LOHI)
9198     return V;
9199   return SDValue();
9200 }
9201 
9202 static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
9203                                      TargetLowering::DAGCombinerInfo &DCI,
9204                                      const ARMSubtarget *Subtarget) {
9205 
  // Look for multiply add opportunities.
  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // an S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                /          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :glue  /
  //                  \      /
  //                    ADDE   <- hiAdd
  //
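  // For example (illustrative), a 64-bit multiply-accumulate such as
  //   (i64)a * (i64)b + acc
  // is legalized into a UMUL_LOHI plus an ADDC/ADDE pair adding the two halves
  // of acc, and this combine folds those nodes into a single UMLAL.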
9220   assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
9221   SDValue AddcOp0 = AddcNode->getOperand(0);
9222   SDValue AddcOp1 = AddcNode->getOperand(1);
9223 
9224   // Check if the two operands are from the same mul_lohi node.
9225   if (AddcOp0.getNode() == AddcOp1.getNode())
9226     return SDValue();
9227 
9228   assert(AddcNode->getNumValues() == 2 &&
9229          AddcNode->getValueType(0) == MVT::i32 &&
9230          "Expect ADDC with two result values. First: i32");
9231 
9232   // Check that we have a glued ADDC node.
9233   if (AddcNode->getValueType(1) != MVT::Glue)
9234     return SDValue();
9235 
9236   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
9237   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
9238       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
9239       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
9240       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
9241     return SDValue();
9242 
9243   // Look for the glued ADDE.
9244   SDNode* AddeNode = AddcNode->getGluedUser();
9245   if (!AddeNode)
9246     return SDValue();
9247 
9248   // Make sure it is really an ADDE.
9249   if (AddeNode->getOpcode() != ISD::ADDE)
9250     return SDValue();
9251 
9252   assert(AddeNode->getNumOperands() == 3 &&
9253          AddeNode->getOperand(2).getValueType() == MVT::Glue &&
9254          "ADDE node has the wrong inputs");
9255 
9256   // Check for the triangle shape.
9257   SDValue AddeOp0 = AddeNode->getOperand(0);
9258   SDValue AddeOp1 = AddeNode->getOperand(1);
9259 
9260   // Make sure that the ADDE operands are not coming from the same node.
9261   if (AddeOp0.getNode() == AddeOp1.getNode())
9262     return SDValue();
9263 
9264   // Find the MUL_LOHI node walking up ADDE's operands.
9265   bool IsLeftOperandMUL = false;
9266   SDValue MULOp = findMUL_LOHI(AddeOp0);
9267   if (MULOp == SDValue())
9268    MULOp = findMUL_LOHI(AddeOp1);
9269   else
9270     IsLeftOperandMUL = true;
9271   if (MULOp == SDValue())
9272     return SDValue();
9273 
9274   // Figure out the right opcode.
9275   unsigned Opc = MULOp->getOpcode();
9276   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
9277 
9278   // Figure out the high and low input values to the MLAL node.
9279   SDValue* HiAdd = nullptr;
9280   SDValue* LoMul = nullptr;
9281   SDValue* LowAdd = nullptr;
9282 
  // Ensure that the ADDE uses the high result of the MUL_LOHI node.
9284   if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
9285     return SDValue();
9286 
9287   if (IsLeftOperandMUL)
9288     HiAdd = &AddeOp1;
9289   else
9290     HiAdd = &AddeOp0;
9291 
9292 
  // Ensure that LoMul and LowAdd are taken from the correct ISD::SMUL_LOHI
  // node, whose low result is fed to the ADDC we are checking.
9295 
9296   if (AddcOp0 == MULOp.getValue(0)) {
9297     LoMul = &AddcOp0;
9298     LowAdd = &AddcOp1;
9299   }
9300   if (AddcOp1 == MULOp.getValue(0)) {
9301     LoMul = &AddcOp1;
9302     LowAdd = &AddcOp0;
9303   }
9304 
9305   if (!LoMul)
9306     return SDValue();
9307 
9308   // Create the merged node.
9309   SelectionDAG &DAG = DCI.DAG;
9310 
9311   // Build operand list.
9312   SmallVector<SDValue, 8> Ops;
9313   Ops.push_back(LoMul->getOperand(0));
9314   Ops.push_back(LoMul->getOperand(1));
9315   Ops.push_back(*LowAdd);
9316   Ops.push_back(*HiAdd);
9317 
9318   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
9319                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9320 
  // Replace the ADD nodes' uses with the MLAL node's values.
9322   SDValue HiMLALResult(MLALNode.getNode(), 1);
9323   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
9324 
9325   SDValue LoMLALResult(MLALNode.getNode(), 0);
9326   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
9327 
9328   // Return original node to notify the driver to stop replacing.
9329   SDValue resNode(AddcNode, 0);
9330   return resNode;
9331 }
9332 
9333 static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
9334                                       TargetLowering::DAGCombinerInfo &DCI,
9335                                       const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses another addc node which can first
  // be combined into a UMLAL. The other pattern, where AddcNode is first
  // combined into a UMLAL and then used by another addc, is handled in
  // ISelDAGToDAG.
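  // As a reminder (illustrative), UMAAL computes
  //   RdHi:RdLo = Rn * Rm + RdLo + RdHi
  // i.e. an unsigned 64-bit multiply plus two independent 32-bit addends,
  // which is why a second ADDC feeding this one can be absorbed.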
9341 
9342   if (!Subtarget->hasV6Ops() ||
9343       (Subtarget->isThumb() && !Subtarget->hasThumb2()))
9344     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9345 
9346   SDNode *PrevAddc = nullptr;
9347   if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
9348     PrevAddc = AddcNode->getOperand(0).getNode();
9349   else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
9350     PrevAddc = AddcNode->getOperand(1).getNode();
9351 
  // If there is no addc chain, just search for any MLAL instead.
9353   if (PrevAddc == nullptr)
9354     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9355 
9356   // Try to convert the addc operand to an MLAL and if that fails try to
9357   // combine AddcNode.
9358   SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
9359   if (MLAL != SDValue(PrevAddc, 0))
9360     return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
9361 
9362   // Find the converted UMAAL or quit if it doesn't exist.
9363   SDNode *UmlalNode = nullptr;
9364   SDValue AddHi;
9365   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
9366     UmlalNode = AddcNode->getOperand(0).getNode();
9367     AddHi = AddcNode->getOperand(1);
9368   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
9369     UmlalNode = AddcNode->getOperand(1).getNode();
9370     AddHi = AddcNode->getOperand(0);
9371   } else {
9372     return SDValue();
9373   }
9374 
9375   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
9376   // the ADDC as well as Zero.
9377   auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
9378 
9379   if (!Zero || Zero->getZExtValue() != 0)
9380     return SDValue();
9381 
9382   // Check that we have a glued ADDC node.
9383   if (AddcNode->getValueType(1) != MVT::Glue)
9384     return SDValue();
9385 
9386   // Look for the glued ADDE.
9387   SDNode* AddeNode = AddcNode->getGluedUser();
9388   if (!AddeNode)
9389     return SDValue();
9390 
9391   if ((AddeNode->getOperand(0).getNode() == Zero &&
9392        AddeNode->getOperand(1).getNode() == UmlalNode) ||
9393       (AddeNode->getOperand(0).getNode() == UmlalNode &&
9394        AddeNode->getOperand(1).getNode() == Zero)) {
9395 
9396     SelectionDAG &DAG = DCI.DAG;
9397     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
9398                       UmlalNode->getOperand(2), AddHi };
9399     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
9400                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
9401 
    // Replace the ADD nodes' uses with the UMAAL node's values.
9403     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
9404     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
9405 
9406     // Return original node to notify the driver to stop replacing.
9407     return SDValue(AddcNode, 0);
9408   }
9409   return SDValue();
9410 }
9411 
/// PerformADDCCombine - Target-specific dag combine transform from
/// ISD::ADDC, ISD::ADDE, and ISD::U/SMUL_LOHI to ARMISD::U/SMLAL, or from
/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL.
9415 static SDValue PerformADDCCombine(SDNode *N,
9416                                  TargetLowering::DAGCombinerInfo &DCI,
9417                                  const ARMSubtarget *Subtarget) {
9418 
9419   if (Subtarget->isThumb1Only()) return SDValue();
9420 
9421   // Only perform the checks after legalize when the pattern is available.
9422   if (DCI.isBeforeLegalize()) return SDValue();
9423 
9424   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
9425 }
9426 
9427 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
9428 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
9429 /// called with the default operands, and if that fails, with commuted
9430 /// operands.
9431 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
9432                                           TargetLowering::DAGCombinerInfo &DCI,
9433                                           const ARMSubtarget *Subtarget){
9434 
9435   // Attempt to create vpaddl for this add.
9436   if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
9437     return Result;
9438 
9439   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
9440   if (N0.getNode()->hasOneUse())
9441     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
9442       return Result;
9443   return SDValue();
9444 }
9445 
9446 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
9447 ///
9448 static SDValue PerformADDCombine(SDNode *N,
9449                                  TargetLowering::DAGCombinerInfo &DCI,
9450                                  const ARMSubtarget *Subtarget) {
9451   SDValue N0 = N->getOperand(0);
9452   SDValue N1 = N->getOperand(1);
9453 
9454   // First try with the default operand order.
9455   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
9456     return Result;
9457 
9458   // If that didn't work, try again with the operands commuted.
9459   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
9460 }
9461 
9462 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
9463 ///
9464 static SDValue PerformSUBCombine(SDNode *N,
9465                                  TargetLowering::DAGCombinerInfo &DCI) {
9466   SDValue N0 = N->getOperand(0);
9467   SDValue N1 = N->getOperand(1);
9468 
9469   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
9470   if (N1.getNode()->hasOneUse())
9471     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
9472       return Result;
9473 
9474   return SDValue();
9475 }
9476 
9477 /// PerformVMULCombine
9478 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
9479 /// special multiplier accumulator forwarding.
9480 ///   vmul d3, d0, d2
9481 ///   vmla d3, d1, d2
9482 /// is faster than
9483 ///   vadd d3, d0, d1
9484 ///   vmul d3, d3, d2
9485 //  However, for (A + B) * (A + B),
9486 //    vadd d2, d0, d1
9487 //    vmul d3, d0, d2
9488 //    vmla d3, d1, d2
9489 //  is slower than
9490 //    vadd d2, d0, d1
9491 //    vmul d3, d2, d2
9492 static SDValue PerformVMULCombine(SDNode *N,
9493                                   TargetLowering::DAGCombinerInfo &DCI,
9494                                   const ARMSubtarget *Subtarget) {
9495   if (!Subtarget->hasVMLxForwarding())
9496     return SDValue();
9497 
9498   SelectionDAG &DAG = DCI.DAG;
9499   SDValue N0 = N->getOperand(0);
9500   SDValue N1 = N->getOperand(1);
9501   unsigned Opcode = N0.getOpcode();
9502   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9503       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
9504     Opcode = N1.getOpcode();
9505     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
9506         Opcode != ISD::FADD && Opcode != ISD::FSUB)
9507       return SDValue();
9508     std::swap(N0, N1);
9509   }
9510 
9511   if (N0 == N1)
9512     return SDValue();
9513 
9514   EVT VT = N->getValueType(0);
9515   SDLoc DL(N);
9516   SDValue N00 = N0->getOperand(0);
9517   SDValue N01 = N0->getOperand(1);
9518   return DAG.getNode(Opcode, DL, VT,
9519                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
9520                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
9521 }
9522 
9523 static SDValue PerformMULCombine(SDNode *N,
9524                                  TargetLowering::DAGCombinerInfo &DCI,
9525                                  const ARMSubtarget *Subtarget) {
9526   SelectionDAG &DAG = DCI.DAG;
9527 
9528   if (Subtarget->isThumb1Only())
9529     return SDValue();
9530 
9531   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9532     return SDValue();
9533 
9534   EVT VT = N->getValueType(0);
9535   if (VT.is64BitVector() || VT.is128BitVector())
9536     return PerformVMULCombine(N, DCI, Subtarget);
9537   if (VT != MVT::i32)
9538     return SDValue();
9539 
9540   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9541   if (!C)
9542     return SDValue();
9543 
9544   int64_t MulAmt = C->getSExtValue();
9545   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
9546 
9547   ShiftAmt = ShiftAmt & (32 - 1);
9548   SDValue V = N->getOperand(0);
9549   SDLoc DL(N);
9550 
9551   SDValue Res;
9552   MulAmt >>= ShiftAmt;
9553 
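  // Illustrative example: mul x, 40 has MulAmt = 40 = 8 * 5, so ShiftAmt = 3
  // and the remaining multiplier 5 = 2^2 + 1 is matched below, giving
  //   (shl (add x, (shl x, 2)), 3).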
9554   if (MulAmt >= 0) {
9555     if (isPowerOf2_32(MulAmt - 1)) {
9556       // (mul x, 2^N + 1) => (add (shl x, N), x)
9557       Res = DAG.getNode(ISD::ADD, DL, VT,
9558                         V,
9559                         DAG.getNode(ISD::SHL, DL, VT,
9560                                     V,
9561                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
9562                                                     MVT::i32)));
9563     } else if (isPowerOf2_32(MulAmt + 1)) {
9564       // (mul x, 2^N - 1) => (sub (shl x, N), x)
9565       Res = DAG.getNode(ISD::SUB, DL, VT,
9566                         DAG.getNode(ISD::SHL, DL, VT,
9567                                     V,
9568                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
9569                                                     MVT::i32)),
9570                         V);
9571     } else
9572       return SDValue();
9573   } else {
9574     uint64_t MulAmtAbs = -MulAmt;
9575     if (isPowerOf2_32(MulAmtAbs + 1)) {
9576       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
9577       Res = DAG.getNode(ISD::SUB, DL, VT,
9578                         V,
9579                         DAG.getNode(ISD::SHL, DL, VT,
9580                                     V,
9581                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
9582                                                     MVT::i32)));
9583     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
9584       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
9585       Res = DAG.getNode(ISD::ADD, DL, VT,
9586                         V,
9587                         DAG.getNode(ISD::SHL, DL, VT,
9588                                     V,
9589                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
9590                                                     MVT::i32)));
9591       Res = DAG.getNode(ISD::SUB, DL, VT,
9592                         DAG.getConstant(0, DL, MVT::i32), Res);
9593 
9594     } else
9595       return SDValue();
9596   }
9597 
9598   if (ShiftAmt != 0)
9599     Res = DAG.getNode(ISD::SHL, DL, VT,
9600                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
9601 
9602   // Do not add new nodes to DAG combiner worklist.
9603   DCI.CombineTo(N, Res, false);
9604   return SDValue();
9605 }
9606 
9607 static SDValue PerformANDCombine(SDNode *N,
9608                                  TargetLowering::DAGCombinerInfo &DCI,
9609                                  const ARMSubtarget *Subtarget) {
9610 
9611   // Attempt to use immediate-form VBIC
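  // e.g. (illustrative) (and x, (vsplat C)) becomes (VBICIMM x, ~C) when ~C is
  // encodable as a NEON modified immediate.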
9612   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9613   SDLoc dl(N);
9614   EVT VT = N->getValueType(0);
9615   SelectionDAG &DAG = DCI.DAG;
9616 
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9618     return SDValue();
9619 
9620   APInt SplatBits, SplatUndef;
9621   unsigned SplatBitSize;
9622   bool HasAnyUndefs;
9623   if (BVN &&
9624       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9625     if (SplatBitSize <= 64) {
9626       EVT VbicVT;
9627       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
9628                                       SplatUndef.getZExtValue(), SplatBitSize,
9629                                       DAG, dl, VbicVT, VT.is128BitVector(),
9630                                       OtherModImm);
9631       if (Val.getNode()) {
9632         SDValue Input =
9633           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
9634         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
9635         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
9636       }
9637     }
9638   }
9639 
9640   if (!Subtarget->isThumb1Only()) {
9641     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
9642     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
9643       return Result;
9644   }
9645 
9646   return SDValue();
9647 }
9648 
9649 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
9650 static SDValue PerformORCombine(SDNode *N,
9651                                 TargetLowering::DAGCombinerInfo &DCI,
9652                                 const ARMSubtarget *Subtarget) {
9653   // Attempt to use immediate-form VORR
9654   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
9655   SDLoc dl(N);
9656   EVT VT = N->getValueType(0);
9657   SelectionDAG &DAG = DCI.DAG;
9658 
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9660     return SDValue();
9661 
9662   APInt SplatBits, SplatUndef;
9663   unsigned SplatBitSize;
9664   bool HasAnyUndefs;
9665   if (BVN && Subtarget->hasNEON() &&
9666       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9667     if (SplatBitSize <= 64) {
9668       EVT VorrVT;
9669       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
9670                                       SplatUndef.getZExtValue(), SplatBitSize,
9671                                       DAG, dl, VorrVT, VT.is128BitVector(),
9672                                       OtherModImm);
9673       if (Val.getNode()) {
9674         SDValue Input =
9675           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
9676         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
9677         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
9678       }
9679     }
9680   }
9681 
9682   if (!Subtarget->isThumb1Only()) {
9683     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
9684     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
9685       return Result;
9686   }
9687 
9688   // The code below optimizes (or (and X, Y), Z).
9689   // The AND operand needs to have a single user to make these optimizations
9690   // profitable.
9691   SDValue N0 = N->getOperand(0);
9692   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
9693     return SDValue();
9694   SDValue N1 = N->getOperand(1);
9695 
9696   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
9697   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
9698       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
9699     APInt SplatUndef;
9700     unsigned SplatBitSize;
9701     bool HasAnyUndefs;
9702 
9703     APInt SplatBits0, SplatBits1;
9704     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
9705     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operands of both ANDs are constant splats.
9707     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
9708                                       HasAnyUndefs) && !HasAnyUndefs) {
9709         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
9710                                           HasAnyUndefs) && !HasAnyUndefs) {
9711             // Ensure that the bit width of the constants are the same and that
9712             // the splat arguments are logical inverses as per the pattern we
9713             // are trying to simplify.
9714             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
9715                 SplatBits0 == ~SplatBits1) {
9716                 // Canonicalize the vector type to make instruction selection
9717                 // simpler.
9718                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
9719                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
9720                                              N0->getOperand(1),
9721                                              N0->getOperand(0),
9722                                              N1->getOperand(0));
9723                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
9724             }
9725         }
9726     }
9727   }
9728 
9729   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
9730   // reasonable.
9731 
9732   // BFI is only available on V6T2+
9733   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
9734     return SDValue();
9735 
9736   SDLoc DL(N);
9737   // 1) or (and A, mask), val => ARMbfi A, val, mask
9738   //      iff (val & mask) == val
9739   //
9740   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
9741   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
9742   //          && mask == ~mask2
9743   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
9744   //          && ~mask == mask2
9745   //  (i.e., copy a bitfield value into another bitfield of the same width)
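  // For example (illustrative), case (1) turns
  //   (or (and A, 0xffff00ff), 0x00001200)
  // into (ARMbfi A, 0x12, 0xffff00ff), i.e. insert 0x12 into bits 15:8 of A.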
9746 
9747   if (VT != MVT::i32)
9748     return SDValue();
9749 
9750   SDValue N00 = N0.getOperand(0);
9751 
9752   // The value and the mask need to be constants so we can verify this is
9753   // actually a bitfield set. If the mask is 0xffff, we can do better
9754   // via a movt instruction, so don't use BFI in that case.
9755   SDValue MaskOp = N0.getOperand(1);
9756   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
9757   if (!MaskC)
9758     return SDValue();
9759   unsigned Mask = MaskC->getZExtValue();
9760   if (Mask == 0xffff)
9761     return SDValue();
9762   SDValue Res;
9763   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
9764   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9765   if (N1C) {
9766     unsigned Val = N1C->getZExtValue();
9767     if ((Val & ~Mask) != Val)
9768       return SDValue();
9769 
9770     if (ARM::isBitFieldInvertedMask(Mask)) {
9771       Val >>= countTrailingZeros(~Mask);
9772 
9773       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
9774                         DAG.getConstant(Val, DL, MVT::i32),
9775                         DAG.getConstant(Mask, DL, MVT::i32));
9776 
9777       // Do not add new nodes to DAG combiner worklist.
9778       DCI.CombineTo(N, Res, false);
9779       return SDValue();
9780     }
9781   } else if (N1.getOpcode() == ISD::AND) {
9782     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
9783     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
9784     if (!N11C)
9785       return SDValue();
9786     unsigned Mask2 = N11C->getZExtValue();
9787 
    // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern
    // to match as-is.
9790     if (ARM::isBitFieldInvertedMask(Mask) &&
9791         (Mask == ~Mask2)) {
9792       // The pack halfword instruction works better for masks that fit it,
9793       // so use that when it's available.
9794       if (Subtarget->hasT2ExtractPack() &&
9795           (Mask == 0xffff || Mask == 0xffff0000))
9796         return SDValue();
9797       // 2a
9798       unsigned amt = countTrailingZeros(Mask2);
9799       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
9800                         DAG.getConstant(amt, DL, MVT::i32));
9801       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
9802                         DAG.getConstant(Mask, DL, MVT::i32));
9803       // Do not add new nodes to DAG combiner worklist.
9804       DCI.CombineTo(N, Res, false);
9805       return SDValue();
9806     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
9807                (~Mask == Mask2)) {
9808       // The pack halfword instruction works better for masks that fit it,
9809       // so use that when it's available.
9810       if (Subtarget->hasT2ExtractPack() &&
9811           (Mask2 == 0xffff || Mask2 == 0xffff0000))
9812         return SDValue();
9813       // 2b
9814       unsigned lsb = countTrailingZeros(Mask);
9815       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
9816                         DAG.getConstant(lsb, DL, MVT::i32));
9817       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
9818                         DAG.getConstant(Mask2, DL, MVT::i32));
9819       // Do not add new nodes to DAG combiner worklist.
9820       DCI.CombineTo(N, Res, false);
9821       return SDValue();
9822     }
9823   }
9824 
9825   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
9826       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
9827       ARM::isBitFieldInvertedMask(~Mask)) {
9828     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
9829     // where lsb(mask) == #shamt and masked bits of B are known zero.
9830     SDValue ShAmt = N00.getOperand(1);
9831     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
9832     unsigned LSB = countTrailingZeros(Mask);
9833     if (ShAmtC != LSB)
9834       return SDValue();
9835 
9836     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
9837                       DAG.getConstant(~Mask, DL, MVT::i32));
9838 
9839     // Do not add new nodes to DAG combiner worklist.
9840     DCI.CombineTo(N, Res, false);
9841   }
9842 
9843   return SDValue();
9844 }
9845 
9846 static SDValue PerformXORCombine(SDNode *N,
9847                                  TargetLowering::DAGCombinerInfo &DCI,
9848                                  const ARMSubtarget *Subtarget) {
9849   EVT VT = N->getValueType(0);
9850   SelectionDAG &DAG = DCI.DAG;
9851 
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
9853     return SDValue();
9854 
9855   if (!Subtarget->isThumb1Only()) {
9856     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
9857     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
9858       return Result;
9859   }
9860 
9861   return SDValue();
9862 }
9863 
// ParseBFI - Given a BFI instruction in N, extract the "from" value (Rn) and
// return it, and fill in FromMask and ToMask with the (consecutive) bits in
// "from" to be extracted and their positions in "to" (Rd).
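// For example (illustrative), for a BFI inserting a 4-bit field at bit 8,
// ToMask would be 0x00000f00 (the bits written in Rd) and FromMask 0x0000000f
// (the bits read from Rn), with FromMask shifted up if Rn itself came from a
// SRL (see below).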
9867 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
9868   assert(N->getOpcode() == ARMISD::BFI);
9869 
9870   SDValue From = N->getOperand(1);
9871   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
9872   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
9873 
  // If the From value came from a SRL #C, the bits really being extracted
  // start at bit #C of the SRL's operand, so shift FromMask up accordingly.
9876   if (From->getOpcode() == ISD::SRL &&
9877       isa<ConstantSDNode>(From->getOperand(1))) {
9878     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
9879     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
9880     FromMask <<= Shift.getLimitedValue(31);
9881     From = From->getOperand(0);
9882   }
9883 
9884   return From;
9885 }
9886 
// Assuming A and B each contain one contiguous set of bits, does A | B form a
// single contiguous range, with A's bits sitting directly above B's (i.e. does
// A | B equal the concatenation A:B)?
//
// Neither A nor B may be zero.
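// For example (illustrative), A = 0b1100 and B = 0b0011 concatenate properly
// (A's lowest set bit sits directly above B's highest set bit), whereas
// A = 0b0011 and B = 0b1100 do not.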
9890 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
9891   unsigned LastActiveBitInA =  A.countTrailingZeros();
9892   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
9893   return LastActiveBitInA - 1 == FirstActiveBitInB;
9894 }
9895 
9896 static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can
  // combine with, if one exists.
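  // For example (illustrative), in a chain
  //   BFI(BFI(Base, X, maskA), (srl X, #8), maskB)
  // where the two inserted fields are adjacent both in X and in the
  // destination, the two BFIs can be merged into a single, wider BFI.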
9899   APInt ToMask, FromMask;
9900   SDValue From = ParseBFI(N, ToMask, FromMask);
9901   SDValue To = N->getOperand(0);
9902 
9903   // Now check for a compatible BFI to merge with. We can pass through BFIs that
9904   // aren't compatible, but not if they set the same bit in their destination as
9905   // we do (or that of any BFI we're going to combine with).
9906   SDValue V = To;
9907   APInt CombinedToMask = ToMask;
9908   while (V.getOpcode() == ARMISD::BFI) {
9909     APInt NewToMask, NewFromMask;
9910     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
9911     if (NewFrom != From) {
9912       // This BFI has a different base. Keep going.
9913       CombinedToMask |= NewToMask;
9914       V = V.getOperand(0);
9915       continue;
9916     }
9917 
9918     // Do the written bits conflict with any we've seen so far?
9919     if ((NewToMask & CombinedToMask).getBoolValue())
9920       // Conflicting bits - bail out because going further is unsafe.
9921       return SDValue();
9922 
9923     // Are the new bits contiguous when combined with the old bits?
9924     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
9925         BitsProperlyConcatenate(FromMask, NewFromMask))
9926       return V;
9927     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
9928         BitsProperlyConcatenate(NewFromMask, FromMask))
9929       return V;
9930 
9931     // We've seen a write to some bits, so track it.
9932     CombinedToMask |= NewToMask;
9933     // Keep going...
9934     V = V.getOperand(0);
9935   }
9936 
9937   return SDValue();
9938 }
9939 
9940 static SDValue PerformBFICombine(SDNode *N,
9941                                  TargetLowering::DAGCombinerInfo &DCI) {
9942   SDValue N1 = N->getOperand(1);
9943   if (N1.getOpcode() == ISD::AND) {
9944     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
9945     // the bits being cleared by the AND are not demanded by the BFI.
9946     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
9947     if (!N11C)
9948       return SDValue();
9949     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
9950     unsigned LSB = countTrailingZeros(~InvMask);
9951     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
9952     assert(Width <
9953                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
9954            "undefined behavior");
9955     unsigned Mask = (1u << Width) - 1;
9956     unsigned Mask2 = N11C->getZExtValue();
9957     if ((Mask & (~Mask2)) == 0)
9958       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
9959                              N->getOperand(0), N1.getOperand(0),
9960                              N->getOperand(2));
9961   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
9962     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
9963     // Keep track of any consecutive bits set that all come from the same base
9964     // value. We can combine these together into a single BFI.
9965     SDValue CombineBFI = FindBFIToCombineWith(N);
9966     if (CombineBFI == SDValue())
9967       return SDValue();
9968 
9969     // We've found a BFI.
9970     APInt ToMask1, FromMask1;
9971     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
9972 
9973     APInt ToMask2, FromMask2;
9974     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
9975     assert(From1 == From2);
9976     (void)From2;
9977 
9978     // First, unlink CombineBFI.
9979     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
9980     // Then create a new BFI, combining the two together.
9981     APInt NewFromMask = FromMask1 | FromMask2;
9982     APInt NewToMask = ToMask1 | ToMask2;
9983 
9984     EVT VT = N->getValueType(0);
9985     SDLoc dl(N);
9986 
9987     if (NewFromMask[0] == 0)
9988       From1 = DCI.DAG.getNode(
9989         ISD::SRL, dl, VT, From1,
9990         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
9991     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
9992                            DCI.DAG.getConstant(~NewToMask, dl, VT));
9993   }
9994   return SDValue();
9995 }
9996 
9997 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
9998 /// ARMISD::VMOVRRD.
9999 static SDValue PerformVMOVRRDCombine(SDNode *N,
10000                                      TargetLowering::DAGCombinerInfo &DCI,
10001                                      const ARMSubtarget *Subtarget) {
10002   // vmovrrd(vmovdrr x, y) -> x,y
10003   SDValue InDouble = N->getOperand(0);
10004   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
10005     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
10006 
10007   // vmovrrd(load f64) -> (load i32), (load i32)
10008   SDNode *InNode = InDouble.getNode();
10009   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
10010       InNode->getValueType(0) == MVT::f64 &&
10011       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
10012       !cast<LoadSDNode>(InNode)->isVolatile()) {
10013     // TODO: Should this be done for non-FrameIndex operands?
10014     LoadSDNode *LD = cast<LoadSDNode>(InNode);
10015 
10016     SelectionDAG &DAG = DCI.DAG;
10017     SDLoc DL(LD);
10018     SDValue BasePtr = LD->getBasePtr();
10019     SDValue NewLD1 =
10020         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
10021                     LD->getAlignment(), LD->getMemOperand()->getFlags());
10022 
10023     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10024                                     DAG.getConstant(4, DL, MVT::i32));
10025     SDValue NewLD2 = DAG.getLoad(
10026         MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
10027         std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
10028 
10029     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
10030     if (DCI.DAG.getDataLayout().isBigEndian())
10031       std::swap (NewLD1, NewLD2);
10032     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
10033     return Result;
10034   }
10035 
10036   return SDValue();
10037 }
10038 
10039 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
10040 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
10041 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
10042   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
10043   SDValue Op0 = N->getOperand(0);
10044   SDValue Op1 = N->getOperand(1);
10045   if (Op0.getOpcode() == ISD::BITCAST)
10046     Op0 = Op0.getOperand(0);
10047   if (Op1.getOpcode() == ISD::BITCAST)
10048     Op1 = Op1.getOperand(0);
10049   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
10050       Op0.getNode() == Op1.getNode() &&
10051       Op0.getResNo() == 0 && Op1.getResNo() == 1)
10052     return DAG.getNode(ISD::BITCAST, SDLoc(N),
10053                        N->getValueType(0), Op0.getOperand(0));
10054   return SDValue();
10055 }
10056 
10057 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
10058 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
10059 /// i64 vector to have f64 elements, since the value can then be loaded
10060 /// directly into a VFP register.
10061 static bool hasNormalLoadOperand(SDNode *N) {
10062   unsigned NumElts = N->getValueType(0).getVectorNumElements();
10063   for (unsigned i = 0; i < NumElts; ++i) {
10064     SDNode *Elt = N->getOperand(i).getNode();
10065     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
10066       return true;
10067   }
10068   return false;
10069 }
10070 
10071 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
10072 /// ISD::BUILD_VECTOR.
10073 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
10074                                           TargetLowering::DAGCombinerInfo &DCI,
10075                                           const ARMSubtarget *Subtarget) {
10076   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
10077   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
10078   // into a pair of GPRs, which is fine when the value is used as a scalar,
10079   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
10080   SelectionDAG &DAG = DCI.DAG;
10081   if (N->getNumOperands() == 2)
10082     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
10083       return RV;
10084 
10085   // Load i64 elements as f64 values so that type legalization does not split
10086   // them up into i32 values.
10087   EVT VT = N->getValueType(0);
10088   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
10089     return SDValue();
10090   SDLoc dl(N);
10091   SmallVector<SDValue, 8> Ops;
10092   unsigned NumElts = VT.getVectorNumElements();
10093   for (unsigned i = 0; i < NumElts; ++i) {
10094     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
10095     Ops.push_back(V);
10096     // Make the DAGCombiner fold the bitcast.
10097     DCI.AddToWorklist(V.getNode());
10098   }
10099   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
10100   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
10101   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
10102 }
10103 
10104 /// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
10105 static SDValue
10106 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10107   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
10108   // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR into something more vector friendly, i.e., something that
  // does not force the use of floating point types.
10112 
10113   // Make sure we can change the type of the vector.
10114   // This is possible iff:
  // 1. The vector is only used in a bitcast to an integer type. I.e.,
10116   //    1.1. Vector is used only once.
10117   //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands is 32 bits (64-bit operands are not legal).
10119   EVT VT = N->getValueType(0);
10120   EVT EltVT = VT.getVectorElementType();
10121 
10122   // Check 1.1. and 2.
10123   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
10124     return SDValue();
10125 
10126   // By construction, the input type must be float.
10127   assert(EltVT == MVT::f32 && "Unexpected type!");
10128 
10129   // Check 1.2.
10130   SDNode *Use = *N->use_begin();
10131   if (Use->getOpcode() != ISD::BITCAST ||
10132       Use->getValueType(0).isFloatingPoint())
10133     return SDValue();
10134 
10135   // Check profitability.
  // The model is: if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are all operands that are not statically
  // (i.e., at compile time) bitcast.
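  // For instance (illustrative), in a v4f32 ARMISD::BUILD_VECTOR where three
  // operands are (f32 bitcast i32) and the fourth is a constant, there are
  // three relevant operands, all of them bitcast from i32, so the
  // build_vector is rewritten.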
10140   unsigned NumOfBitCastedElts = 0;
10141   unsigned NumElts = VT.getVectorNumElements();
10142   unsigned NumOfRelevantElts = NumElts;
10143   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
10144     SDValue Elt = N->getOperand(Idx);
10145     if (Elt->getOpcode() == ISD::BITCAST) {
10146       // Assume only bit cast to i32 will go away.
10147       if (Elt->getOperand(0).getValueType() == MVT::i32)
10148         ++NumOfBitCastedElts;
10149     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are bitcast statically (at compile time), so do not count
      // them as relevant operands.
10152       --NumOfRelevantElts;
10153   }
10154 
10155   // Check if more than half of the elements require a non-free bitcast.
10156   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
10157     return SDValue();
10158 
10159   SelectionDAG &DAG = DCI.DAG;
10160   // Create the new vector type.
10161   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
10162   // Check if the type is legal.
10163   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10164   if (!TLI.isTypeLegal(VecVT))
10165     return SDValue();
10166 
10167   // Combine:
10168   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
10169   // => BITCAST INSERT_VECTOR_ELT
10170   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
10171   //                      (BITCAST EN), N.
10172   SDValue Vec = DAG.getUNDEF(VecVT);
10173   SDLoc dl(N);
10174   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
10175     SDValue V = N->getOperand(Idx);
10176     if (V.isUndef())
10177       continue;
10178     if (V.getOpcode() == ISD::BITCAST &&
10179         V->getOperand(0).getValueType() == MVT::i32)
10180       // Fold obvious case.
10181       V = V.getOperand(0);
10182     else {
10183       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
10184       // Make the DAGCombiner fold the bitcasts.
10185       DCI.AddToWorklist(V.getNode());
10186     }
10187     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
10188     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
10189   }
10190   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
10191   // Make the DAGCombiner fold the bitcasts.
10192   DCI.AddToWorklist(Vec.getNode());
10193   return Vec;
10194 }
10195 
10196 /// PerformInsertEltCombine - Target-specific dag combine xforms for
10197 /// ISD::INSERT_VECTOR_ELT.
10198 static SDValue PerformInsertEltCombine(SDNode *N,
10199                                        TargetLowering::DAGCombinerInfo &DCI) {
10200   // Bitcast an i64 load inserted into a vector to f64.
10201   // Otherwise, the i64 value will be legalized to a pair of i32 values.
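  // For example (illustrative):
  //   (v2i64 insert_vector_elt V, (i64 load A), idx)
  // becomes
  //   (v2i64 bitcast (v2f64 insert_vector_elt (v2f64 bitcast V),
  //                                           (f64 bitcast (i64 load A)), idx))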
10202   EVT VT = N->getValueType(0);
10203   SDNode *Elt = N->getOperand(1).getNode();
10204   if (VT.getVectorElementType() != MVT::i64 ||
10205       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
10206     return SDValue();
10207 
10208   SelectionDAG &DAG = DCI.DAG;
10209   SDLoc dl(N);
10210   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10211                                  VT.getVectorNumElements());
10212   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
10213   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
10214   // Make the DAGCombiner fold the bitcasts.
10215   DCI.AddToWorklist(Vec.getNode());
10216   DCI.AddToWorklist(V.getNode());
10217   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
10218                                Vec, V, N->getOperand(2));
10219   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
10220 }
10221 
10222 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
10223 /// ISD::VECTOR_SHUFFLE.
10224 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
10225   // The LLVM shufflevector instruction does not require the shuffle mask
10226   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
10227   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
10228   // operands do not match the mask length, they are extended by concatenating
10229   // them with undef vectors.  That is probably the right thing for other
10230   // targets, but for NEON it is better to concatenate two double-register
10231   // size vector operands into a single quad-register size vector.  Do that
10232   // transformation here:
10233   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
10234   //   shuffle(concat(v1, v2), undef)
10235   SDValue Op0 = N->getOperand(0);
10236   SDValue Op1 = N->getOperand(1);
10237   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
10238       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
10239       Op0.getNumOperands() != 2 ||
10240       Op1.getNumOperands() != 2)
10241     return SDValue();
10242   SDValue Concat0Op1 = Op0.getOperand(1);
10243   SDValue Concat1Op1 = Op1.getOperand(1);
10244   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
10245     return SDValue();
10246   // Skip the transformation if any of the types are illegal.
10247   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10248   EVT VT = N->getValueType(0);
10249   if (!TLI.isTypeLegal(VT) ||
10250       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
10251       !TLI.isTypeLegal(Concat1Op1.getValueType()))
10252     return SDValue();
10253 
10254   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
10255                                   Op0.getOperand(0), Op1.getOperand(0));
10256   // Translate the shuffle mask.
10257   SmallVector<int, 16> NewMask;
10258   unsigned NumElts = VT.getVectorNumElements();
10259   unsigned HalfElts = NumElts/2;
10260   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
10261   for (unsigned n = 0; n < NumElts; ++n) {
10262     int MaskElt = SVN->getMaskElt(n);
10263     int NewElt = -1;
10264     if (MaskElt < (int)HalfElts)
10265       NewElt = MaskElt;
10266     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
10267       NewElt = HalfElts + MaskElt - NumElts;
10268     NewMask.push_back(NewElt);
10269   }
10270   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
10271                               DAG.getUNDEF(VT), NewMask);
10272 }
10273 
10274 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
10275 /// NEON load/store intrinsics, and generic vector load/stores, to merge
10276 /// base address updates.
10277 /// For generic load/stores, the memory type is assumed to be a vector.
10278 /// The caller is assumed to have checked legality.
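/// For example (illustrative), a NEON load whose address register is also
/// advanced by the access size:
///   vld1.32 {d16}, [r0]
///   add     r0, r0, #8
/// can be selected as a single post-incremented load:
///   vld1.32 {d16}, [r0]!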
10279 static SDValue CombineBaseUpdate(SDNode *N,
10280                                  TargetLowering::DAGCombinerInfo &DCI) {
10281   SelectionDAG &DAG = DCI.DAG;
10282   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
10283                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
10284   const bool isStore = N->getOpcode() == ISD::STORE;
10285   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
10286   SDValue Addr = N->getOperand(AddrOpIdx);
10287   MemSDNode *MemN = cast<MemSDNode>(N);
10288   SDLoc dl(N);
10289 
10290   // Search for a use of the address operand that is an increment.
10291   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
10292          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
10293     SDNode *User = *UI;
10294     if (User->getOpcode() != ISD::ADD ||
10295         UI.getUse().getResNo() != Addr.getResNo())
10296       continue;
10297 
10298     // Check that the add is independent of the load/store.  Otherwise, folding
10299     // it would create a cycle.
10300     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
10301       continue;
10302 
10303     // Find the new opcode for the updating load/store.
10304     bool isLoadOp = true;
10305     bool isLaneOp = false;
10306     unsigned NewOpc = 0;
10307     unsigned NumVecs = 0;
10308     if (isIntrinsic) {
10309       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
10310       switch (IntNo) {
10311       default: llvm_unreachable("unexpected intrinsic for Neon base update");
10312       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
10313         NumVecs = 1; break;
10314       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
10315         NumVecs = 2; break;
10316       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
10317         NumVecs = 3; break;
10318       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
10319         NumVecs = 4; break;
10320       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
10321         NumVecs = 2; isLaneOp = true; break;
10322       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
10323         NumVecs = 3; isLaneOp = true; break;
10324       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
10325         NumVecs = 4; isLaneOp = true; break;
10326       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
10327         NumVecs = 1; isLoadOp = false; break;
10328       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
10329         NumVecs = 2; isLoadOp = false; break;
10330       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
10331         NumVecs = 3; isLoadOp = false; break;
10332       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
10333         NumVecs = 4; isLoadOp = false; break;
10334       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
10335         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
10336       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
10337         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
10338       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
10339         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
10340       }
10341     } else {
10342       isLaneOp = true;
10343       switch (N->getOpcode()) {
10344       default: llvm_unreachable("unexpected opcode for Neon base update");
10345       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
10346       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
10347       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
10348       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
10349         NumVecs = 1; isLaneOp = false; break;
10350       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
10351         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
10352       }
10353     }
10354 
10355     // Find the size of memory referenced by the load/store.
10356     EVT VecTy;
10357     if (isLoadOp) {
10358       VecTy = N->getValueType(0);
10359     } else if (isIntrinsic) {
10360       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
10361     } else {
10362       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
10363       VecTy = N->getOperand(1).getValueType();
10364     }
10365 
10366     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
10367     if (isLaneOp)
10368       NumBytes /= VecTy.getVectorNumElements();
10369 
10370     // If the increment is a constant, it must match the memory ref size.
10371     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
10372     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
10373       uint64_t IncVal = CInc->getZExtValue();
10374       if (IncVal != NumBytes)
10375         continue;
10376     } else if (NumBytes >= 3 * 16) {
10377       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
10378       // separate instructions that make it harder to use a non-constant update.
10379       continue;
10380     }
10381 
10382     // OK, we found an ADD we can fold into the base update.
10383     // Now, create a _UPD node, taking care of not breaking alignment.
10384 
10385     EVT AlignedVecTy = VecTy;
10386     unsigned Alignment = MemN->getAlignment();
10387 
10388     // If this is a less-than-standard-aligned load/store, change the type to
10389     // match the standard alignment.
10390     // The alignment is overlooked when selecting _UPD variants; and it's
10391     // easier to introduce bitcasts here than fix that.
10392     // There are 3 ways to get to this base-update combine:
10393     // - intrinsics: they are assumed to be properly aligned (to the standard
10394     //   alignment of the memory type), so we don't need to do anything.
10395     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
10396     //   intrinsics, so, likewise, there's nothing to do.
10397     // - generic load/store instructions: the alignment is specified as an
10398     //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrinsics).  We need to change the
10400     //   memory type to match the explicit alignment.  That way, we don't
10401     //   generate non-standard-aligned ARMISD::VLDx nodes.
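    // For instance (illustrative), a v2i32 load with only 2-byte alignment is
    // handled here as a v4i16 load, whose standard alignment matches, and the
    // loaded value is bitcast back to v2i32 further down.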
10402     if (isa<LSBaseSDNode>(N)) {
10403       if (Alignment == 0)
10404         Alignment = 1;
10405       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
10406         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
10407         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
10408         assert(!isLaneOp && "Unexpected generic load/store lane.");
10409         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
10410         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
10411       }
10412       // Don't set an explicit alignment on regular load/stores that we want
10413       // to transform to VLD/VST 1_UPD nodes.
10414       // This matches the behavior of regular load/stores, which only get an
10415       // explicit alignment if the MMO alignment is larger than the standard
10416       // alignment of the memory type.
10417       // Intrinsics, however, always get an explicit alignment, set to the
10418       // alignment of the MMO.
10419       Alignment = 1;
10420     }
10421 
10422     // Create the new updating load/store node.
10423     // First, create an SDVTList for the new updating node's results.
10424     EVT Tys[6];
10425     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
10426     unsigned n;
10427     for (n = 0; n < NumResultVecs; ++n)
10428       Tys[n] = AlignedVecTy;
10429     Tys[n++] = MVT::i32;
10430     Tys[n] = MVT::Other;
10431     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
10432 
10433     // Then, gather the new node's operands.
10434     SmallVector<SDValue, 8> Ops;
10435     Ops.push_back(N->getOperand(0)); // incoming chain
10436     Ops.push_back(N->getOperand(AddrOpIdx));
10437     Ops.push_back(Inc);
10438 
10439     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
10440       // Try to match the intrinsic's signature
10441       Ops.push_back(StN->getValue());
10442     } else {
10443       // Loads (and of course intrinsics) match the intrinsics' signature,
10444       // so just add all but the alignment operand.
10445       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
10446         Ops.push_back(N->getOperand(i));
10447     }
10448 
10449     // For all node types, the alignment operand is always the last one.
10450     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
10451 
10452     // If this is a non-standard-aligned STORE, the penultimate operand is the
10453     // stored value.  Bitcast it to the aligned type.
10454     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
10455       SDValue &StVal = Ops[Ops.size()-2];
10456       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
10457     }
10458 
10459     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
10460                                            Ops, AlignedVecTy,
10461                                            MemN->getMemOperand());
10462 
10463     // Update the uses.
10464     SmallVector<SDValue, 5> NewResults;
10465     for (unsigned i = 0; i < NumResultVecs; ++i)
10466       NewResults.push_back(SDValue(UpdN.getNode(), i));
10467 
    // If this is a non-standard-aligned LOAD, the first result is the loaded
10469     // value.  Bitcast it to the expected result type.
10470     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
10471       SDValue &LdVal = NewResults[0];
10472       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
10473     }
10474 
10475     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
10476     DCI.CombineTo(N, NewResults);
10477     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
10478 
10479     break;
10480   }
10481   return SDValue();
10482 }
10483 
10484 static SDValue PerformVLDCombine(SDNode *N,
10485                                  TargetLowering::DAGCombinerInfo &DCI) {
10486   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
10487     return SDValue();
10488 
10489   return CombineBaseUpdate(N, DCI);
10490 }
10491 
10492 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
10493 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
10494 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
10495 /// return true.
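/// For example (illustrative), if a vld2lane intrinsic loads lane 1 of two
/// vectors and every use of its vector results is a VDUPLANE of lane 1, the
/// lane load and the duplications are replaced by a single VLD2DUP node that
/// loads and replicates the elements in one operation.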
10496 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
10497   SelectionDAG &DAG = DCI.DAG;
10498   EVT VT = N->getValueType(0);
10499   // vldN-dup instructions only support 64-bit vectors for N > 1.
10500   if (!VT.is64BitVector())
10501     return false;
10502 
10503   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
10504   SDNode *VLD = N->getOperand(0).getNode();
10505   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
10506     return false;
10507   unsigned NumVecs = 0;
10508   unsigned NewOpc = 0;
10509   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
10510   if (IntNo == Intrinsic::arm_neon_vld2lane) {
10511     NumVecs = 2;
10512     NewOpc = ARMISD::VLD2DUP;
10513   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
10514     NumVecs = 3;
10515     NewOpc = ARMISD::VLD3DUP;
10516   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
10517     NumVecs = 4;
10518     NewOpc = ARMISD::VLD4DUP;
10519   } else {
10520     return false;
10521   }
10522 
10523   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
10524   // numbers match the load.
10525   unsigned VLDLaneNo =
10526     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
10527   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10528        UI != UE; ++UI) {
10529     // Ignore uses of the chain result.
10530     if (UI.getUse().getResNo() == NumVecs)
10531       continue;
10532     SDNode *User = *UI;
10533     if (User->getOpcode() != ARMISD::VDUPLANE ||
10534         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
10535       return false;
10536   }
10537 
10538   // Create the vldN-dup node.
10539   EVT Tys[5];
10540   unsigned n;
10541   for (n = 0; n < NumVecs; ++n)
10542     Tys[n] = VT;
10543   Tys[n] = MVT::Other;
10544   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
10545   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
10546   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
10547   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
10548                                            Ops, VLDMemInt->getMemoryVT(),
10549                                            VLDMemInt->getMemOperand());
10550 
10551   // Update the uses.
10552   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
10553        UI != UE; ++UI) {
10554     unsigned ResNo = UI.getUse().getResNo();
10555     // Ignore uses of the chain result.
10556     if (ResNo == NumVecs)
10557       continue;
10558     SDNode *User = *UI;
10559     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
10560   }
10561 
10562   // Now the vldN-lane intrinsic is dead except for its chain result.
10563   // Update uses of the chain.
10564   std::vector<SDValue> VLDDupResults;
10565   for (unsigned n = 0; n < NumVecs; ++n)
10566     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
10567   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
10568   DCI.CombineTo(VLD, VLDDupResults);
10569 
10570   return true;
10571 }
10572 
10573 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
10574 /// ARMISD::VDUPLANE.
10575 static SDValue PerformVDUPLANECombine(SDNode *N,
10576                                       TargetLowering::DAGCombinerInfo &DCI) {
10577   SDValue Op = N->getOperand(0);
10578 
10579   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
10580   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
10581   if (CombineVLDDUP(N, DCI))
10582     return SDValue(N, 0);
10583 
10584   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
10585   // redundant.  Ignore bit_converts for now; element sizes are checked below.
10586   while (Op.getOpcode() == ISD::BITCAST)
10587     Op = Op.getOperand(0);
10588   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
10589     return SDValue();
10590 
10591   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
10592   unsigned EltSize = Op.getScalarValueSizeInBits();
10593   // The canonical VMOV for a zero vector uses a 32-bit element size.
10594   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10595   unsigned EltBits;
10596   if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
10597     EltSize = 8;
10598   EVT VT = N->getValueType(0);
10599   if (EltSize > VT.getScalarSizeInBits())
10600     return SDValue();
10601 
10602   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
10603 }
10604 
10605 static SDValue PerformLOADCombine(SDNode *N,
10606                                   TargetLowering::DAGCombinerInfo &DCI) {
10607   EVT VT = N->getValueType(0);
10608 
10609   // If this is a legal vector load, try to combine it into a VLD1_UPD.
10610   if (ISD::isNormalLoad(N) && VT.isVector() &&
10611       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10612     return CombineBaseUpdate(N, DCI);
10613 
10614   return SDValue();
10615 }
10616 
10617 /// PerformSTORECombine - Target-specific dag combine xforms for
10618 /// ISD::STORE.
10619 static SDValue PerformSTORECombine(SDNode *N,
10620                                    TargetLowering::DAGCombinerInfo &DCI) {
10621   StoreSDNode *St = cast<StoreSDNode>(N);
10622   if (St->isVolatile())
10623     return SDValue();
10624 
  // Optimize a truncating store of multiple scalars into a shuffle followed
  // by a store: first, pack all of the elements in one place; then, store
  // them to memory in fewer, larger chunks.
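  // For example (illustrative), a truncating store of a v4i32 value to v4i16
  // memory is rewritten as: bitcast to v8i16, shuffle the four live i16
  // elements into the low half, then write the packed data out with wider
  // scalar stores (two i32 stores here, since i64 is not a legal type).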
10628   SDValue StVal = St->getValue();
10629   EVT VT = StVal.getValueType();
10630   if (St->isTruncatingStore() && VT.isVector()) {
10631     SelectionDAG &DAG = DCI.DAG;
10632     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10633     EVT StVT = St->getMemoryVT();
10634     unsigned NumElems = VT.getVectorNumElements();
10635     assert(StVT != VT && "Cannot truncate to the same type");
10636     unsigned FromEltSz = VT.getScalarSizeInBits();
10637     unsigned ToEltSz = StVT.getScalarSizeInBits();
10638 
    // The From and To element sizes and the element count must be powers of two.
10640     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
10641 
10642     // We are going to use the original vector elt for storing.
10643     // Accumulated smaller vector elements must be a multiple of the store size.
10644     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
10645 
10646     unsigned SizeRatio  = FromEltSz / ToEltSz;
10647     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
10648 
10649     // Create a type on which we perform the shuffle.
10650     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
10651                                      NumElems*SizeRatio);
10652     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
10653 
10654     SDLoc DL(St);
10655     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
10656     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
10657     for (unsigned i = 0; i < NumElems; ++i)
10658       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
10659                           ? (i + 1) * SizeRatio - 1
10660                           : i * SizeRatio;
10661 
10662     // Can't shuffle using an illegal type.
10663     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
10664 
10665     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
10666                                 DAG.getUNDEF(WideVec.getValueType()),
10667                                 ShuffleVec);
10668     // At this point all of the data is stored at the bottom of the
10669     // register. We now need to save it to mem.
10670 
10671     // Find the largest store unit
10672     MVT StoreType = MVT::i8;
10673     for (MVT Tp : MVT::integer_valuetypes()) {
10674       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
10675         StoreType = Tp;
10676     }
10677     // Didn't find a legal store type.
10678     if (!TLI.isTypeLegal(StoreType))
10679       return SDValue();
10680 
10681     // Bitcast the original vector into a vector of store-size units
10682     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
10683             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
10684     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
10685     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
10686     SmallVector<SDValue, 8> Chains;
10687     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
10688                                         TLI.getPointerTy(DAG.getDataLayout()));
10689     SDValue BasePtr = St->getBasePtr();
10690 
10691     // Perform one or more big stores into memory.
10692     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
10693     for (unsigned I = 0; I < E; I++) {
10694       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
10695                                    StoreType, ShuffWide,
10696                                    DAG.getIntPtrConstant(I, DL));
10697       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
10698                                 St->getPointerInfo(), St->getAlignment(),
10699                                 St->getMemOperand()->getFlags());
10700       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
10701                             Increment);
10702       Chains.push_back(Ch);
10703     }
10704     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
10705   }
10706 
10707   if (!ISD::isNormalStore(St))
10708     return SDValue();
10709 
10710   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
10711   // ARM stores of arguments in the same cache line.
10712   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
10713       StVal.getNode()->hasOneUse()) {
10714     SelectionDAG  &DAG = DCI.DAG;
10715     bool isBigEndian = DAG.getDataLayout().isBigEndian();
10716     SDLoc DL(St);
10717     SDValue BasePtr = St->getBasePtr();
10718     SDValue NewST1 = DAG.getStore(
10719         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
10720         BasePtr, St->getPointerInfo(), St->getAlignment(),
10721         St->getMemOperand()->getFlags());
10722 
10723     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
10724                                     DAG.getConstant(4, DL, MVT::i32));
10725     return DAG.getStore(NewST1.getValue(0), DL,
10726                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
10727                         OffsetPtr, St->getPointerInfo(),
10728                         std::min(4U, St->getAlignment() / 2),
10729                         St->getMemOperand()->getFlags());
10730   }
10731 
10732   if (StVal.getValueType() == MVT::i64 &&
10733       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10734 
10735     // Bitcast an i64 store extracted from a vector to f64.
10736     // Otherwise, the i64 value will be legalized to a pair of i32 values.
10737     SelectionDAG &DAG = DCI.DAG;
10738     SDLoc dl(StVal);
10739     SDValue IntVec = StVal.getOperand(0);
10740     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
10741                                    IntVec.getValueType().getVectorNumElements());
10742     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
10743     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
10744                                  Vec, StVal.getOperand(1));
10745     dl = SDLoc(N);
10746     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
10747     // Make the DAGCombiner fold the bitcasts.
10748     DCI.AddToWorklist(Vec.getNode());
10749     DCI.AddToWorklist(ExtElt.getNode());
10750     DCI.AddToWorklist(V.getNode());
10751     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
10752                         St->getPointerInfo(), St->getAlignment(),
10753                         St->getMemOperand()->getFlags(), St->getAAInfo());
10754   }
10755 
10756   // If this is a legal vector store, try to combine it into a VST1_UPD.
10757   if (ISD::isNormalStore(N) && VT.isVector() &&
10758       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
10759     return CombineBaseUpdate(N, DCI);
10760 
10761   return SDValue();
10762 }
10763 
10764 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
10765 /// can replace combinations of VMUL and VCVT (floating-point to integer)
10766 /// when the VMUL has a constant operand that is a power of 2.
10767 ///
10768 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10769 ///  vmul.f32        d16, d17, d16
10770 ///  vcvt.s32.f32    d16, d16
10771 /// becomes:
10772 ///  vcvt.s32.f32    d16, d16, #3
10773 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
10774                                   const ARMSubtarget *Subtarget) {
10775   if (!Subtarget->hasNEON())
10776     return SDValue();
10777 
10778   SDValue Op = N->getOperand(0);
10779   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
10780       Op.getOpcode() != ISD::FMUL)
10781     return SDValue();
10782 
10783   SDValue ConstVec = Op->getOperand(1);
10784   if (!isa<BuildVectorSDNode>(ConstVec))
10785     return SDValue();
10786 
10787   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
10788   uint32_t FloatBits = FloatTy.getSizeInBits();
10789   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
10790   uint32_t IntBits = IntTy.getSizeInBits();
10791   unsigned NumLanes = Op.getValueType().getVectorNumElements();
10792   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10793     // These instructions only exist converting from f32 to i32. We can handle
10794     // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these instructions
10796     // only support v2i32/v4i32 types.
10797     return SDValue();
10798   }
10799 
10800   BitVector UndefElements;
10801   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10802   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10803   if (C == -1 || C == 0 || C > 32)
10804     return SDValue();
10805 
10806   SDLoc dl(N);
10807   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
10808   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
10809     Intrinsic::arm_neon_vcvtfp2fxu;
10810   SDValue FixConv = DAG.getNode(
10811       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10812       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
10813       DAG.getConstant(C, dl, MVT::i32));
10814 
10815   if (IntBits < FloatBits)
10816     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
10817 
10818   return FixConv;
10819 }
10820 
10821 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
10822 /// can replace combinations of VCVT (integer to floating-point) and VDIV
10823 /// when the VDIV has a constant operand that is a power of 2.
10824 ///
10825 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
10826 ///  vcvt.f32.s32    d16, d16
10827 ///  vdiv.f32        d16, d17, d16
10828 /// becomes:
10829 ///  vcvt.f32.s32    d16, d16, #3
10830 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
10831                                   const ARMSubtarget *Subtarget) {
10832   if (!Subtarget->hasNEON())
10833     return SDValue();
10834 
10835   SDValue Op = N->getOperand(0);
10836   unsigned OpOpcode = Op.getNode()->getOpcode();
10837   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
10838       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
10839     return SDValue();
10840 
10841   SDValue ConstVec = N->getOperand(1);
10842   if (!isa<BuildVectorSDNode>(ConstVec))
10843     return SDValue();
10844 
10845   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
10846   uint32_t FloatBits = FloatTy.getSizeInBits();
10847   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
10848   uint32_t IntBits = IntTy.getSizeInBits();
10849   unsigned NumLanes = Op.getValueType().getVectorNumElements();
10850   if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
10851     // These instructions only exist converting from i32 to f32. We can handle
10852     // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these instructions
10854     // only support v2i32/v4i32 types.
10855     return SDValue();
10856   }
10857 
10858   BitVector UndefElements;
10859   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
10860   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
10861   if (C == -1 || C == 0 || C > 32)
10862     return SDValue();
10863 
10864   SDLoc dl(N);
10865   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
10866   SDValue ConvInput = Op.getOperand(0);
10867   if (IntBits < FloatBits)
10868     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
10869                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
10870                             ConvInput);
10871 
10872   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
10873     Intrinsic::arm_neon_vcvtfxu2fp;
10874   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10875                      Op.getValueType(),
10876                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
10877                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
10878 }
10879 
/// getVShiftImm - Check if this is a valid build_vector for the immediate
10881 /// operand of a vector shift operation, where all the elements of the
10882 /// build_vector must have the same constant integer value.
10883 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10884   // Ignore bit_converts.
10885   while (Op.getOpcode() == ISD::BITCAST)
10886     Op = Op.getOperand(0);
10887   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10888   APInt SplatBits, SplatUndef;
10889   unsigned SplatBitSize;
10890   bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
10893       SplatBitSize > ElementBits)
10894     return false;
10895   Cnt = SplatBits.getSExtValue();
10896   return true;
10897 }
10898 
10899 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10900 /// operand of a vector shift left operation.  That value must be in the range:
10901 ///   0 <= Value < ElementBits for a left shift; or
10902 ///   0 <= Value <= ElementBits for a long left shift.
10903 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10904   assert(VT.isVector() && "vector shift count is not a vector type");
10905   int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
10907     return false;
10908   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
10909 }
10910 
10911 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10912 /// operand of a vector shift right operation.  For a shift opcode, the value
/// is positive, but for an intrinsic the value must be negative. The
10914 /// absolute value must be in the range:
10915 ///   1 <= |Value| <= ElementBits for a right shift; or
10916 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
10917 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
10918                          int64_t &Cnt) {
10919   assert(VT.isVector() && "vector shift count is not a vector type");
10920   int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
10922     return false;
10923   if (!isIntrinsic)
10924     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
10925   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
10926     Cnt = -Cnt;
10927     return true;
10928   }
10929   return false;
10930 }
10931 
10932 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
10933 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
10934   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
10935   switch (IntNo) {
10936   default:
10937     // Don't do anything for most intrinsics.
10938     break;
10939 
10940   // Vector shifts: check for immediate versions and lower them.
10941   // Note: This is done during DAG combining instead of DAG legalizing because
10942   // the build_vectors for 64-bit vector element shift counts are generally
10943   // not legal, and it is hard to see their values after they get legalized to
10944   // loads from a constant pool.
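  // For example (illustrative), @llvm.arm.neon.vshifts with a shift-amount
  // operand that is a constant splat of 3 becomes an ARMISD::VSHL node with
  // an immediate shift count of 3.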
10945   case Intrinsic::arm_neon_vshifts:
10946   case Intrinsic::arm_neon_vshiftu:
10947   case Intrinsic::arm_neon_vrshifts:
10948   case Intrinsic::arm_neon_vrshiftu:
10949   case Intrinsic::arm_neon_vrshiftn:
10950   case Intrinsic::arm_neon_vqshifts:
10951   case Intrinsic::arm_neon_vqshiftu:
10952   case Intrinsic::arm_neon_vqshiftsu:
10953   case Intrinsic::arm_neon_vqshiftns:
10954   case Intrinsic::arm_neon_vqshiftnu:
10955   case Intrinsic::arm_neon_vqshiftnsu:
10956   case Intrinsic::arm_neon_vqrshiftns:
10957   case Intrinsic::arm_neon_vqrshiftnu:
10958   case Intrinsic::arm_neon_vqrshiftnsu: {
10959     EVT VT = N->getOperand(1).getValueType();
10960     int64_t Cnt;
10961     unsigned VShiftOpc = 0;
10962 
10963     switch (IntNo) {
10964     case Intrinsic::arm_neon_vshifts:
10965     case Intrinsic::arm_neon_vshiftu:
10966       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
10967         VShiftOpc = ARMISD::VSHL;
10968         break;
10969       }
10970       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
10971         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
10972                      ARMISD::VSHRs : ARMISD::VSHRu);
10973         break;
10974       }
10975       return SDValue();
10976 
10977     case Intrinsic::arm_neon_vrshifts:
10978     case Intrinsic::arm_neon_vrshiftu:
10979       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
10980         break;
10981       return SDValue();
10982 
10983     case Intrinsic::arm_neon_vqshifts:
10984     case Intrinsic::arm_neon_vqshiftu:
10985       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
10986         break;
10987       return SDValue();
10988 
10989     case Intrinsic::arm_neon_vqshiftsu:
10990       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
10991         break;
10992       llvm_unreachable("invalid shift count for vqshlu intrinsic");
10993 
10994     case Intrinsic::arm_neon_vrshiftn:
10995     case Intrinsic::arm_neon_vqshiftns:
10996     case Intrinsic::arm_neon_vqshiftnu:
10997     case Intrinsic::arm_neon_vqshiftnsu:
10998     case Intrinsic::arm_neon_vqrshiftns:
10999     case Intrinsic::arm_neon_vqrshiftnu:
11000     case Intrinsic::arm_neon_vqrshiftnsu:
11001       // Narrowing shifts require an immediate right shift.
11002       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
11003         break;
11004       llvm_unreachable("invalid shift count for narrowing vector shift "
11005                        "intrinsic");
11006 
11007     default:
11008       llvm_unreachable("unhandled vector shift");
11009     }
11010 
11011     switch (IntNo) {
11012     case Intrinsic::arm_neon_vshifts:
11013     case Intrinsic::arm_neon_vshiftu:
11014       // Opcode already set above.
11015       break;
11016     case Intrinsic::arm_neon_vrshifts:
11017       VShiftOpc = ARMISD::VRSHRs; break;
11018     case Intrinsic::arm_neon_vrshiftu:
11019       VShiftOpc = ARMISD::VRSHRu; break;
11020     case Intrinsic::arm_neon_vrshiftn:
11021       VShiftOpc = ARMISD::VRSHRN; break;
11022     case Intrinsic::arm_neon_vqshifts:
11023       VShiftOpc = ARMISD::VQSHLs; break;
11024     case Intrinsic::arm_neon_vqshiftu:
11025       VShiftOpc = ARMISD::VQSHLu; break;
11026     case Intrinsic::arm_neon_vqshiftsu:
11027       VShiftOpc = ARMISD::VQSHLsu; break;
11028     case Intrinsic::arm_neon_vqshiftns:
11029       VShiftOpc = ARMISD::VQSHRNs; break;
11030     case Intrinsic::arm_neon_vqshiftnu:
11031       VShiftOpc = ARMISD::VQSHRNu; break;
11032     case Intrinsic::arm_neon_vqshiftnsu:
11033       VShiftOpc = ARMISD::VQSHRNsu; break;
11034     case Intrinsic::arm_neon_vqrshiftns:
11035       VShiftOpc = ARMISD::VQRSHRNs; break;
11036     case Intrinsic::arm_neon_vqrshiftnu:
11037       VShiftOpc = ARMISD::VQRSHRNu; break;
11038     case Intrinsic::arm_neon_vqrshiftnsu:
11039       VShiftOpc = ARMISD::VQRSHRNsu; break;
11040     }
11041 
11042     SDLoc dl(N);
11043     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11044                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
11045   }
11046 
11047   case Intrinsic::arm_neon_vshiftins: {
11048     EVT VT = N->getOperand(1).getValueType();
11049     int64_t Cnt;
11050     unsigned VShiftOpc = 0;
11051 
11052     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
11053       VShiftOpc = ARMISD::VSLI;
11054     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
11055       VShiftOpc = ARMISD::VSRI;
11056     else {
11057       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
11058     }
11059 
11060     SDLoc dl(N);
11061     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
11062                        N->getOperand(1), N->getOperand(2),
11063                        DAG.getConstant(Cnt, dl, MVT::i32));
11064   }
11065 
11066   case Intrinsic::arm_neon_vqrshifts:
11067   case Intrinsic::arm_neon_vqrshiftu:
11068     // No immediate versions of these to check for.
11069     break;
11070   }
11071 
11072   return SDValue();
11073 }
11074 
11075 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
11076 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
11077 /// combining instead of DAG legalizing because the build_vectors for 64-bit
11078 /// vector element shift counts are generally not legal, and it is hard to see
11079 /// their values after they get legalized to loads from a constant pool.
11080 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
11081                                    const ARMSubtarget *ST) {
11082   EVT VT = N->getValueType(0);
11083   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
11084     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
11085     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
11086     SDValue N1 = N->getOperand(1);
11087     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
11088       SDValue N0 = N->getOperand(0);
11089       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
11090           DAG.MaskedValueIsZero(N0.getOperand(0),
11091                                 APInt::getHighBitsSet(32, 16)))
11092         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
11093     }
11094   }
11095 
11096   // Nothing to be done for scalar shifts.
11097   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11098   if (!VT.isVector() || !TLI.isTypeLegal(VT))
11099     return SDValue();
11100 
11101   assert(ST->hasNEON() && "unexpected vector shift");
11102   int64_t Cnt;
11103 
11104   switch (N->getOpcode()) {
11105   default: llvm_unreachable("unexpected shift opcode");
11106 
11107   case ISD::SHL:
11108     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
11109       SDLoc dl(N);
11110       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
11111                          DAG.getConstant(Cnt, dl, MVT::i32));
11112     }
11113     break;
11114 
11115   case ISD::SRA:
11116   case ISD::SRL:
11117     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
11118       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
11119                             ARMISD::VSHRs : ARMISD::VSHRu);
11120       SDLoc dl(N);
11121       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
11122                          DAG.getConstant(Cnt, dl, MVT::i32));
11123     }
11124   }
11125   return SDValue();
11126 }
11127 
11128 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
11129 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
11130 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
11131                                     const ARMSubtarget *ST) {
11132   SDValue N0 = N->getOperand(0);
11133 
11134   // Check for sign- and zero-extensions of vector extract operations of 8-
11135   // and 16-bit vector elements.  NEON supports these directly.  They are
11136   // handled during DAG combining because type legalization will promote them
11137   // to 32-bit types and it is messy to recognize the operations after that.
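  // For example (illustrative):
  //   (i32 sign_extend (i16 extract_vector_elt v8i16:V, lane))
  // becomes (ARMISD::VGETLANEs V, lane), which selects to a single
  // sign-extending element move from the vector lane.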
11138   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
11139     SDValue Vec = N0.getOperand(0);
11140     SDValue Lane = N0.getOperand(1);
11141     EVT VT = N->getValueType(0);
11142     EVT EltVT = N0.getValueType();
11143     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11144 
11145     if (VT == MVT::i32 &&
11146         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
11147         TLI.isTypeLegal(Vec.getValueType()) &&
11148         isa<ConstantSDNode>(Lane)) {
11149 
11150       unsigned Opc = 0;
11151       switch (N->getOpcode()) {
11152       default: llvm_unreachable("unexpected opcode");
11153       case ISD::SIGN_EXTEND:
11154         Opc = ARMISD::VGETLANEs;
11155         break;
11156       case ISD::ZERO_EXTEND:
11157       case ISD::ANY_EXTEND:
11158         Opc = ARMISD::VGETLANEu;
11159         break;
11160       }
11161       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
11162     }
11163   }
11164 
11165   return SDValue();
11166 }
11167 
11168 static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
11169                              APInt &KnownOne) {
11170   if (Op.getOpcode() == ARMISD::BFI) {
11171     // Conservatively, we can recurse down the first operand
11172     // and just mask out all affected bits.
11173     computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
11174 
11175     // The operand to BFI is already a mask suitable for removing the bits it
11176     // sets.
11177     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
11178     const APInt &Mask = CI->getAPIntValue();
11179     KnownZero &= Mask;
11180     KnownOne &= Mask;
11181     return;
11182   }
11183   if (Op.getOpcode() == ARMISD::CMOV) {
11184     APInt KZ2(KnownZero.getBitWidth(), 0);
11185     APInt KO2(KnownOne.getBitWidth(), 0);
11186     computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
11187     computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
11188 
11189     KnownZero &= KZ2;
11190     KnownOne &= KO2;
11191     return;
11192   }
11193   return DAG.computeKnownBits(Op, KnownZero, KnownOne);
11194 }
11195 
11196 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
11197   // If we have a CMOV, OR and AND combination such as:
11198   //   if (x & CN)
11199   //     y |= CM;
11200   //
11201   // And:
11202   //   * CN is a single bit;
11203   //   * All bits covered by CM are known zero in y
11204   //
11205   // Then we can convert this into a sequence of BFI instructions. This will
11206   // always be a win if CM is a single bit, will always be no worse than the
11207   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
11208   // three bits (due to the extra IT instruction).
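  // For example (illustrative), with CN = 1 << 2 and CM = 1 << 4, the result
  // is built by shifting x right by 2 and BFI-inserting that bit into bit 4
  // of y, avoiding the compare and the conditional OR.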
11209 
11210   SDValue Op0 = CMOV->getOperand(0);
11211   SDValue Op1 = CMOV->getOperand(1);
11212   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
11213   auto CC = CCNode->getAPIntValue().getLimitedValue();
11214   SDValue CmpZ = CMOV->getOperand(4);
11215 
11216   // The compare must be against zero.
11217   if (!isNullConstant(CmpZ->getOperand(1)))
11218     return SDValue();
11219 
11220   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
11221   SDValue And = CmpZ->getOperand(0);
11222   if (And->getOpcode() != ISD::AND)
11223     return SDValue();
11224   ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
11225   if (!AndC || !AndC->getAPIntValue().isPowerOf2())
11226     return SDValue();
11227   SDValue X = And->getOperand(0);
11228 
11229   if (CC == ARMCC::EQ) {
11230     // We're performing an "equal to zero" compare. Swap the operands so we
11231     // canonicalize on a "not equal to zero" compare.
11232     std::swap(Op0, Op1);
11233   } else {
11234     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
11235   }
11236 
11237   if (Op1->getOpcode() != ISD::OR)
11238     return SDValue();
11239 
11240   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
11241   if (!OrC)
11242     return SDValue();
11243   SDValue Y = Op1->getOperand(0);
11244 
11245   if (Op0 != Y)
11246     return SDValue();
11247 
11248   // Now, is it profitable to continue?
11249   APInt OrCI = OrC->getAPIntValue();
11250   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
11251   if (OrCI.countPopulation() > Heuristic)
11252     return SDValue();
11253 
11254   // Lastly, can we determine that the bits defined by OrCI
11255   // are zero in Y?
11256   APInt KnownZero, KnownOne;
11257   computeKnownBits(DAG, Y, KnownZero, KnownOne);
11258   if ((OrCI & KnownZero) != OrCI)
11259     return SDValue();
11260 
11261   // OK, we can do the combine.
11262   SDValue V = Y;
11263   SDLoc dl(X);
11264   EVT VT = X.getValueType();
11265   unsigned BitInX = AndC->getAPIntValue().logBase2();
11266 
11267   if (BitInX != 0) {
11268     // We must shift X first.
11269     X = DAG.getNode(ISD::SRL, dl, VT, X,
11270                     DAG.getConstant(BitInX, dl, VT));
11271   }
11272 
11273   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
11274        BitInY < NumActiveBits; ++BitInY) {
11275     if (OrCI[BitInY] == 0)
11276       continue;
11277     APInt Mask(VT.getSizeInBits(), 0);
11278     Mask.setBit(BitInY);
11279     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
11280                     // Confusingly, the operand is an *inverted* mask.
11281                     DAG.getConstant(~Mask, dl, VT));
11282   }
11283 
11284   return V;
11285 }
11286 
11287 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
11288 SDValue
11289 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
11290   SDValue Cmp = N->getOperand(4);
11291   if (Cmp.getOpcode() != ARMISD::CMPZ)
11292     // Only looking at NE cases.
11293     return SDValue();
11294 
11295   EVT VT = N->getValueType(0);
11296   SDLoc dl(N);
11297   SDValue LHS = Cmp.getOperand(0);
11298   SDValue RHS = Cmp.getOperand(1);
11299   SDValue Chain = N->getOperand(0);
11300   SDValue BB = N->getOperand(1);
11301   SDValue ARMcc = N->getOperand(2);
11302   ARMCC::CondCodes CC =
11303     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11304 
11305   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
11306   // -> (brcond Chain BB CC CPSR Cmp)
11307   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
11308       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
11309       LHS->getOperand(0)->hasOneUse()) {
11310     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
11311     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
11312     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11313     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11314     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
11315         (LHS01C && LHS01C->getZExtValue() == 1) &&
11316         (LHS1C && LHS1C->getZExtValue() == 1) &&
11317         (RHSC && RHSC->getZExtValue() == 0)) {
11318       return DAG.getNode(
11319           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
11320           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
11321     }
11322   }
11323 
11324   return SDValue();
11325 }
11326 
11327 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
11328 SDValue
11329 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
11330   SDValue Cmp = N->getOperand(4);
11331   if (Cmp.getOpcode() != ARMISD::CMPZ)
11332     // Only looking at EQ and NE cases.
11333     return SDValue();
11334 
11335   EVT VT = N->getValueType(0);
11336   SDLoc dl(N);
11337   SDValue LHS = Cmp.getOperand(0);
11338   SDValue RHS = Cmp.getOperand(1);
11339   SDValue FalseVal = N->getOperand(0);
11340   SDValue TrueVal = N->getOperand(1);
11341   SDValue ARMcc = N->getOperand(2);
11342   ARMCC::CondCodes CC =
11343     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
11344 
11345   // BFI is only available on V6T2+.
11346   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
11347     SDValue R = PerformCMOVToBFICombine(N, DAG);
11348     if (R)
11349       return R;
11350   }
11351 
11352   // Simplify
11353   //   mov     r1, r0
11354   //   cmp     r1, x
11355   //   mov     r0, y
11356   //   moveq   r0, x
11357   // to
11358   //   cmp     r0, x
11359   //   movne   r0, y
11360   //
11361   //   mov     r1, r0
11362   //   cmp     r1, x
11363   //   mov     r0, x
11364   //   movne   r0, y
11365   // to
11366   //   cmp     r0, x
11367   //   movne   r0, y
11368   /// FIXME: Turn this into a target neutral optimization?
11369   SDValue Res;
11370   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
11371     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
11372                       N->getOperand(3), Cmp);
11373   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
11374     SDValue ARMcc;
11375     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
11376     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
11377                       N->getOperand(3), NewCmp);
11378   }
11379 
11380   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
11381   // -> (cmov F T CC CPSR Cmp)
11382   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
11383     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
11384     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
11385     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
11386     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
11387         (LHS1C && LHS1C->getZExtValue() == 1) &&
11388         (RHSC && RHSC->getZExtValue() == 0)) {
11389       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
11390                          LHS->getOperand(2), LHS->getOperand(3),
11391                          LHS->getOperand(4));
11392     }
11393   }
11394 
11395   if (Res.getNode()) {
11396     APInt KnownZero, KnownOne;
    DAG.computeKnownBits(SDValue(N, 0), KnownZero, KnownOne);
11398     // Capture demanded bits information that would be otherwise lost.
11399     if (KnownZero == 0xfffffffe)
11400       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11401                         DAG.getValueType(MVT::i1));
11402     else if (KnownZero == 0xffffff00)
11403       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11404                         DAG.getValueType(MVT::i8));
11405     else if (KnownZero == 0xffff0000)
11406       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
11407                         DAG.getValueType(MVT::i16));
11408   }
11409 
11410   return Res;
11411 }
11412 
11413 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
11414                                              DAGCombinerInfo &DCI) const {
11415   switch (N->getOpcode()) {
11416   default: break;
11417   case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
11418   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
11419   case ISD::SUB:        return PerformSUBCombine(N, DCI);
11420   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
11421   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
11422   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
11423   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
11424   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
11425   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
11426   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
11427   case ISD::STORE:      return PerformSTORECombine(N, DCI);
11428   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
11429   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
11430   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
11431   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
11432   case ISD::FP_TO_SINT:
11433   case ISD::FP_TO_UINT:
11434     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
11435   case ISD::FDIV:
11436     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
11437   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
11438   case ISD::SHL:
11439   case ISD::SRA:
11440   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
11441   case ISD::SIGN_EXTEND:
11442   case ISD::ZERO_EXTEND:
11443   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
11444   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
11445   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
11446   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
11447   case ARMISD::VLD2DUP:
11448   case ARMISD::VLD3DUP:
11449   case ARMISD::VLD4DUP:
11450     return PerformVLDCombine(N, DCI);
11451   case ARMISD::BUILD_VECTOR:
11452     return PerformARMBUILD_VECTORCombine(N, DCI);
11453   case ISD::INTRINSIC_VOID:
11454   case ISD::INTRINSIC_W_CHAIN:
11455     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
11456     case Intrinsic::arm_neon_vld1:
11457     case Intrinsic::arm_neon_vld2:
11458     case Intrinsic::arm_neon_vld3:
11459     case Intrinsic::arm_neon_vld4:
11460     case Intrinsic::arm_neon_vld2lane:
11461     case Intrinsic::arm_neon_vld3lane:
11462     case Intrinsic::arm_neon_vld4lane:
11463     case Intrinsic::arm_neon_vst1:
11464     case Intrinsic::arm_neon_vst2:
11465     case Intrinsic::arm_neon_vst3:
11466     case Intrinsic::arm_neon_vst4:
11467     case Intrinsic::arm_neon_vst2lane:
11468     case Intrinsic::arm_neon_vst3lane:
11469     case Intrinsic::arm_neon_vst4lane:
11470       return PerformVLDCombine(N, DCI);
11471     default: break;
11472     }
11473     break;
11474   }
11475   return SDValue();
11476 }
11477 
11478 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
11479                                                           EVT VT) const {
11480   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
11481 }
11482 
11483 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
11484                                                        unsigned,
11485                                                        unsigned,
11486                                                        bool *Fast) const {
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
11488   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
11489 
11490   switch (VT.getSimpleVT().SimpleTy) {
11491   default:
11492     return false;
11493   case MVT::i8:
11494   case MVT::i16:
11495   case MVT::i32: {
    // Unaligned access can use (for example) LDRB, LDRH, LDR.
11497     if (AllowsUnaligned) {
11498       if (Fast)
11499         *Fast = Subtarget->hasV7Ops();
11500       return true;
11501     }
11502     return false;
11503   }
11504   case MVT::f64:
11505   case MVT::v2f64: {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
11509     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
11510       if (Fast)
11511         *Fast = true;
11512       return true;
11513     }
11514     return false;
11515   }
11516   }
11517 }
11518 
11519 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
11520                        unsigned AlignCheck) {
11521   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
11522           (DstAlign == 0 || DstAlign % AlignCheck == 0));
11523 }
11524 
11525 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
11526                                            unsigned DstAlign, unsigned SrcAlign,
11527                                            bool IsMemset, bool ZeroMemset,
11528                                            bool MemcpyStrSrc,
11529                                            MachineFunction &MF) const {
11530   const Function *F = MF.getFunction();
11531 
11532   // See if we can use NEON instructions for this...
11533   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
11534       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
11535     bool Fast;
11536     if (Size >= 16 &&
11537         (memOpAlign(SrcAlign, DstAlign, 16) ||
11538          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
11539       return MVT::v2f64;
11540     } else if (Size >= 8 &&
11541                (memOpAlign(SrcAlign, DstAlign, 8) ||
11542                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
11543                  Fast))) {
11544       return MVT::f64;
11545     }
11546   }
11547 
  // Lower to i32/i16 if the size permits.
11549   if (Size >= 4)
11550     return MVT::i32;
11551   else if (Size >= 2)
11552     return MVT::i16;
11553 
11554   // Let the target-independent logic figure it out.
11555   return MVT::Other;
11556 }
11557 
11558 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
11559   if (Val.getOpcode() != ISD::LOAD)
11560     return false;
11561 
11562   EVT VT1 = Val.getValueType();
11563   if (!VT1.isSimple() || !VT1.isInteger() ||
11564       !VT2.isSimple() || !VT2.isInteger())
11565     return false;
11566 
11567   switch (VT1.getSimpleVT().SimpleTy) {
11568   default: break;
11569   case MVT::i1:
11570   case MVT::i8:
11571   case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32 bits.
11573     return true;
11574   }
11575 
11576   return false;
11577 }
11578 
11579 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
11580   EVT VT = ExtVal.getValueType();
11581 
11582   if (!isTypeLegal(VT))
11583     return false;
11584 
11585   // Don't create a loadext if we can fold the extension into a wide/long
11586   // instruction.
11587   // If there's more than one user instruction, the loadext is desirable no
11588   // matter what.  There can be two uses by the same instruction.
11589   if (ExtVal->use_empty() ||
11590       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
11591     return true;
11592 
11593   SDNode *U = *ExtVal->use_begin();
11594   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
11595        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
11596     return false;
11597 
11598   return true;
11599 }
11600 
11601 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
11602   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
11603     return false;
11604 
11605   if (!isTypeLegal(EVT::getEVT(Ty1)))
11606     return false;
11607 
11608   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
11609 
11610   // Assuming the caller doesn't have a zeroext or signext return parameter,
11611   // truncation all the way down to i1 is valid.
11612   return true;
11613 }
11614 
11615 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
11616                                                 const AddrMode &AM, Type *Ty,
11617                                                 unsigned AS) const {
11618   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
11619     if (Subtarget->hasFPAO())
11620       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
11621     return 0;
11622   }
11623   return -1;
11624 }
11625 
11626 
11627 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
11628   if (V < 0)
11629     return false;
11630 
11631   unsigned Scale = 1;
11632   switch (VT.getSimpleVT().SimpleTy) {
11633   default: return false;
  case MVT::i1:
  case MVT::i8:
    // Scale == 1.
    break;
  case MVT::i16:
    Scale = 2;
    break;
  case MVT::i32:
    Scale = 4;
    break;
11646   }
11647 
11648   if ((V & (Scale - 1)) != 0)
11649     return false;
11650   V /= Scale;
11651   return V == (V & ((1LL << 5) - 1));
11652 }
11653 
11654 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
11655                                       const ARMSubtarget *Subtarget) {
11656   bool isNeg = false;
11657   if (V < 0) {
11658     isNeg = true;
    V = -V;
11660   }
11661 
11662   switch (VT.getSimpleVT().SimpleTy) {
11663   default: return false;
11664   case MVT::i1:
11665   case MVT::i8:
11666   case MVT::i16:
11667   case MVT::i32:
11668     // + imm12 or - imm8
11669     if (isNeg)
11670       return V == (V & ((1LL << 8) - 1));
11671     return V == (V & ((1LL << 12) - 1));
11672   case MVT::f32:
11673   case MVT::f64:
11674     // Same as ARM mode. FIXME: NEON?
11675     if (!Subtarget->hasVFP2())
11676       return false;
11677     if ((V & 3) != 0)
11678       return false;
11679     V >>= 2;
11680     return V == (V & ((1LL << 8) - 1));
11681   }
11682 }
11683 
11684 /// isLegalAddressImmediate - Return true if the integer value can be used
11685 /// as the offset of the target addressing mode for load / store of the
11686 /// given type.
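/// For illustration (summarizing the checks below): in ARM mode an i32 access
/// accepts offsets in [-4095, 4095] (imm12), an i16 access accepts
/// [-255, 255] (imm8), and VFP f32/f64 accesses accept multiples of 4 up to
/// +/-1020 (an imm8 scaled by 4).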
11687 static bool isLegalAddressImmediate(int64_t V, EVT VT,
11688                                     const ARMSubtarget *Subtarget) {
11689   if (V == 0)
11690     return true;
11691 
11692   if (!VT.isSimple())
11693     return false;
11694 
11695   if (Subtarget->isThumb1Only())
11696     return isLegalT1AddressImmediate(V, VT);
11697   else if (Subtarget->isThumb2())
11698     return isLegalT2AddressImmediate(V, VT, Subtarget);
11699 
11700   // ARM mode.
11701   if (V < 0)
    V = -V;
11703   switch (VT.getSimpleVT().SimpleTy) {
11704   default: return false;
11705   case MVT::i1:
11706   case MVT::i8:
11707   case MVT::i32:
11708     // +- imm12
11709     return V == (V & ((1LL << 12) - 1));
11710   case MVT::i16:
11711     // +- imm8
11712     return V == (V & ((1LL << 8) - 1));
11713   case MVT::f32:
11714   case MVT::f64:
11715     if (!Subtarget->hasVFP2()) // FIXME: NEON?
11716       return false;
11717     if ((V & 3) != 0)
11718       return false;
11719     V >>= 2;
11720     return V == (V & ((1LL << 8) - 1));
11721   }
11722 }
11723 
11724 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
11725                                                       EVT VT) const {
11726   int Scale = AM.Scale;
11727   if (Scale < 0)
11728     return false;
11729 
11730   switch (VT.getSimpleVT().SimpleTy) {
11731   default: return false;
11732   case MVT::i1:
11733   case MVT::i8:
11734   case MVT::i16:
11735   case MVT::i32:
11736     if (Scale == 1)
11737       return true;
11738     // r + r << imm
11739     Scale = Scale & ~1;
11740     return Scale == 2 || Scale == 4 || Scale == 8;
11741   case MVT::i64:
11742     // r + r
11743     if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11744       return true;
11745     return false;
11746   case MVT::isVoid:
11747     // Note, we allow "void" uses (basically, uses that aren't loads or
11748     // stores), because arm allows folding a scale into many arithmetic
11749     // operations.  This should be made more precise and revisited later.
11750 
11751     // Allow r << imm, but the imm has to be a multiple of two.
11752     if (Scale & 1) return false;
11753     return isPowerOf2_32(Scale);
11754   }
11755 }
11756 
11757 /// isLegalAddressingMode - Return true if the addressing mode represented
11758 /// by AM is legal for this target, for a load/store of the specified type.
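/// For example, "r + 4095" and "r + r" are legal for an i32 access in ARM
/// mode, while "r + r*4 + 8" is not, since there is no reg + scaled-reg + imm
/// form; this is a rough summary of the checks below, not an exhaustive list.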
11759 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
11760                                               const AddrMode &AM, Type *Ty,
11761                                               unsigned AS) const {
11762   EVT VT = getValueType(DL, Ty, true);
11763   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
11764     return false;
11765 
11766   // Can never fold addr of global into load/store.
11767   if (AM.BaseGV)
11768     return false;
11769 
11770   switch (AM.Scale) {
11771   case 0:  // no scale reg, must be "r+i" or "r", or "i".
11772     break;
11773   case 1:
11774     if (Subtarget->isThumb1Only())
11775       return false;
11776     LLVM_FALLTHROUGH;
11777   default:
11778     // ARM doesn't support any R+R*scale+imm addr modes.
11779     if (AM.BaseOffs)
11780       return false;
11781 
11782     if (!VT.isSimple())
11783       return false;
11784 
11785     if (Subtarget->isThumb2())
11786       return isLegalT2ScaledAddressingMode(AM, VT);
11787 
11788     int Scale = AM.Scale;
11789     switch (VT.getSimpleVT().SimpleTy) {
11790     default: return false;
11791     case MVT::i1:
11792     case MVT::i8:
11793     case MVT::i32:
11794       if (Scale < 0) Scale = -Scale;
11795       if (Scale == 1)
11796         return true;
11797       // r + r << imm
11798       return isPowerOf2_32(Scale & ~1);
11799     case MVT::i16:
11800     case MVT::i64:
11801       // r + r
11802       if (((unsigned)AM.HasBaseReg + Scale) <= 2)
11803         return true;
11804       return false;
11805 
11806     case MVT::isVoid:
11807       // Note, we allow "void" uses (basically, uses that aren't loads or
11808       // stores), because arm allows folding a scale into many arithmetic
11809       // operations.  This should be made more precise and revisited later.
11810 
11811       // Allow r << imm, but the imm has to be a multiple of two.
11812       if (Scale & 1) return false;
11813       return isPowerOf2_32(Scale);
11814     }
11815   }
11816   return true;
11817 }
11818 
11819 /// isLegalICmpImmediate - Return true if the specified immediate is legal
11820 /// icmp immediate, that is the target has icmp instructions which can compare
11821 /// a register against the immediate without having to materialize the
11822 /// immediate into a register.
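/// For example, in ARM and Thumb2 modes an icmp against -10 is legal because
/// it can be selected as "cmn r0, #10"; Thumb1 has no CMN, so only immediates
/// in [0, 255] are accepted there.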
11823 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
11824   // Thumb2 and ARM modes can use cmn for negative immediates.
11825   if (!Subtarget->isThumb())
11826     return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
11827   if (Subtarget->isThumb2())
11828     return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
  // Thumb1 doesn't have CMN and only has 8-bit immediates.
11830   return Imm >= 0 && Imm <= 255;
11831 }
11832 
11833 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
11834 /// *or sub* immediate, that is the target has add or sub instructions which can
11835 /// add a register with the immediate without having to materialize the
11836 /// immediate into a register.
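/// For example, adding -8 is legal in ARM mode because it can be emitted as
/// "sub r0, r1, #8"; only the absolute value needs to be encodable.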
11837 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
11838   // Same encoding for add/sub, just flip the sign.
11839   int64_t AbsImm = std::abs(Imm);
11840   if (!Subtarget->isThumb())
11841     return ARM_AM::getSOImmVal(AbsImm) != -1;
11842   if (Subtarget->isThumb2())
11843     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  // Thumb1 only has 8-bit unsigned immediates.
11845   return AbsImm >= 0 && AbsImm <= 255;
11846 }
11847 
11848 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
11849                                       bool isSEXTLoad, SDValue &Base,
11850                                       SDValue &Offset, bool &isInc,
11851                                       SelectionDAG &DAG) {
11852   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
11853     return false;
11854 
11855   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
11856     // AddressingMode 3
11857     Base = Ptr->getOperand(0);
11858     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11859       int RHSC = (int)RHS->getZExtValue();
11860       if (RHSC < 0 && RHSC > -256) {
11861         assert(Ptr->getOpcode() == ISD::ADD);
11862         isInc = false;
11863         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11864         return true;
11865       }
11866     }
11867     isInc = (Ptr->getOpcode() == ISD::ADD);
11868     Offset = Ptr->getOperand(1);
11869     return true;
11870   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
11871     // AddressingMode 2
11872     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11873       int RHSC = (int)RHS->getZExtValue();
11874       if (RHSC < 0 && RHSC > -0x1000) {
11875         assert(Ptr->getOpcode() == ISD::ADD);
11876         isInc = false;
11877         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11878         Base = Ptr->getOperand(0);
11879         return true;
11880       }
11881     }
11882 
11883     if (Ptr->getOpcode() == ISD::ADD) {
11884       isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
11886         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
11887       if (ShOpcVal != ARM_AM::no_shift) {
11888         Base = Ptr->getOperand(1);
11889         Offset = Ptr->getOperand(0);
11890       } else {
11891         Base = Ptr->getOperand(0);
11892         Offset = Ptr->getOperand(1);
11893       }
11894       return true;
11895     }
11896 
11897     isInc = (Ptr->getOpcode() == ISD::ADD);
11898     Base = Ptr->getOperand(0);
11899     Offset = Ptr->getOperand(1);
11900     return true;
11901   }
11902 
11903   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
11904   return false;
11905 }
11906 
11907 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
11908                                      bool isSEXTLoad, SDValue &Base,
11909                                      SDValue &Offset, bool &isInc,
11910                                      SelectionDAG &DAG) {
11911   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
11912     return false;
11913 
11914   Base = Ptr->getOperand(0);
11915   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
11916     int RHSC = (int)RHS->getZExtValue();
11917     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
11918       assert(Ptr->getOpcode() == ISD::ADD);
11919       isInc = false;
11920       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
11921       return true;
11922     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
11923       isInc = Ptr->getOpcode() == ISD::ADD;
11924       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
11925       return true;
11926     }
11927   }
11928 
11929   return false;
11930 }
11931 
/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
/// offset and addressing mode by reference, if the node's address can be
/// legally represented as a pre-indexed load / store address.
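/// For example, a load from (add r0, #4) can be selected as the pre-indexed
/// form "ldr r1, [r0, #4]!", which also leaves the incremented pointer in r0.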
11935 bool
11936 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
11937                                              SDValue &Offset,
11938                                              ISD::MemIndexedMode &AM,
11939                                              SelectionDAG &DAG) const {
11940   if (Subtarget->isThumb1Only())
11941     return false;
11942 
11943   EVT VT;
11944   SDValue Ptr;
11945   bool isSEXTLoad = false;
11946   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
11947     Ptr = LD->getBasePtr();
11948     VT  = LD->getMemoryVT();
11949     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
11950   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
11951     Ptr = ST->getBasePtr();
11952     VT  = ST->getMemoryVT();
11953   } else
11954     return false;
11955 
11956   bool isInc;
11957   bool isLegal = false;
11958   if (Subtarget->isThumb2())
11959     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
11960                                        Offset, isInc, DAG);
11961   else
11962     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
11963                                         Offset, isInc, DAG);
11964   if (!isLegal)
11965     return false;
11966 
11967   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
11968   return true;
11969 }
11970 
/// getPostIndexedAddressParts - Returns true, and sets the base pointer,
/// offset and addressing mode by reference, if this node can be combined
/// with a load / store to form a post-indexed load / store.
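/// For example, a load from r0 followed by "add r0, r0, #4" can be folded
/// into the post-indexed form "ldr r1, [r0], #4".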
11974 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
11975                                                    SDValue &Base,
11976                                                    SDValue &Offset,
11977                                                    ISD::MemIndexedMode &AM,
11978                                                    SelectionDAG &DAG) const {
11979   EVT VT;
11980   SDValue Ptr;
11981   bool isSEXTLoad = false, isNonExt;
11982   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
11983     VT  = LD->getMemoryVT();
11984     Ptr = LD->getBasePtr();
11985     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
11986     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
11987   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
11988     VT  = ST->getMemoryVT();
11989     Ptr = ST->getBasePtr();
11990     isNonExt = !ST->isTruncatingStore();
11991   } else
11992     return false;
11993 
11994   if (Subtarget->isThumb1Only()) {
11995     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
11996     // must be non-extending/truncating, i32, with an offset of 4.
11997     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
11998     if (Op->getOpcode() != ISD::ADD || !isNonExt)
11999       return false;
12000     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12001     if (!RHS || RHS->getZExtValue() != 4)
12002       return false;
12003 
12004     Offset = Op->getOperand(1);
12005     Base = Op->getOperand(0);
12006     AM = ISD::POST_INC;
12007     return true;
12008   }
12009 
12010   bool isInc;
12011   bool isLegal = false;
12012   if (Subtarget->isThumb2())
12013     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12014                                        isInc, DAG);
12015   else
12016     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
12017                                         isInc, DAG);
12018   if (!isLegal)
12019     return false;
12020 
12021   if (Ptr != Base) {
12022     // Swap base ptr and offset to catch more post-index load / store when
12023     // it's legal. In Thumb2 mode, offset must be an immediate.
12024     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
12025         !Subtarget->isThumb2())
12026       std::swap(Base, Offset);
12027 
12028     // Post-indexed load / store update the base pointer.
12029     if (Ptr != Base)
12030       return false;
12031   }
12032 
12033   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
12034   return true;
12035 }
12036 
12037 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
12038                                                       APInt &KnownZero,
12039                                                       APInt &KnownOne,
12040                                                       const SelectionDAG &DAG,
12041                                                       unsigned Depth) const {
12042   unsigned BitWidth = KnownOne.getBitWidth();
12043   KnownZero = KnownOne = APInt(BitWidth, 0);
12044   switch (Op.getOpcode()) {
12045   default: break;
12046   case ARMISD::ADDC:
12047   case ARMISD::ADDE:
12048   case ARMISD::SUBC:
12049   case ARMISD::SUBE:
12050     // These nodes' second result is a boolean
12051     if (Op.getResNo() == 0)
12052       break;
12053     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
12054     break;
12055   case ARMISD::CMOV: {
12056     // Bits are known zero/one if known on the LHS and RHS.
12057     DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
12058     if (KnownZero == 0 && KnownOne == 0) return;
12059 
12060     APInt KnownZeroRHS, KnownOneRHS;
12061     DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
12062     KnownZero &= KnownZeroRHS;
12063     KnownOne  &= KnownOneRHS;
12064     return;
12065   }
12066   case ISD::INTRINSIC_W_CHAIN: {
12067     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
12068     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
12069     switch (IntID) {
12070     default: return;
12071     case Intrinsic::arm_ldaex:
12072     case Intrinsic::arm_ldrex: {
12073       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
12074       unsigned MemBits = VT.getScalarSizeInBits();
12075       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
12076       return;
12077     }
12078     }
12079   }
12080   }
12081 }
12082 
12083 //===----------------------------------------------------------------------===//
12084 //                           ARM Inline Assembly Support
12085 //===----------------------------------------------------------------------===//
12086 
12087 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
12088   // Looking for "rev" which is V6+.
12089   if (!Subtarget->hasV6Ops())
12090     return false;
12091 
12092   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
12093   std::string AsmStr = IA->getAsmString();
12094   SmallVector<StringRef, 4> AsmPieces;
12095   SplitString(AsmStr, AsmPieces, ";\n");
12096 
12097   switch (AsmPieces.size()) {
12098   default: return false;
12099   case 1:
12100     AsmStr = AsmPieces[0];
12101     AsmPieces.clear();
12102     SplitString(AsmStr, AsmPieces, " \t,");
12103 
12104     // rev $0, $1
12105     if (AsmPieces.size() == 3 &&
12106         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
12107         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
12108       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
12109       if (Ty && Ty->getBitWidth() == 32)
12110         return IntrinsicLowering::LowerToByteSwap(CI);
12111     }
12112     break;
12113   }
12114 
12115   return false;
12116 }
12117 
12118 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12119   // At this point, we have to lower this constraint to something else, so we
12120   // lower it to an "r" or "w". However, by doing this we will force the result
12121   // to be in register, while the X constraint is much more permissive.
12122   //
12123   // Although we are correct (we are free to emit anything, without
12124   // constraints), we might break use cases that would expect us to be more
12125   // efficient and emit something else.
12126   if (!Subtarget->hasVFP2())
12127     return "r";
12128   if (ConstraintVT.isFloatingPoint())
12129     return "w";
12130   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
12131      (ConstraintVT.getSizeInBits() == 64 ||
12132       ConstraintVT.getSizeInBits() == 128))
12133     return "w";
12134 
12135   return "r";
12136 }
12137 
12138 /// getConstraintType - Given a constraint letter, return the type of
12139 /// constraint it is for this target.
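/// For illustration: 'l', 'w', 'h', 'x' and 't' are register-class
/// constraints, 'j' (a MOVW-compatible immediate) is classified as C_Other,
/// and 'Q' plus the two-letter 'U*' constraints are memory constraints.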
12140 ARMTargetLowering::ConstraintType
12141 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
12142   if (Constraint.size() == 1) {
12143     switch (Constraint[0]) {
12144     default:  break;
12145     case 'l': return C_RegisterClass;
12146     case 'w': return C_RegisterClass;
12147     case 'h': return C_RegisterClass;
12148     case 'x': return C_RegisterClass;
12149     case 't': return C_RegisterClass;
12150     case 'j': return C_Other; // Constant for movw.
12151       // An address with a single base register. Due to the way we
12152       // currently handle addresses it is the same as an 'r' memory constraint.
12153     case 'Q': return C_Memory;
12154     }
12155   } else if (Constraint.size() == 2) {
12156     switch (Constraint[0]) {
12157     default: break;
12158     // All 'U+' constraints are addresses.
12159     case 'U': return C_Memory;
12160     }
12161   }
12162   return TargetLowering::getConstraintType(Constraint);
12163 }
12164 
12165 /// Examine constraint type and operand type and determine a weight value.
12166 /// This object must already have been set up with the operand type
12167 /// and the current alternative constraint selected.
12168 TargetLowering::ConstraintWeight
12169 ARMTargetLowering::getSingleConstraintMatchWeight(
12170     AsmOperandInfo &info, const char *constraint) const {
12171   ConstraintWeight weight = CW_Invalid;
12172   Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
12175   if (!CallOperandVal)
12176     return CW_Default;
12177   Type *type = CallOperandVal->getType();
12178   // Look at the constraint type.
12179   switch (*constraint) {
12180   default:
12181     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12182     break;
12183   case 'l':
12184     if (type->isIntegerTy()) {
12185       if (Subtarget->isThumb())
12186         weight = CW_SpecificReg;
12187       else
12188         weight = CW_Register;
12189     }
12190     break;
12191   case 'w':
12192     if (type->isFloatingPointTy())
12193       weight = CW_Register;
12194     break;
12195   }
12196   return weight;
12197 }
12198 
12199 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
12200 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
12201     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12202   if (Constraint.size() == 1) {
12203     // GCC ARM Constraint Letters
12204     switch (Constraint[0]) {
12205     case 'l': // Low regs or general regs.
12206       if (Subtarget->isThumb())
12207         return RCPair(0U, &ARM::tGPRRegClass);
12208       return RCPair(0U, &ARM::GPRRegClass);
12209     case 'h': // High regs or no regs.
12210       if (Subtarget->isThumb())
12211         return RCPair(0U, &ARM::hGPRRegClass);
12212       break;
12213     case 'r':
12214       if (Subtarget->isThumb1Only())
12215         return RCPair(0U, &ARM::tGPRRegClass);
12216       return RCPair(0U, &ARM::GPRRegClass);
12217     case 'w':
12218       if (VT == MVT::Other)
12219         break;
12220       if (VT == MVT::f32)
12221         return RCPair(0U, &ARM::SPRRegClass);
12222       if (VT.getSizeInBits() == 64)
12223         return RCPair(0U, &ARM::DPRRegClass);
12224       if (VT.getSizeInBits() == 128)
12225         return RCPair(0U, &ARM::QPRRegClass);
12226       break;
12227     case 'x':
12228       if (VT == MVT::Other)
12229         break;
12230       if (VT == MVT::f32)
12231         return RCPair(0U, &ARM::SPR_8RegClass);
12232       if (VT.getSizeInBits() == 64)
12233         return RCPair(0U, &ARM::DPR_8RegClass);
12234       if (VT.getSizeInBits() == 128)
12235         return RCPair(0U, &ARM::QPR_8RegClass);
12236       break;
12237     case 't':
12238       if (VT == MVT::f32)
12239         return RCPair(0U, &ARM::SPRRegClass);
12240       break;
12241     }
12242   }
12243   if (StringRef("{cc}").equals_lower(Constraint))
12244     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
12245 
12246   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12247 }
12248 
12249 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12250 /// vector.  If it is invalid, don't add anything to Ops.
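/// For example, with the 'I' constraint in ARM mode the operand must be a
/// valid data-processing immediate such as 0xFF0000 (an 8-bit value rotated
/// right by an even amount); anything else is rejected and no operand is
/// added.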
12251 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
12252                                                      std::string &Constraint,
12253                                                      std::vector<SDValue>&Ops,
12254                                                      SelectionDAG &DAG) const {
12255   SDValue Result;
12256 
12257   // Currently only support length 1 constraints.
12258   if (Constraint.length() != 1) return;
12259 
12260   char ConstraintLetter = Constraint[0];
12261   switch (ConstraintLetter) {
12262   default: break;
12263   case 'j':
12264   case 'I': case 'J': case 'K': case 'L':
12265   case 'M': case 'N': case 'O':
12266     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
12267     if (!C)
12268       return;
12269 
12270     int64_t CVal64 = C->getSExtValue();
12271     int CVal = (int) CVal64;
12272     // None of these constraints allow values larger than 32 bits.  Check
12273     // that the value fits in an int.
12274     if (CVal != CVal64)
12275       return;
12276 
12277     switch (ConstraintLetter) {
12278       case 'j':
12279         // Constant suitable for movw, must be between 0 and
12280         // 65535.
12281         if (Subtarget->hasV6T2Ops())
12282           if (CVal >= 0 && CVal <= 65535)
12283             break;
12284         return;
12285       case 'I':
12286         if (Subtarget->isThumb1Only()) {
12287           // This must be a constant between 0 and 255, for ADD
12288           // immediates.
12289           if (CVal >= 0 && CVal <= 255)
12290             break;
12291         } else if (Subtarget->isThumb2()) {
12292           // A constant that can be used as an immediate value in a
12293           // data-processing instruction.
12294           if (ARM_AM::getT2SOImmVal(CVal) != -1)
12295             break;
12296         } else {
12297           // A constant that can be used as an immediate value in a
12298           // data-processing instruction.
12299           if (ARM_AM::getSOImmVal(CVal) != -1)
12300             break;
12301         }
12302         return;
12303 
12304       case 'J':
12305         if (Subtarget->isThumb1Only()) {
12306           // This must be a constant between -255 and -1, for negated ADD
12307           // immediates. This can be used in GCC with an "n" modifier that
12308           // prints the negated value, for use with SUB instructions. It is
12309           // not useful otherwise but is implemented for compatibility.
12310           if (CVal >= -255 && CVal <= -1)
12311             break;
12312         } else {
12313           // This must be a constant between -4095 and 4095. It is not clear
12314           // what this constraint is intended for. Implemented for
12315           // compatibility with GCC.
12316           if (CVal >= -4095 && CVal <= 4095)
12317             break;
12318         }
12319         return;
12320 
12321       case 'K':
12322         if (Subtarget->isThumb1Only()) {
12323           // A 32-bit value where only one byte has a nonzero value. Exclude
12324           // zero to match GCC. This constraint is used by GCC internally for
12325           // constants that can be loaded with a move/shift combination.
12326           // It is not useful otherwise but is implemented for compatibility.
12327           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
12328             break;
12329         } else if (Subtarget->isThumb2()) {
12330           // A constant whose bitwise inverse can be used as an immediate
12331           // value in a data-processing instruction. This can be used in GCC
12332           // with a "B" modifier that prints the inverted value, for use with
12333           // BIC and MVN instructions. It is not useful otherwise but is
12334           // implemented for compatibility.
12335           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
12336             break;
12337         } else {
12338           // A constant whose bitwise inverse can be used as an immediate
12339           // value in a data-processing instruction. This can be used in GCC
12340           // with a "B" modifier that prints the inverted value, for use with
12341           // BIC and MVN instructions. It is not useful otherwise but is
12342           // implemented for compatibility.
12343           if (ARM_AM::getSOImmVal(~CVal) != -1)
12344             break;
12345         }
12346         return;
12347 
12348       case 'L':
12349         if (Subtarget->isThumb1Only()) {
12350           // This must be a constant between -7 and 7,
12351           // for 3-operand ADD/SUB immediate instructions.
12352           if (CVal >= -7 && CVal < 7)
12353             break;
12354         } else if (Subtarget->isThumb2()) {
12355           // A constant whose negation can be used as an immediate value in a
12356           // data-processing instruction. This can be used in GCC with an "n"
12357           // modifier that prints the negated value, for use with SUB
12358           // instructions. It is not useful otherwise but is implemented for
12359           // compatibility.
12360           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
12361             break;
12362         } else {
12363           // A constant whose negation can be used as an immediate value in a
12364           // data-processing instruction. This can be used in GCC with an "n"
12365           // modifier that prints the negated value, for use with SUB
12366           // instructions. It is not useful otherwise but is implemented for
12367           // compatibility.
12368           if (ARM_AM::getSOImmVal(-CVal) != -1)
12369             break;
12370         }
12371         return;
12372 
12373       case 'M':
12374         if (Subtarget->isThumb1Only()) {
12375           // This must be a multiple of 4 between 0 and 1020, for
12376           // ADD sp + immediate.
12377           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
12378             break;
12379         } else {
12380           // A power of two or a constant between 0 and 32.  This is used in
12381           // GCC for the shift amount on shifted register operands, but it is
12382           // useful in general for any shift amounts.
12383           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
12384             break;
12385         }
12386         return;
12387 
12388       case 'N':
12389         if (Subtarget->isThumb()) {  // FIXME thumb2
12390           // This must be a constant between 0 and 31, for shift amounts.
12391           if (CVal >= 0 && CVal <= 31)
12392             break;
12393         }
12394         return;
12395 
12396       case 'O':
12397         if (Subtarget->isThumb()) {  // FIXME thumb2
12398           // This must be a multiple of 4 between -508 and 508, for
12399           // ADD/SUB sp = sp + immediate.
12400           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
12401             break;
12402         }
12403         return;
12404     }
12405     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
12406     break;
12407   }
12408 
12409   if (Result.getNode()) {
12410     Ops.push_back(Result);
12411     return;
12412   }
12413   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12414 }
12415 
12416 static RTLIB::Libcall getDivRemLibcall(
12417     const SDNode *N, MVT::SimpleValueType SVT) {
12418   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12419           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12420          "Unhandled Opcode in getDivRemLibcall");
12421   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12422                   N->getOpcode() == ISD::SREM;
12423   RTLIB::Libcall LC;
12424   switch (SVT) {
12425   default: llvm_unreachable("Unexpected request for libcall!");
12426   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
12427   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
12428   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
12429   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
12430   }
12431   return LC;
12432 }
12433 
12434 static TargetLowering::ArgListTy getDivRemArgList(
12435     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
12436   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
12437           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
12438          "Unhandled Opcode in getDivRemArgList");
12439   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
12440                   N->getOpcode() == ISD::SREM;
12441   TargetLowering::ArgListTy Args;
12442   TargetLowering::ArgListEntry Entry;
12443   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
12444     EVT ArgVT = N->getOperand(i).getValueType();
12445     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
12446     Entry.Node = N->getOperand(i);
12447     Entry.Ty = ArgTy;
12448     Entry.isSExt = isSigned;
12449     Entry.isZExt = !isSigned;
12450     Args.push_back(Entry);
12451   }
12452   if (Subtarget->isTargetWindows() && Args.size() >= 2)
12453     std::swap(Args[0], Args[1]);
12454   return Args;
12455 }
12456 
12457 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
12458   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
12459           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
12460           Subtarget->isTargetWindows()) &&
12461          "Register-based DivRem lowering only");
12462   unsigned Opcode = Op->getOpcode();
12463   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
12464          "Invalid opcode for Div/Rem lowering");
12465   bool isSigned = (Opcode == ISD::SDIVREM);
12466   EVT VT = Op->getValueType(0);
12467   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
12468   SDLoc dl(Op);
12469 
12470   // If the target has hardware divide, use divide + multiply + subtract:
12471   //     div = a / b
12472   //     rem = a - b * div
12473   //     return {div, rem}
12474   // This should be lowered into UDIV/SDIV + MLS later on.
12475   if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
12476       Op->getSimpleValueType(0) == MVT::i32) {
12477     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
12478     const SDValue Dividend = Op->getOperand(0);
12479     const SDValue Divisor = Op->getOperand(1);
12480     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
12481     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
12482     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
12483 
12484     SDValue Values[2] = {Div, Rem};
12485     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
12486   }
12487 
12488   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
12489                                        VT.getSimpleVT().SimpleTy);
12490   SDValue InChain = DAG.getEntryNode();
12491 
12492   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
12493                                                     DAG.getContext(),
12494                                                     Subtarget);
12495 
12496   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12497                                          getPointerTy(DAG.getDataLayout()));
12498 
12499   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
12500 
12501   if (Subtarget->isTargetWindows())
12502     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
12503 
12504   TargetLowering::CallLoweringInfo CLI(DAG);
12505   CLI.setDebugLoc(dl).setChain(InChain)
12506     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
12507     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
12508 
12509   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
12510   return CallInfo.first;
12511 }
12512 
// Lowers REM using divmod helpers.
// See RTABI section 4.2/4.3.
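// For example, on AEABI targets an i32 srem becomes a call to
// __aeabi_idivmod; the quotient comes back in the first result register and
// the remainder in the second, and only the remainder is used here.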
12515 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
12516   // Build return types (div and rem)
12517   std::vector<Type*> RetTyParams;
12518   Type *RetTyElement;
12519 
12520   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
12521   default: llvm_unreachable("Unexpected request for libcall!");
12522   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
12523   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
12524   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
12525   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
12526   }
12527 
12528   RetTyParams.push_back(RetTyElement);
12529   RetTyParams.push_back(RetTyElement);
12530   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
12531   Type *RetTy = StructType::get(*DAG.getContext(), ret);
12532 
12533   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
12534                                                              SimpleTy);
12535   SDValue InChain = DAG.getEntryNode();
12536   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
12537                                                     Subtarget);
12538   bool isSigned = N->getOpcode() == ISD::SREM;
12539   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
12540                                          getPointerTy(DAG.getDataLayout()));
12541 
12542   if (Subtarget->isTargetWindows())
12543     InChain = WinDBZCheckDenominator(DAG, N, InChain);
12544 
12545   // Lower call
12546   CallLoweringInfo CLI(DAG);
12547   CLI.setChain(InChain)
12548      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
12549      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
12550   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
12551 
12552   // Return second (rem) result operand (first contains div)
12553   SDNode *ResNode = CallResult.first.getNode();
12554   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
12555   return ResNode->getOperand(1);
12556 }
12557 
12558 SDValue
12559 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
12560   assert(Subtarget->isTargetWindows() && "unsupported target platform");
12561   SDLoc DL(Op);
12562 
12563   // Get the inputs.
12564   SDValue Chain = Op.getOperand(0);
12565   SDValue Size  = Op.getOperand(1);
12566 
12567   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
12568                               DAG.getConstant(2, DL, MVT::i32));
12569 
12570   SDValue Flag;
12571   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
12572   Flag = Chain.getValue(1);
12573 
12574   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12575   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
12576 
12577   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
12578   Chain = NewSP.getValue(1);
12579 
12580   SDValue Ops[2] = { NewSP, Chain };
12581   return DAG.getMergeValues(Ops, DL);
12582 }
12583 
12584 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
12585   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
12586          "Unexpected type for custom-lowering FP_EXTEND");
12587 
12588   RTLIB::Libcall LC;
12589   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
12590 
12591   SDValue SrcVal = Op.getOperand(0);
12592   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12593                      SDLoc(Op)).first;
12594 }
12595 
12596 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
12597   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
12598          Subtarget->isFPOnlySP() &&
12599          "Unexpected type for custom-lowering FP_ROUND");
12600 
12601   RTLIB::Libcall LC;
12602   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
12603 
12604   SDValue SrcVal = Op.getOperand(0);
12605   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
12606                      SDLoc(Op)).first;
12607 }
12608 
12609 bool
12610 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
12611   // The ARM target isn't yet aware of offsets.
12612   return false;
12613 }
12614 
12615 bool ARM::isBitFieldInvertedMask(unsigned v) {
12616   if (v == 0xffffffff)
12617     return false;
12618 
  // There can be 1's on either or both "outsides"; all the "inside"
  // bits must be 0's.
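  // For example, 0xff0000ff is accepted (~v == 0x00ffff00, a contiguous run
  // of set bits), while 0xff00ff00 is rejected.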
12621   return isShiftedMask_32(~v);
12622 }
12623 
12624 /// isFPImmLegal - Returns true if the target can instruction select the
12625 /// specified FP immediate natively. If false, the legalizer will
12626 /// materialize the FP immediate as a load from a constant pool.
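/// For example, 1.0 and -0.5 fit the 8-bit VFP immediate encoding and can be
/// materialized with VMOV, whereas 0.1 cannot and is loaded from a constant
/// pool instead.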
12627 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
12628   if (!Subtarget->hasVFP3())
12629     return false;
12630   if (VT == MVT::f32)
12631     return ARM_AM::getFP32Imm(Imm) != -1;
12632   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
12633     return ARM_AM::getFP64Imm(Imm) != -1;
12634   return false;
12635 }
12636 
12637 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
12638 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
12639 /// specified in the intrinsic calls.
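/// For example, an arm.neon.vld2 returning two <4 x i32> vectors is modelled
/// conservatively as a single 256-bit (v4i64) memory access at the pointer
/// operand.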
12640 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12641                                            const CallInst &I,
12642                                            unsigned Intrinsic) const {
12643   switch (Intrinsic) {
12644   case Intrinsic::arm_neon_vld1:
12645   case Intrinsic::arm_neon_vld2:
12646   case Intrinsic::arm_neon_vld3:
12647   case Intrinsic::arm_neon_vld4:
12648   case Intrinsic::arm_neon_vld2lane:
12649   case Intrinsic::arm_neon_vld3lane:
12650   case Intrinsic::arm_neon_vld4lane: {
12651     Info.opc = ISD::INTRINSIC_W_CHAIN;
12652     // Conservatively set memVT to the entire set of vectors loaded.
12653     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12654     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
12655     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12656     Info.ptrVal = I.getArgOperand(0);
12657     Info.offset = 0;
12658     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12659     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12660     Info.vol = false; // volatile loads with NEON intrinsics not supported
12661     Info.readMem = true;
12662     Info.writeMem = false;
12663     return true;
12664   }
12665   case Intrinsic::arm_neon_vst1:
12666   case Intrinsic::arm_neon_vst2:
12667   case Intrinsic::arm_neon_vst3:
12668   case Intrinsic::arm_neon_vst4:
12669   case Intrinsic::arm_neon_vst2lane:
12670   case Intrinsic::arm_neon_vst3lane:
12671   case Intrinsic::arm_neon_vst4lane: {
12672     Info.opc = ISD::INTRINSIC_VOID;
12673     // Conservatively set memVT to the entire set of vectors stored.
12674     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12675     unsigned NumElts = 0;
12676     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
12677       Type *ArgTy = I.getArgOperand(ArgI)->getType();
12678       if (!ArgTy->isVectorTy())
12679         break;
12680       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
12681     }
12682     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12683     Info.ptrVal = I.getArgOperand(0);
12684     Info.offset = 0;
12685     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
12686     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
12687     Info.vol = false; // volatile stores with NEON intrinsics not supported
12688     Info.readMem = false;
12689     Info.writeMem = true;
12690     return true;
12691   }
12692   case Intrinsic::arm_ldaex:
12693   case Intrinsic::arm_ldrex: {
12694     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12695     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
12696     Info.opc = ISD::INTRINSIC_W_CHAIN;
12697     Info.memVT = MVT::getVT(PtrTy->getElementType());
12698     Info.ptrVal = I.getArgOperand(0);
12699     Info.offset = 0;
12700     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12701     Info.vol = true;
12702     Info.readMem = true;
12703     Info.writeMem = false;
12704     return true;
12705   }
12706   case Intrinsic::arm_stlex:
12707   case Intrinsic::arm_strex: {
12708     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
12709     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
12710     Info.opc = ISD::INTRINSIC_W_CHAIN;
12711     Info.memVT = MVT::getVT(PtrTy->getElementType());
12712     Info.ptrVal = I.getArgOperand(1);
12713     Info.offset = 0;
12714     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
12715     Info.vol = true;
12716     Info.readMem = false;
12717     Info.writeMem = true;
12718     return true;
12719   }
12720   case Intrinsic::arm_stlexd:
12721   case Intrinsic::arm_strexd: {
12722     Info.opc = ISD::INTRINSIC_W_CHAIN;
12723     Info.memVT = MVT::i64;
12724     Info.ptrVal = I.getArgOperand(2);
12725     Info.offset = 0;
12726     Info.align = 8;
12727     Info.vol = true;
12728     Info.readMem = false;
12729     Info.writeMem = true;
12730     return true;
12731   }
12732   case Intrinsic::arm_ldaexd:
12733   case Intrinsic::arm_ldrexd: {
12734     Info.opc = ISD::INTRINSIC_W_CHAIN;
12735     Info.memVT = MVT::i64;
12736     Info.ptrVal = I.getArgOperand(0);
12737     Info.offset = 0;
12738     Info.align = 8;
12739     Info.vol = true;
12740     Info.readMem = true;
12741     Info.writeMem = false;
12742     return true;
12743   }
12744   default:
12745     break;
12746   }
12747 
12748   return false;
12749 }
12750 
12751 /// \brief Returns true if it is beneficial to convert a load of a constant
12752 /// to just the constant itself.
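/// On ARM any value of at most 32 bits can typically be materialized with a
/// short move sequence (e.g. movw/movt where available), which is cheaper than
/// a constant-pool load.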
12753 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
12754                                                           Type *Ty) const {
12755   assert(Ty->isIntegerTy());
12756 
12757   unsigned Bits = Ty->getPrimitiveSizeInBits();
12758   if (Bits == 0 || Bits > 32)
12759     return false;
12760   return true;
12761 }
12762 
12763 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
12764                                         ARM_MB::MemBOpt Domain) const {
12765   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12766 
12767   // First, if the target has no DMB, see what fallback we can use.
12768   if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 CPUs can support data barriers with an mcr instruction.
12770     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
12771     // here.
12772     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
12773       Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
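      // The operands below select the CP15 c7/c10/5 operation, i.e. the legacy
      // v6 data memory barrier "mcr p15, 0, <Rt>, c7, c10, 5" with <Rt>
      // holding zero.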
12774       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
12775                         Builder.getInt32(0), Builder.getInt32(7),
12776                         Builder.getInt32(10), Builder.getInt32(5)};
12777       return Builder.CreateCall(MCR, args);
12778     } else {
12779       // Instead of using barriers, atomic accesses on these subtargets use
12780       // libcalls.
12781       llvm_unreachable("makeDMB on a target so old that it has no barriers");
12782     }
12783   } else {
12784     Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
12785     // Only a full system barrier exists in the M-class architectures.
12786     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
12787     Constant *CDomain = Builder.getInt32(Domain);
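    // E.g., ARM_MB::ISH produces "call void @llvm.arm.dmb(i32 11)"; the
    // MemBOpt values map directly to the 4-bit barrier option encoding.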
12788     return Builder.CreateCall(DMB, CDomain);
12789   }
12790 }
12791 
12792 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
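// With fences enabled, emitLeadingFence/emitTrailingFence yield the usual
// pre-ARMv8 mappings, e.g. (sketch only; the exact barrier domain depends on
// the subtarget, see makeDMB above):
//   seq_cst store:  dmb; str; dmb
//   acquire load:   ldr; dmb
//   release store:  dmb; str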
12793 Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
12794                                          AtomicOrdering Ord, bool IsStore,
12795                                          bool IsLoad) const {
12796   switch (Ord) {
12797   case AtomicOrdering::NotAtomic:
12798   case AtomicOrdering::Unordered:
12799     llvm_unreachable("Invalid fence: unordered/non-atomic");
12800   case AtomicOrdering::Monotonic:
12801   case AtomicOrdering::Acquire:
12802     return nullptr; // Nothing to do
12803   case AtomicOrdering::SequentiallyConsistent:
12804     if (!IsStore)
12805       return nullptr; // Nothing to do
12806     /*FALLTHROUGH*/
12807   case AtomicOrdering::Release:
12808   case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);

    // FIXME: add a comment with a link to documentation justifying this.
    return makeDMB(Builder, ARM_MB::ISH);
12814   }
12815   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
12816 }
12817 
12818 Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
12819                                           AtomicOrdering Ord, bool IsStore,
12820                                           bool IsLoad) const {
12821   switch (Ord) {
12822   case AtomicOrdering::NotAtomic:
12823   case AtomicOrdering::Unordered:
12824     llvm_unreachable("Invalid fence: unordered/not-atomic");
12825   case AtomicOrdering::Monotonic:
12826   case AtomicOrdering::Release:
12827     return nullptr; // Nothing to do
12828   case AtomicOrdering::Acquire:
12829   case AtomicOrdering::AcquireRelease:
12830   case AtomicOrdering::SequentiallyConsistent:
12831     return makeDMB(Builder, ARM_MB::ISH);
12832   }
12833   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
12834 }
12835 
// Loads and stores of less than 64 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
12840 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
12841   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
12842   return (Size == 64) && !Subtarget->isMClass();
12843 }
12844 
// Loads and stores of less than 64 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
12849 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
12850 // guarantee, see DDI0406C ARM architecture reference manual,
12851 // sections A8.8.72-74 LDRD)
12852 TargetLowering::AtomicExpansionKind
12853 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
12854   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
12855   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
12856                                                   : AtomicExpansionKind::None;
12857 }
12858 
// For the real atomic operations, we have ldrex/strex up to 32 bits, and up
// to 64 bits on the non-M-class profiles.
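// E.g., on a v8 target (where acquire/release instructions are used rather
// than explicit fences) AtomicExpandPass rewrites, roughly,
//   %old = atomicrmw add i32* %p, i32 %v seq_cst
// into a loop built on the emitLoadLinked/emitStoreConditional hooks below:
//   loop:
//     %old   = call i32 @llvm.arm.ldaex.p0i32(i32* %p)
//     %new   = add i32 %old, %v
//     %stat  = call i32 @llvm.arm.stlex.p0i32(i32 %new, i32* %p)
//     %retry = icmp ne i32 %stat, 0
//     br i1 %retry, label %loop, label %cont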
12861 TargetLowering::AtomicExpansionKind
12862 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
12863   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
12864   return (Size <= (Subtarget->isMClass() ? 32U : 64U))
12865              ? AtomicExpansionKind::LLSC
12866              : AtomicExpansionKind::None;
12867 }
12868 
12869 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
12870     AtomicCmpXchgInst *AI) const {
12871   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
12872   // implement cmpxchg without spilling. If the address being exchanged is also
12873   // on the stack and close enough to the spill slot, this can lead to a
12874   // situation where the monitor always gets cleared and the atomic operation
12875   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
12876   return getTargetMachine().getOptLevel() != 0;
12877 }
12878 
12879 bool ARMTargetLowering::shouldInsertFencesForAtomic(
12880     const Instruction *I) const {
12881   return InsertFencesForAtomic;
12882 }
12883 
12884 // This has so far only been implemented for MachO.
12885 bool ARMTargetLowering::useLoadStackGuardNode() const {
12886   return Subtarget->isTargetMachO();
12887 }
12888 
12889 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
12890                                                   unsigned &Cost) const {
12891   // If we do not have NEON, vector types are not natively supported.
12892   if (!Subtarget->hasNEON())
12893     return false;
12894 
  // Floating-point values and vector values map to the same register file.
  // Therefore, although we could do a store+extract on a vector type, it is
  // better to leave those values as floating point, since we have more freedom
  // in the addressing modes for them.
12899   if (VectorTy->isFPOrFPVectorTy())
12900     return false;
12901 
12902   // If the index is unknown at compile time, this is very expensive to lower
12903   // and it is not possible to combine the store with the extract.
12904   if (!isa<ConstantInt>(Idx))
12905     return false;
12906 
12907   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
12908   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
12909   // We can do a store + vector extract on any vector that fits perfectly in a D
12910   // or Q register.
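  // E.g., storing one lane of a <4 x i32> ("store (extractelement %v, 1)") can
  // then be selected as a single lane store such as "vst1.32 {d0[1]}, [r0]"
  // instead of moving the lane through a core register first.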
12911   if (BitWidth == 64 || BitWidth == 128) {
12912     Cost = 0;
12913     return true;
12914   }
12915   return false;
12916 }
12917 
12918 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
12919   return Subtarget->hasV6T2Ops();
12920 }
12921 
12922 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
12923   return Subtarget->hasV6T2Ops();
12924 }
12925 
12926 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
12927                                          AtomicOrdering Ord) const {
12928   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12929   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
12930   bool IsAcquire = isAcquireOrStronger(Ord);
12931 
12932   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
12933   // intrinsic must return {i32, i32} and we have to recombine them into a
12934   // single i64 here.
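  // E.g., for an acquire load this builds, roughly:
  //   %lohi = call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
  //   %lo   = zext i32 (extractvalue %lohi, 0) to i64
  //   %hi   = zext i32 (extractvalue %lohi, 1) to i64
  //   %val  = or i64 %lo, (shl i64 %hi, 32)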
12935   if (ValTy->getPrimitiveSizeInBits() == 64) {
12936     Intrinsic::ID Int =
12937         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
12938     Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
12939 
12940     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
12941     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
12942 
12943     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
12944     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
12945     if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
12947     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
12948     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
12949     return Builder.CreateOr(
12950         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
12951   }
12952 
12953   Type *Tys[] = { Addr->getType() };
12954   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
12955   Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
12956 
12957   return Builder.CreateTruncOrBitCast(
12958       Builder.CreateCall(Ldrex, Addr),
12959       cast<PointerType>(Addr->getType())->getElementType());
12960 }
12961 
12962 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
12963     IRBuilder<> &Builder) const {
12964   if (!Subtarget->hasV7Ops())
12965     return;
12966   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12967   Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
12968 }
12969 
12970 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
12971                                                Value *Addr,
12972                                                AtomicOrdering Ord) const {
12973   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
12974   bool IsRelease = isReleaseOrStronger(Ord);
12975 
12976   // Since the intrinsics must have legal type, the i64 intrinsics take two
12977   // parameters: "i32, i32". We must marshal Val into the appropriate form
12978   // before the call.
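  // E.g., for a release store this builds, roughly:
  //   %lo   = trunc i64 %val to i32
  //   %hi   = trunc i64 (lshr i64 %val, 32) to i32
  //   %stat = call i32 @llvm.arm.stlexd(i32 %lo, i32 %hi, i8* %addr)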
12979   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
12980     Intrinsic::ID Int =
12981         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
12982     Function *Strex = Intrinsic::getDeclaration(M, Int);
12983     Type *Int32Ty = Type::getInt32Ty(M->getContext());
12984 
12985     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
12986     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
12987     if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
12989     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
12990     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
12991   }
12992 
12993   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
12994   Type *Tys[] = { Addr->getType() };
12995   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
12996 
12997   return Builder.CreateCall(
12998       Strex, {Builder.CreateZExtOrBitCast(
12999                   Val, Strex->getFunctionType()->getParamType(0)),
13000               Addr});
13001 }
13002 
13003 /// \brief Lower an interleaved load into a vldN intrinsic.
13004 ///
13005 /// E.g. Lower an interleaved load (Factor = 2):
13006 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
13007 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
13008 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
13009 ///
13010 ///      Into:
///        %vld2 = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
///        %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
13014 bool ARMTargetLowering::lowerInterleavedLoad(
13015     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13016     ArrayRef<unsigned> Indices, unsigned Factor) const {
13017   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13018          "Invalid interleave factor");
13019   assert(!Shuffles.empty() && "Empty shufflevector input");
13020   assert(Shuffles.size() == Indices.size() &&
13021          "Unmatched number of shufflevectors and indices");
13022 
13023   VectorType *VecTy = Shuffles[0]->getType();
13024   Type *EltTy = VecTy->getVectorElementType();
13025 
13026   const DataLayout &DL = LI->getModule()->getDataLayout();
13027   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13028   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13029 
  // Skip if we do not have NEON, if the vector type is illegal, or if its
  // elements are i64/f64 (vldN doesn't support i64/f64 elements).
13032   if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
13033     return false;
13034 
  // A pointer vector cannot be the return type of the ldN intrinsics. Load as
  // integer vectors first and then convert them to pointer vectors.
13037   if (EltTy->isPointerTy())
13038     VecTy =
13039         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
13040 
13041   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
13042                                             Intrinsic::arm_neon_vld3,
13043                                             Intrinsic::arm_neon_vld4};
13044 
13045   IRBuilder<> Builder(LI);
13046   SmallVector<Value *, 2> Ops;
13047 
13048   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
13049   Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
13050   Ops.push_back(Builder.getInt32(LI->getAlignment()));
13051 
13052   Type *Tys[] = { VecTy, Int8Ptr };
13053   Function *VldnFunc =
13054       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
13055   CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
13056 
13057   // Replace uses of each shufflevector with the corresponding vector loaded
13058   // by ldN.
13059   for (unsigned i = 0; i < Shuffles.size(); i++) {
13060     ShuffleVectorInst *SV = Shuffles[i];
13061     unsigned Index = Indices[i];
13062 
13063     Value *SubVec = Builder.CreateExtractValue(VldN, Index);
13064 
13065     // Convert the integer vector to pointer vector if the element is pointer.
13066     if (EltTy->isPointerTy())
13067       SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
13068 
13069     SV->replaceAllUsesWith(SubVec);
13070   }
13071 
13072   return true;
13073 }
13074 
13075 /// \brief Get a mask consisting of sequential integers starting from \p Start.
13076 ///
13077 /// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
13078 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
13079                                    unsigned NumElts) {
13080   SmallVector<Constant *, 16> Mask;
13081   for (unsigned i = 0; i < NumElts; i++)
13082     Mask.push_back(Builder.getInt32(Start + i));
13083 
13084   return ConstantVector::get(Mask);
13085 }
13086 
13087 /// \brief Lower an interleaved store into a vstN intrinsic.
13088 ///
13089 /// E.g. Lower an interleaved store (Factor = 3):
13090 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13091 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13092 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
13093 ///
13094 ///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
13098 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
13099 ///
13100 /// Note that the new shufflevectors will be removed and we'll only generate one
13101 /// vst3 instruction in CodeGen.
13102 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
13103                                               ShuffleVectorInst *SVI,
13104                                               unsigned Factor) const {
13105   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13106          "Invalid interleave factor");
13107 
13108   VectorType *VecTy = SVI->getType();
13109   assert(VecTy->getVectorNumElements() % Factor == 0 &&
13110          "Invalid interleaved store");
13111 
13112   unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
13113   Type *EltTy = VecTy->getVectorElementType();
13114   VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
13115 
13116   const DataLayout &DL = SI->getModule()->getDataLayout();
13117   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
13118   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
13119 
  // Skip if we do not have NEON, if the sub-vector type is illegal, or if its
  // elements are i64/f64 (vstN doesn't support i64/f64 elements).
13122   if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
13123       EltIs64Bits)
13124     return false;
13125 
13126   Value *Op0 = SVI->getOperand(0);
13127   Value *Op1 = SVI->getOperand(1);
13128   IRBuilder<> Builder(SI);
13129 
13130   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
13131   // vectors to integer vectors.
13132   if (EltTy->isPointerTy()) {
13133     Type *IntTy = DL.getIntPtrType(EltTy);
13134 
13135     // Convert to the corresponding integer vector.
13136     Type *IntVecTy =
13137         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
13138     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
13139     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
13140 
13141     SubVecTy = VectorType::get(IntTy, NumSubElts);
13142   }
13143 
13144   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
13145                                              Intrinsic::arm_neon_vst3,
13146                                              Intrinsic::arm_neon_vst4};
13147   SmallVector<Value *, 6> Ops;
13148 
13149   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
13150   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
13151 
13152   Type *Tys[] = { Int8Ptr, SubVecTy };
13153   Function *VstNFunc = Intrinsic::getDeclaration(
13154       SI->getModule(), StoreInts[Factor - 2], Tys);
13155 
13156   // Split the shufflevector operands into sub vectors for the new vstN call.
13157   for (unsigned i = 0; i < Factor; i++)
13158     Ops.push_back(Builder.CreateShuffleVector(
13159         Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
13160 
13161   Ops.push_back(Builder.getInt32(SI->getAlignment()));
13162   Builder.CreateCall(VstNFunc, Ops);
13163   return true;
13164 }
13165 
13166 enum HABaseType {
13167   HA_UNKNOWN = 0,
13168   HA_FLOAT,
13169   HA_DOUBLE,
13170   HA_VECT64,
13171   HA_VECT128
13172 };
13173 
13174 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
13175                                    uint64_t &Members) {
13176   if (auto *ST = dyn_cast<StructType>(Ty)) {
13177     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
13178       uint64_t SubMembers = 0;
13179       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
13180         return false;
13181       Members += SubMembers;
13182     }
13183   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
13184     uint64_t SubMembers = 0;
13185     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
13186       return false;
13187     Members += SubMembers * AT->getNumElements();
13188   } else if (Ty->isFloatTy()) {
13189     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
13190       return false;
13191     Members = 1;
13192     Base = HA_FLOAT;
13193   } else if (Ty->isDoubleTy()) {
13194     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
13195       return false;
13196     Members = 1;
13197     Base = HA_DOUBLE;
13198   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
13199     Members = 1;
13200     switch (Base) {
13201     case HA_FLOAT:
13202     case HA_DOUBLE:
13203       return false;
13204     case HA_VECT64:
13205       return VT->getBitWidth() == 64;
13206     case HA_VECT128:
13207       return VT->getBitWidth() == 128;
13208     case HA_UNKNOWN:
13209       switch (VT->getBitWidth()) {
13210       case 64:
13211         Base = HA_VECT64;
13212         return true;
13213       case 128:
13214         Base = HA_VECT128;
13215         return true;
13216       default:
13217         return false;
13218       }
13219     }
13220   }
13221 
13222   return (Members > 0 && Members <= 4);
13223 }
13224 
13225 /// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
13226 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
13227 /// passing according to AAPCS rules.
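/// E.g., "struct S { float x, y, z; }" is a homogeneous aggregate of three
/// floats, whereas "struct T { float f; double d; }" is not, because the base
/// types differ.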
13228 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
13229     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
13230   if (getEffectiveCallingConv(CallConv, isVarArg) !=
13231       CallingConv::ARM_AAPCS_VFP)
13232     return false;
13233 
13234   HABaseType Base = HA_UNKNOWN;
13235   uint64_t Members = 0;
13236   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
13237   DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
13238 
13239   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
13240   return IsHA || IsIntArray;
13241 }
13242 
13243 unsigned ARMTargetLowering::getExceptionPointerRegister(
13244     const Constant *PersonalityFn) const {
13245   // Platforms which do not use SjLj EH may return values in these registers
13246   // via the personality function.
13247   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
13248 }
13249 
13250 unsigned ARMTargetLowering::getExceptionSelectorRegister(
13251     const Constant *PersonalityFn) const {
13252   // Platforms which do not use SjLj EH may return values in these registers
13253   // via the personality function.
13254   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
13255 }
13256 
13257 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
13258   // Update IsSplitCSR in ARMFunctionInfo.
13259   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
13260   AFI->setIsSplitCSR(true);
13261 }
13262 
13263 void ARMTargetLowering::insertCopiesSplitCSR(
13264     MachineBasicBlock *Entry,
13265     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
13266   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
13267   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
13268   if (!IStart)
13269     return;
13270 
13271   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
13272   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
13273   MachineBasicBlock::iterator MBBI = Entry->begin();
13274   for (const MCPhysReg *I = IStart; *I; ++I) {
13275     const TargetRegisterClass *RC = nullptr;
13276     if (ARM::GPRRegClass.contains(*I))
13277       RC = &ARM::GPRRegClass;
13278     else if (ARM::DPRRegClass.contains(*I))
13279       RC = &ARM::DPRRegClass;
13280     else
13281       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
13282 
13283     unsigned NewVR = MRI->createVirtualRegister(RC);
13284     // Create copy from CSR to a virtual register.
13285     // FIXME: this currently does not emit CFI pseudo-instructions, it works
13286     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
13287     // nounwind. If we want to generalize this later, we may need to emit
13288     // CFI pseudo-instructions.
13289     assert(Entry->getParent()->getFunction()->hasFnAttribute(
13290                Attribute::NoUnwind) &&
13291            "Function should be nounwind in insertCopiesSplitCSR!");
13292     Entry->addLiveIn(*I);
13293     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
13294         .addReg(*I);
13295 
13296     // Insert the copy-back instructions right before the terminator.
13297     for (auto *Exit : Exits)
13298       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
13299               TII->get(TargetOpcode::COPY), *I)
13300           .addReg(NewVR);
13301   }
13302 }
13303