1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that ARM uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "ARMISelLowering.h"
16 #include "ARMBaseInstrInfo.h"
17 #include "ARMBaseRegisterInfo.h"
18 #include "ARMCallingConv.h"
19 #include "ARMConstantPoolValue.h"
20 #include "ARMMachineFunctionInfo.h"
21 #include "ARMPerfectShuffle.h"
22 #include "ARMRegisterInfo.h"
23 #include "ARMSelectionDAGInfo.h"
24 #include "ARMSubtarget.h"
25 #include "MCTargetDesc/ARMAddressingModes.h"
26 #include "MCTargetDesc/ARMBaseInfo.h"
27 #include "Utils/ARMBaseInfo.h"
28 #include "llvm/ADT/APFloat.h"
29 #include "llvm/ADT/APInt.h"
30 #include "llvm/ADT/ArrayRef.h"
31 #include "llvm/ADT/BitVector.h"
32 #include "llvm/ADT/DenseMap.h"
33 #include "llvm/ADT/STLExtras.h"
34 #include "llvm/ADT/SmallPtrSet.h"
35 #include "llvm/ADT/SmallVector.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/ADT/StringExtras.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/ADT/StringSwitch.h"
40 #include "llvm/ADT/Triple.h"
41 #include "llvm/ADT/Twine.h"
42 #include "llvm/Analysis/VectorUtils.h"
43 #include "llvm/CodeGen/CallingConvLower.h"
44 #include "llvm/CodeGen/ISDOpcodes.h"
45 #include "llvm/CodeGen/IntrinsicLowering.h"
46 #include "llvm/CodeGen/MachineBasicBlock.h"
47 #include "llvm/CodeGen/MachineConstantPool.h"
48 #include "llvm/CodeGen/MachineFrameInfo.h"
49 #include "llvm/CodeGen/MachineFunction.h"
50 #include "llvm/CodeGen/MachineInstr.h"
51 #include "llvm/CodeGen/MachineInstrBuilder.h"
52 #include "llvm/CodeGen/MachineJumpTableInfo.h"
53 #include "llvm/CodeGen/MachineMemOperand.h"
54 #include "llvm/CodeGen/MachineOperand.h"
55 #include "llvm/CodeGen/MachineRegisterInfo.h"
56 #include "llvm/CodeGen/RuntimeLibcalls.h"
57 #include "llvm/CodeGen/SelectionDAG.h"
58 #include "llvm/CodeGen/SelectionDAGNodes.h"
59 #include "llvm/CodeGen/TargetInstrInfo.h"
60 #include "llvm/CodeGen/TargetLowering.h"
61 #include "llvm/CodeGen/TargetOpcodes.h"
62 #include "llvm/CodeGen/TargetRegisterInfo.h"
63 #include "llvm/CodeGen/TargetSubtargetInfo.h"
64 #include "llvm/CodeGen/ValueTypes.h"
65 #include "llvm/IR/Attributes.h"
66 #include "llvm/IR/CallingConv.h"
67 #include "llvm/IR/Constant.h"
68 #include "llvm/IR/Constants.h"
69 #include "llvm/IR/DataLayout.h"
70 #include "llvm/IR/DebugLoc.h"
71 #include "llvm/IR/DerivedTypes.h"
72 #include "llvm/IR/Function.h"
73 #include "llvm/IR/GlobalAlias.h"
74 #include "llvm/IR/GlobalValue.h"
75 #include "llvm/IR/GlobalVariable.h"
76 #include "llvm/IR/IRBuilder.h"
77 #include "llvm/IR/InlineAsm.h"
78 #include "llvm/IR/Instruction.h"
79 #include "llvm/IR/Instructions.h"
80 #include "llvm/IR/IntrinsicInst.h"
81 #include "llvm/IR/Intrinsics.h"
82 #include "llvm/IR/Module.h"
83 #include "llvm/IR/Type.h"
84 #include "llvm/IR/User.h"
85 #include "llvm/IR/Value.h"
86 #include "llvm/MC/MCInstrDesc.h"
87 #include "llvm/MC/MCInstrItineraries.h"
88 #include "llvm/MC/MCRegisterInfo.h"
89 #include "llvm/MC/MCSchedule.h"
90 #include "llvm/Support/AtomicOrdering.h"
91 #include "llvm/Support/BranchProbability.h"
92 #include "llvm/Support/Casting.h"
93 #include "llvm/Support/CodeGen.h"
94 #include "llvm/Support/CommandLine.h"
95 #include "llvm/Support/Compiler.h"
96 #include "llvm/Support/Debug.h"
97 #include "llvm/Support/ErrorHandling.h"
98 #include "llvm/Support/KnownBits.h"
99 #include "llvm/Support/MachineValueType.h"
100 #include "llvm/Support/MathExtras.h"
101 #include "llvm/Support/raw_ostream.h"
102 #include "llvm/Target/TargetMachine.h"
103 #include "llvm/Target/TargetOptions.h"
104 #include <algorithm>
105 #include <cassert>
106 #include <cstdint>
107 #include <cstdlib>
108 #include <iterator>
109 #include <limits>
110 #include <string>
111 #include <tuple>
112 #include <utility>
113 #include <vector>
114 
using namespace llvm;

// Debug type tag used by LLVM_DEBUG/STATISTIC output for this pass.
#define DEBUG_TYPE "arm-isel"

// Pass statistics (printed with -stats).
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
  "Number of constants with their storage promoted into constant pools");

// Debug-only escape hatch: allows disabling ARM/Thumb interworking support.
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
  cl::desc("Enable / disable ARM interworking (for debugging only)"),
  cl::init(true));

// Master switch for promoting unnamed_addr global constants directly into
// the constant pool (off by default until PR32780 is resolved).
static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
// Per-constant size cap (in bytes) for the promotion above.
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
// Aggregate size cap (in bytes) across all promoted constants.
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

// The APCS parameter registers.
// The four core registers used to pass the first integer arguments (r0-r3).
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
148 
149 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
150                                        MVT PromotedBitwiseVT) {
151   if (VT != PromotedLdStVT) {
152     setOperationAction(ISD::LOAD, VT, Promote);
153     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
154 
155     setOperationAction(ISD::STORE, VT, Promote);
156     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
157   }
158 
159   MVT ElemTy = VT.getVectorElementType();
160   if (ElemTy != MVT::f64)
161     setOperationAction(ISD::SETCC, VT, Custom);
162   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
163   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
164   if (ElemTy == MVT::i32) {
165     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
166     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
167     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
168     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
169   } else {
170     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
171     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
172     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
173     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
174   }
175   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
176   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
177   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
178   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
179   setOperationAction(ISD::SELECT,            VT, Expand);
180   setOperationAction(ISD::SELECT_CC,         VT, Expand);
181   setOperationAction(ISD::VSELECT,           VT, Expand);
182   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
183   if (VT.isInteger()) {
184     setOperationAction(ISD::SHL, VT, Custom);
185     setOperationAction(ISD::SRA, VT, Custom);
186     setOperationAction(ISD::SRL, VT, Custom);
187   }
188 
189   // Promote all bit-wise operations.
190   if (VT.isInteger() && VT != PromotedBitwiseVT) {
191     setOperationAction(ISD::AND, VT, Promote);
192     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
193     setOperationAction(ISD::OR,  VT, Promote);
194     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
195     setOperationAction(ISD::XOR, VT, Promote);
196     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
197   }
198 
199   // Neon does not support vector divide/remainder operations.
200   setOperationAction(ISD::SDIV, VT, Expand);
201   setOperationAction(ISD::UDIV, VT, Expand);
202   setOperationAction(ISD::FDIV, VT, Expand);
203   setOperationAction(ISD::SREM, VT, Expand);
204   setOperationAction(ISD::UREM, VT, Expand);
205   setOperationAction(ISD::FREM, VT, Expand);
206 
207   if (!VT.isFloatingPoint() &&
208       VT != MVT::v2i64 && VT != MVT::v1i64)
209     for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
210       setOperationAction(Opcode, VT, Legal);
211 }
212 
/// Make the 64-bit NEON vector type \p VT legal: assign it to the DPR
/// register class and set up its operation actions.
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  // Unhandled loads/stores are promoted to f64; integer bit-wise ops to
  // v2i32 (both 64-bit, matching a D register).
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}
217 
/// Make the 128-bit NEON vector type \p VT legal: assign it to the
/// D-register-pair class and set up its operation actions.
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  // Unhandled loads/stores are promoted to v2f64; integer bit-wise ops to
  // v4i32 (both 128-bit, matching a D-register pair).
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
222 
223 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
224                                      const ARMSubtarget &STI)
225     : TargetLowering(TM), Subtarget(&STI) {
226   RegInfo = Subtarget->getRegisterInfo();
227   Itins = Subtarget->getInstrItineraryData();
228 
229   setBooleanContents(ZeroOrOneBooleanContent);
230   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
231 
232   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
233       !Subtarget->isTargetWatchOS()) {
234     bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
235     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
236       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
237                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
238                                        : CallingConv::ARM_AAPCS);
239   }
240 
241   if (Subtarget->isTargetMachO()) {
242     // Uses VFP for Thumb libfuncs if available.
243     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
244         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
245       static const struct {
246         const RTLIB::Libcall Op;
247         const char * const Name;
248         const ISD::CondCode Cond;
249       } LibraryCalls[] = {
250         // Single-precision floating-point arithmetic.
251         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
252         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
253         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
254         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
255 
256         // Double-precision floating-point arithmetic.
257         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
258         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
259         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
260         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
261 
262         // Single-precision comparisons.
263         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
264         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
265         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
266         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
267         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
268         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
269         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
270         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
271 
272         // Double-precision comparisons.
273         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
274         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
275         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
276         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
277         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
278         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
279         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
280         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
281 
282         // Floating-point to integer conversions.
283         // i64 conversions are done via library routines even when generating VFP
284         // instructions, so use the same ones.
285         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
286         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
287         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
288         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
289 
290         // Conversions between floating types.
291         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
292         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
293 
294         // Integer to floating-point conversions.
295         // i64 conversions are done via library routines even when generating VFP
296         // instructions, so use the same ones.
297         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
298         // e.g., __floatunsidf vs. __floatunssidfvfp.
299         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
300         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
301         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
302         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
303       };
304 
305       for (const auto &LC : LibraryCalls) {
306         setLibcallName(LC.Op, LC.Name);
307         if (LC.Cond != ISD::SETCC_INVALID)
308           setCmpLibcallCC(LC.Op, LC.Cond);
309       }
310     }
311   }
312 
313   // These libcalls are not available in 32-bit.
314   setLibcallName(RTLIB::SHL_I128, nullptr);
315   setLibcallName(RTLIB::SRL_I128, nullptr);
316   setLibcallName(RTLIB::SRA_I128, nullptr);
317 
318   // RTLIB
319   if (Subtarget->isAAPCS_ABI() &&
320       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
321        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
322     static const struct {
323       const RTLIB::Libcall Op;
324       const char * const Name;
325       const CallingConv::ID CC;
326       const ISD::CondCode Cond;
327     } LibraryCalls[] = {
328       // Double-precision floating-point arithmetic helper functions
329       // RTABI chapter 4.1.2, Table 2
330       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
331       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
332       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
333       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
334 
335       // Double-precision floating-point comparison helper functions
336       // RTABI chapter 4.1.2, Table 3
337       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
338       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
339       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
340       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
341       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
342       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
343       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
344       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
345 
346       // Single-precision floating-point arithmetic helper functions
347       // RTABI chapter 4.1.2, Table 4
348       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
349       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
350       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
351       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
352 
353       // Single-precision floating-point comparison helper functions
354       // RTABI chapter 4.1.2, Table 5
355       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
356       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
357       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
358       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
359       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
360       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
361       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
362       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
363 
364       // Floating-point to integer conversions.
365       // RTABI chapter 4.1.2, Table 6
366       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
367       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
368       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
369       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
370       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
371       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
372       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
373       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
374 
375       // Conversions between floating types.
376       // RTABI chapter 4.1.2, Table 7
377       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
378       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
379       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
380 
381       // Integer to floating-point conversions.
382       // RTABI chapter 4.1.2, Table 8
383       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
384       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
385       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
386       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
387       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
388       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
389       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
390       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
391 
392       // Long long helper functions
393       // RTABI chapter 4.2, Table 9
394       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
395       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
396       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
397       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
398 
399       // Integer division functions
400       // RTABI chapter 4.3.1
401       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
402       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
403       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
404       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
405       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
406       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
407       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
408       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
409     };
410 
411     for (const auto &LC : LibraryCalls) {
412       setLibcallName(LC.Op, LC.Name);
413       setLibcallCallingConv(LC.Op, LC.CC);
414       if (LC.Cond != ISD::SETCC_INVALID)
415         setCmpLibcallCC(LC.Op, LC.Cond);
416     }
417 
418     // EABI dependent RTLIB
419     if (TM.Options.EABIVersion == EABI::EABI4 ||
420         TM.Options.EABIVersion == EABI::EABI5) {
421       static const struct {
422         const RTLIB::Libcall Op;
423         const char *const Name;
424         const CallingConv::ID CC;
425         const ISD::CondCode Cond;
426       } MemOpsLibraryCalls[] = {
427         // Memory operations
428         // RTABI chapter 4.3.4
429         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
430         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
431         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
432       };
433 
434       for (const auto &LC : MemOpsLibraryCalls) {
435         setLibcallName(LC.Op, LC.Name);
436         setLibcallCallingConv(LC.Op, LC.CC);
437         if (LC.Cond != ISD::SETCC_INVALID)
438           setCmpLibcallCC(LC.Op, LC.Cond);
439       }
440     }
441   }
442 
443   if (Subtarget->isTargetWindows()) {
444     static const struct {
445       const RTLIB::Libcall Op;
446       const char * const Name;
447       const CallingConv::ID CC;
448     } LibraryCalls[] = {
449       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
450       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
451       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
452       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
453       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
454       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
455       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
456       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
457     };
458 
459     for (const auto &LC : LibraryCalls) {
460       setLibcallName(LC.Op, LC.Name);
461       setLibcallCallingConv(LC.Op, LC.CC);
462     }
463   }
464 
465   // Use divmod compiler-rt calls for iOS 5.0 and later.
466   if (Subtarget->isTargetMachO() &&
467       !(Subtarget->isTargetIOS() &&
468         Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
469     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
470     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
471   }
472 
473   // The half <-> float conversion functions are always soft-float on
474   // non-watchos platforms, but are needed for some targets which use a
475   // hard-float calling convention by default.
476   if (!Subtarget->isTargetWatchABI()) {
477     if (Subtarget->isAAPCS_ABI()) {
478       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
479       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
480       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
481     } else {
482       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
483       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
484       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
485     }
486   }
487 
488   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
489   // a __gnu_ prefix (which is the default).
490   if (Subtarget->isTargetAEABI()) {
491     static const struct {
492       const RTLIB::Libcall Op;
493       const char * const Name;
494       const CallingConv::ID CC;
495     } LibraryCalls[] = {
496       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
497       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
498       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
499     };
500 
501     for (const auto &LC : LibraryCalls) {
502       setLibcallName(LC.Op, LC.Name);
503       setLibcallCallingConv(LC.Op, LC.CC);
504     }
505   }
506 
507   if (Subtarget->isThumb1Only())
508     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
509   else
510     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
511 
512   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
513       !Subtarget->isThumb1Only()) {
514     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
515     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
516   }
517 
518   if (Subtarget->hasFullFP16()) {
519     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
520     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
521     setOperationAction(ISD::BITCAST, MVT::i32, Custom);
522     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
523 
524     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
525     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
526   }
527 
528   for (MVT VT : MVT::vector_valuetypes()) {
529     for (MVT InnerVT : MVT::vector_valuetypes()) {
530       setTruncStoreAction(VT, InnerVT, Expand);
531       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
532       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
533       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
534     }
535 
536     setOperationAction(ISD::MULHS, VT, Expand);
537     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
538     setOperationAction(ISD::MULHU, VT, Expand);
539     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
540 
541     setOperationAction(ISD::BSWAP, VT, Expand);
542   }
543 
544   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
545   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
546 
547   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
548   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
549 
550   if (Subtarget->hasNEON()) {
551     addDRTypeForNEON(MVT::v2f32);
552     addDRTypeForNEON(MVT::v8i8);
553     addDRTypeForNEON(MVT::v4i16);
554     addDRTypeForNEON(MVT::v2i32);
555     addDRTypeForNEON(MVT::v1i64);
556 
557     addQRTypeForNEON(MVT::v4f32);
558     addQRTypeForNEON(MVT::v2f64);
559     addQRTypeForNEON(MVT::v16i8);
560     addQRTypeForNEON(MVT::v8i16);
561     addQRTypeForNEON(MVT::v4i32);
562     addQRTypeForNEON(MVT::v2i64);
563 
564     if (Subtarget->hasFullFP16()) {
565       addQRTypeForNEON(MVT::v8f16);
566       addDRTypeForNEON(MVT::v4f16);
567     }
568 
569     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
570     // neither Neon nor VFP support any arithmetic operations on it.
571     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
572     // supported for v4f32.
573     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
574     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
575     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
576     // FIXME: Code duplication: FDIV and FREM are expanded always, see
577     // ARMTargetLowering::addTypeForNEON method for details.
578     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
579     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
580     // FIXME: Create unittest.
581     // In another words, find a way when "copysign" appears in DAG with vector
582     // operands.
583     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
584     // FIXME: Code duplication: SETCC has custom operation action, see
585     // ARMTargetLowering::addTypeForNEON method for details.
586     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
587     // FIXME: Create unittest for FNEG and for FABS.
588     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
589     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
590     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
591     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
592     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
593     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
594     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
595     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
596     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
597     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
598     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
599     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
600     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
601     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
602     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
603     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
604     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
605     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
606 
607     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
608     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
609     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
610     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
611     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
612     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
613     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
614     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
615     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
616     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
617     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
618     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
619     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
620     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
621 
622     // Mark v2f32 intrinsics.
623     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
624     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
625     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
626     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
627     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
628     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
629     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
630     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
631     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
632     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
633     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
634     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
635     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
636     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
637 
638     // Neon does not support some operations on v1i64 and v2i64 types.
639     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
640     // Custom handling for some quad-vector types to detect VMULL.
641     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
642     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
643     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
644     // Custom handling for some vector types to avoid expensive expansions
645     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
646     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
647     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
648     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
649     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
650     // a destination type that is wider than the source, and nor does
651     // it have a FP_TO_[SU]INT instruction with a narrower destination than
652     // source.
653     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
654     setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
655     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
656     setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
657     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
658     setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
659     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
660     setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
661 
662     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
663     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
664 
665     // NEON does not have single instruction CTPOP for vectors with element
666     // types wider than 8-bits.  However, custom lowering can leverage the
667     // v8i8/v16i8 vcnt instruction.
668     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
669     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
670     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
671     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
672     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
673     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
674 
675     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
676     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
677 
678     // NEON does not have single instruction CTTZ for vectors.
679     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
680     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
681     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
682     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
683 
684     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
685     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
686     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
687     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
688 
689     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
690     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
691     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
692     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
693 
694     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
695     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
696     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
697     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
698 
699     // NEON only has FMA instructions as of VFP4.
700     if (!Subtarget->hasVFP4()) {
701       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
702       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
703     }
704 
705     setTargetDAGCombine(ISD::INTRINSIC_VOID);
706     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
707     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
708     setTargetDAGCombine(ISD::SHL);
709     setTargetDAGCombine(ISD::SRL);
710     setTargetDAGCombine(ISD::SRA);
711     setTargetDAGCombine(ISD::SIGN_EXTEND);
712     setTargetDAGCombine(ISD::ZERO_EXTEND);
713     setTargetDAGCombine(ISD::ANY_EXTEND);
714     setTargetDAGCombine(ISD::BUILD_VECTOR);
715     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
716     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
717     setTargetDAGCombine(ISD::STORE);
718     setTargetDAGCombine(ISD::FP_TO_SINT);
719     setTargetDAGCombine(ISD::FP_TO_UINT);
720     setTargetDAGCombine(ISD::FDIV);
721     setTargetDAGCombine(ISD::LOAD);
722 
723     // It is legal to extload from v4i8 to v4i16 or v4i32.
724     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
725                    MVT::v2i32}) {
726       for (MVT VT : MVT::integer_vector_valuetypes()) {
727         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
728         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
729         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
730       }
731     }
732   }
733 
734   if (Subtarget->isFPOnlySP()) {
735     // When targeting a floating-point unit with only single-precision
736     // operations, f64 is legal for the few double-precision instructions which
    // are present. However, no double-precision operations other than moves,
738     // loads and stores are provided by the hardware.
739     setOperationAction(ISD::FADD,       MVT::f64, Expand);
740     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
741     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
742     setOperationAction(ISD::FMA,        MVT::f64, Expand);
743     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
744     setOperationAction(ISD::FREM,       MVT::f64, Expand);
745     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
746     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
747     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
748     setOperationAction(ISD::FABS,       MVT::f64, Expand);
749     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
750     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
751     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
752     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
753     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
754     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
755     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
756     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
757     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
758     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
759     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
760     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
761     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
762     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
763     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
764     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
765     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
766     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
767     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
768     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
769     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
770     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
771   }
772 
773   computeRegisterProperties(Subtarget->getRegisterInfo());
774 
775   // ARM does not have floating-point extending loads.
776   for (MVT VT : MVT::fp_valuetypes()) {
777     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
778     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
779   }
780 
781   // ... or truncating stores
782   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
783   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
784   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
785 
786   // ARM does not have i1 sign extending load.
787   for (MVT VT : MVT::integer_valuetypes())
788     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
789 
790   // ARM supports all 4 flavors of integer indexed load / store.
791   if (!Subtarget->isThumb1Only()) {
792     for (unsigned im = (unsigned)ISD::PRE_INC;
793          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
794       setIndexedLoadAction(im,  MVT::i1,  Legal);
795       setIndexedLoadAction(im,  MVT::i8,  Legal);
796       setIndexedLoadAction(im,  MVT::i16, Legal);
797       setIndexedLoadAction(im,  MVT::i32, Legal);
798       setIndexedStoreAction(im, MVT::i1,  Legal);
799       setIndexedStoreAction(im, MVT::i8,  Legal);
800       setIndexedStoreAction(im, MVT::i16, Legal);
801       setIndexedStoreAction(im, MVT::i32, Legal);
802     }
803   } else {
804     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
805     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
806     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
807   }
808 
809   setOperationAction(ISD::SADDO, MVT::i32, Custom);
810   setOperationAction(ISD::UADDO, MVT::i32, Custom);
811   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
812   setOperationAction(ISD::USUBO, MVT::i32, Custom);
813 
814   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
815   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
816 
817   // i64 operation support.
818   setOperationAction(ISD::MUL,     MVT::i64, Expand);
819   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
820   if (Subtarget->isThumb1Only()) {
821     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
822     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
823   }
824   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
825       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
826     setOperationAction(ISD::MULHS, MVT::i32, Expand);
827 
828   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
829   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
830   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
831   setOperationAction(ISD::SRL,       MVT::i64, Custom);
832   setOperationAction(ISD::SRA,       MVT::i64, Custom);
833   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
834 
835   // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
836   if (Subtarget->isThumb1Only()) {
837     setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
838     setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
839     setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
840   }
841 
842   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
843     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
844 
845   // ARM does not have ROTL.
846   setOperationAction(ISD::ROTL, MVT::i32, Expand);
847   for (MVT VT : MVT::vector_valuetypes()) {
848     setOperationAction(ISD::ROTL, VT, Expand);
849     setOperationAction(ISD::ROTR, VT, Expand);
850   }
851   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
852   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
853   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
854     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
855 
856   // @llvm.readcyclecounter requires the Performance Monitors extension.
857   // Default to the 0 expansion on unsupported platforms.
858   // FIXME: Technically there are older ARM CPUs that have
859   // implementation-specific ways of obtaining this information.
860   if (Subtarget->hasPerfMon())
861     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
862 
863   // Only ARMv6 has BSWAP.
864   if (!Subtarget->hasV6Ops())
865     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
866 
867   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
868                                         : Subtarget->hasDivideInARMMode();
869   if (!hasDivide) {
870     // These are expanded into libcalls if the cpu doesn't have HW divider.
871     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
872     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
873   }
874 
875   if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
876     setOperationAction(ISD::SDIV, MVT::i32, Custom);
877     setOperationAction(ISD::UDIV, MVT::i32, Custom);
878 
879     setOperationAction(ISD::SDIV, MVT::i64, Custom);
880     setOperationAction(ISD::UDIV, MVT::i64, Custom);
881   }
882 
883   setOperationAction(ISD::SREM,  MVT::i32, Expand);
884   setOperationAction(ISD::UREM,  MVT::i32, Expand);
885 
886   // Register based DivRem for AEABI (RTABI 4.2)
887   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
888       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
889       Subtarget->isTargetWindows()) {
890     setOperationAction(ISD::SREM, MVT::i64, Custom);
891     setOperationAction(ISD::UREM, MVT::i64, Custom);
892     HasStandaloneRem = false;
893 
894     if (Subtarget->isTargetWindows()) {
895       const struct {
896         const RTLIB::Libcall Op;
897         const char * const Name;
898         const CallingConv::ID CC;
899       } LibraryCalls[] = {
900         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
901         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
902         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
903         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
904 
905         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
906         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
907         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
908         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
909       };
910 
911       for (const auto &LC : LibraryCalls) {
912         setLibcallName(LC.Op, LC.Name);
913         setLibcallCallingConv(LC.Op, LC.CC);
914       }
915     } else {
916       const struct {
917         const RTLIB::Libcall Op;
918         const char * const Name;
919         const CallingConv::ID CC;
920       } LibraryCalls[] = {
921         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
922         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
923         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
924         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
925 
926         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
927         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
928         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
929         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
930       };
931 
932       for (const auto &LC : LibraryCalls) {
933         setLibcallName(LC.Op, LC.Name);
934         setLibcallCallingConv(LC.Op, LC.CC);
935       }
936     }
937 
938     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
939     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
940     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
941     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
942   } else {
943     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
944     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
945   }
946 
947   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
948     for (auto &VT : {MVT::f32, MVT::f64})
949       setOperationAction(ISD::FPOWI, VT, Custom);
950 
951   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
952   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
953   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
954   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
955 
956   setOperationAction(ISD::TRAP, MVT::Other, Legal);
957 
958   // Use the default implementation.
959   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
960   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
961   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
962   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
963   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
964   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
965 
966   if (Subtarget->isTargetWindows())
967     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
968   else
969     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
970 
971   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
972   // the default expansion.
973   InsertFencesForAtomic = false;
974   if (Subtarget->hasAnyDataBarrier() &&
975       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
976     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
977     // to ldrex/strex loops already.
978     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
979     if (!Subtarget->isThumb() || !Subtarget->isMClass())
980       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
981 
982     // On v8, we have particularly efficient implementations of atomic fences
983     // if they can be combined with nearby atomic loads and stores.
984     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
985       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
986       InsertFencesForAtomic = true;
987     }
988   } else {
989     // If there's anything we can use as a barrier, go through custom lowering
990     // for ATOMIC_FENCE.
991     // If target has DMB in thumb, Fences can be inserted.
992     if (Subtarget->hasDataBarrier())
993       InsertFencesForAtomic = true;
994 
995     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
996                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
997 
998     // Set them all for expansion, which will force libcalls.
999     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1000     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1001     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1002     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1003     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1004     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1005     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1006     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1007     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1008     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1009     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1010     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1011     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1012     // Unordered/Monotonic case.
1013     if (!InsertFencesForAtomic) {
1014       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1015       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1016     }
1017   }
1018 
1019   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1020 
1021   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1022   if (!Subtarget->hasV6Ops()) {
1023     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1024     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1025   }
1026   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1027 
1028   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1029       !Subtarget->isThumb1Only()) {
1030     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1031     // iff target supports vfp2.
1032     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1033     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1034   }
1035 
1036   // We want to custom lower some of our intrinsics.
1037   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1038   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1039   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1040   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1041   if (Subtarget->useSjLjEH())
1042     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1043 
1044   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1045   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1046   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1047   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1048   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1049   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1050   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1051   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1052   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1053   if (Subtarget->hasFullFP16()) {
1054     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
1055     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
1056     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1057   }
1058 
1059   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1060 
1061   setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
1062   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1063   if (Subtarget->hasFullFP16())
1064       setOperationAction(ISD::BR_CC, MVT::f16,   Custom);
1065   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1066   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1067   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1068 
1069   // We don't support sin/cos/fmod/copysign/pow
1070   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1071   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1072   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1073   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1074   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1075   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1076   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1077   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1078   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
1079       !Subtarget->isThumb1Only()) {
1080     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1081     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1082   }
1083   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1084   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1085 
1086   if (!Subtarget->hasVFP4()) {
1087     setOperationAction(ISD::FMA, MVT::f64, Expand);
1088     setOperationAction(ISD::FMA, MVT::f32, Expand);
1089   }
1090 
1091   // Various VFP goodness
1092   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1093     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1094     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
1095       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1096       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1097     }
1098 
1099     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1100     if (!Subtarget->hasFP16()) {
1101       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1102       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1103     }
1104   }
1105 
1106   // Use __sincos_stret if available.
1107   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1108       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1109     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1110     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1111   }
1112 
1113   // FP-ARMv8 implements a lot of rounding-like FP operations.
1114   if (Subtarget->hasFPARMv8()) {
1115     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1116     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1117     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1118     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1119     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1120     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1121     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1122     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1123     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1124     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1125     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1126     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1127 
1128     if (!Subtarget->isFPOnlySP()) {
1129       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1130       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1131       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1132       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1133       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1134       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1135       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1136       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1137     }
1138   }
1139 
1140   if (Subtarget->hasNEON()) {
1141     // vmin and vmax aren't available in a scalar form, so we use
1142     // a NEON instruction with an undef lane instead.
1143     setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
1144     setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
1145     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
1146     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
1147     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
1148     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
1149     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
1150     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
1151 
1152     if (Subtarget->hasFullFP16()) {
1153       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1154       setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1155       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1156       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1157 
1158       setOperationAction(ISD::FMINNAN, MVT::v4f16, Legal);
1159       setOperationAction(ISD::FMAXNAN, MVT::v4f16, Legal);
1160       setOperationAction(ISD::FMINNAN, MVT::v8f16, Legal);
1161       setOperationAction(ISD::FMAXNAN, MVT::v8f16, Legal);
1162     }
1163   }
1164 
1165   // We have target-specific dag combine patterns for the following nodes:
1166   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1167   setTargetDAGCombine(ISD::ADD);
1168   setTargetDAGCombine(ISD::SUB);
1169   setTargetDAGCombine(ISD::MUL);
1170   setTargetDAGCombine(ISD::AND);
1171   setTargetDAGCombine(ISD::OR);
1172   setTargetDAGCombine(ISD::XOR);
1173 
1174   if (Subtarget->hasV6Ops())
1175     setTargetDAGCombine(ISD::SRL);
1176 
1177   setStackPointerRegisterToSaveRestore(ARM::SP);
1178 
1179   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1180       !Subtarget->hasVFP2())
1181     setSchedulingPreference(Sched::RegPressure);
1182   else
1183     setSchedulingPreference(Sched::Hybrid);
1184 
1185   //// temporary - rewrite interface to use type
1186   MaxStoresPerMemset = 8;
1187   MaxStoresPerMemsetOptSize = 4;
1188   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1189   MaxStoresPerMemcpyOptSize = 2;
1190   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1191   MaxStoresPerMemmoveOptSize = 2;
1192 
1193   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1194   // are at least 4 bytes aligned.
1195   setMinStackArgumentAlignment(4);
1196 
1197   // Prefer likely predicted branches to selects on out-of-order cores.
1198   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1199 
1200   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1201 }
1202 
/// Returns true if the target uses the soft-float ABI/implementation.
/// Simply forwards to the subtarget, which owns this setting.
bool ARMTargetLowering::useSoftFloat() const {
  return Subtarget->useSoftFloat();
}
1206 
1207 // FIXME: It might make sense to define the representative register class as the
1208 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1210 // SPR's representative would be DPR_VFP2. This should work well if register
1211 // pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
1213 // classes' representatives transitively. We have not implemented this because
1214 // of the difficulty prior to coalescing of modeling operand register classes
1215 // due to the common occurrence of cross class copies and subregister insertions
1216 // and extractions.
1217 std::pair<const TargetRegisterClass *, uint8_t>
1218 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1219                                            MVT VT) const {
1220   const TargetRegisterClass *RRC = nullptr;
1221   uint8_t Cost = 1;
1222   switch (VT.SimpleTy) {
1223   default:
1224     return TargetLowering::findRepresentativeClass(TRI, VT);
1225   // Use DPR as representative register class for all floating point
1226   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1227   // the cost is 1 for both f32 and f64.
1228   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1229   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1230     RRC = &ARM::DPRRegClass;
1231     // When NEON is used for SP, only half of the register file is available
1232     // because operations that define both SP and DP results will be constrained
1233     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1234     // coalescing by double-counting the SP regs. See the FIXME above.
1235     if (Subtarget->useNEONForSinglePrecisionFP())
1236       Cost = 2;
1237     break;
1238   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1239   case MVT::v4f32: case MVT::v2f64:
1240     RRC = &ARM::DPRRegClass;
1241     Cost = 2;
1242     break;
1243   case MVT::v4i64:
1244     RRC = &ARM::DPRRegClass;
1245     Cost = 4;
1246     break;
1247   case MVT::v8i64:
1248     RRC = &ARM::DPRRegClass;
1249     Cost = 8;
1250     break;
1251   }
1252   return std::make_pair(RRC, Cost);
1253 }
1254 
1255 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1256   switch ((ARMISD::NodeType)Opcode) {
1257   case ARMISD::FIRST_NUMBER:  break;
1258   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1259   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1260   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1261   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1262   case ARMISD::CALL:          return "ARMISD::CALL";
1263   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1264   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1265   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1266   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1267   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1268   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1269   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1270   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1271   case ARMISD::CMP:           return "ARMISD::CMP";
1272   case ARMISD::CMN:           return "ARMISD::CMN";
1273   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1274   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1275   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1276   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1277   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1278 
1279   case ARMISD::CMOV:          return "ARMISD::CMOV";
1280 
1281   case ARMISD::SSAT:          return "ARMISD::SSAT";
1282   case ARMISD::USAT:          return "ARMISD::USAT";
1283 
1284   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1285   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1286   case ARMISD::RRX:           return "ARMISD::RRX";
1287 
1288   case ARMISD::ADDC:          return "ARMISD::ADDC";
1289   case ARMISD::ADDE:          return "ARMISD::ADDE";
1290   case ARMISD::SUBC:          return "ARMISD::SUBC";
1291   case ARMISD::SUBE:          return "ARMISD::SUBE";
1292 
1293   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1294   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1295   case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
1296   case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
1297   case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";
1298 
1299   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1300   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1301   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1302 
1303   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1304 
1305   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1306 
1307   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1308 
1309   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1310 
1311   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1312 
1313   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1314   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1315 
1316   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1317   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1318   case ARMISD::VCGE:          return "ARMISD::VCGE";
1319   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1320   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1321   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1322   case ARMISD::VCGT:          return "ARMISD::VCGT";
1323   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1324   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1325   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1326   case ARMISD::VTST:          return "ARMISD::VTST";
1327 
1328   case ARMISD::VSHL:          return "ARMISD::VSHL";
1329   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1330   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1331   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1332   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1333   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1334   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1335   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1336   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1337   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1338   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1339   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1340   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1341   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1342   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1343   case ARMISD::VSLI:          return "ARMISD::VSLI";
1344   case ARMISD::VSRI:          return "ARMISD::VSRI";
1345   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1346   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1347   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1348   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1349   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1350   case ARMISD::VDUP:          return "ARMISD::VDUP";
1351   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1352   case ARMISD::VEXT:          return "ARMISD::VEXT";
1353   case ARMISD::VREV64:        return "ARMISD::VREV64";
1354   case ARMISD::VREV32:        return "ARMISD::VREV32";
1355   case ARMISD::VREV16:        return "ARMISD::VREV16";
1356   case ARMISD::VZIP:          return "ARMISD::VZIP";
1357   case ARMISD::VUZP:          return "ARMISD::VUZP";
1358   case ARMISD::VTRN:          return "ARMISD::VTRN";
1359   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1360   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1361   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1362   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1363   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1364   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1365   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1366   case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
1367   case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
1368   case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
1369   case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
1370   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
1371   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
1372   case ARMISD::SMLALD:        return "ARMISD::SMLALD";
1373   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
1374   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
1375   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
1376   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
1377   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
1378   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1379   case ARMISD::BFI:           return "ARMISD::BFI";
1380   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1381   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1382   case ARMISD::VBSL:          return "ARMISD::VBSL";
1383   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1384   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
1385   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1386   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1387   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1388   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1389   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1390   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1391   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1392   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1393   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1394   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1395   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
1396   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1397   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1398   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1399   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1400   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1401   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1402   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1403   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1404   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1405   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1406   }
1407   return nullptr;
1408 }
1409 
1410 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1411                                           EVT VT) const {
1412   if (!VT.isVector())
1413     return getPointerTy(DL);
1414   return VT.changeVectorElementTypeToInteger();
1415 }
1416 
1417 /// getRegClassFor - Return the register class that should be used for the
1418 /// specified value type.
1419 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1420   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1421   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1422   // load / store 4 to 8 consecutive D registers.
1423   if (Subtarget->hasNEON()) {
1424     if (VT == MVT::v4i64)
1425       return &ARM::QQPRRegClass;
1426     if (VT == MVT::v8i64)
1427       return &ARM::QQQQPRRegClass;
1428   }
1429   return TargetLowering::getRegClassFor(VT);
1430 }
1431 
1432 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1433 // source/dest is aligned and the copy size is large enough. We therefore want
1434 // to align such objects passed to memory intrinsics.
1435 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1436                                                unsigned &PrefAlign) const {
1437   if (!isa<MemIntrinsic>(CI))
1438     return false;
1439   MinSize = 8;
1440   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1441   // cycle faster than 4-byte aligned LDM.
1442   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1443   return true;
1444 }
1445 
// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  // Delegate to the ARM-specific FastISel factory.
  return ARM::createFastISel(funcInfo, libInfo);
}
1452 
/// Decide between register-pressure and ILP scheduling for node N: prefer
/// ILP when N produces a floating-point or vector value, or when it is a
/// machine instruction whose first result has a long operand cycle.
Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  // Any FP or vector result makes ILP scheduling preferable.
  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  // Operand cycle > 2 for the first def means the result arrives late;
  // schedule for ILP in that case.
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}
1482 
1483 //===----------------------------------------------------------------------===//
1484 // Lowering Code
1485 //===----------------------------------------------------------------------===//
1486 
1487 static bool isSRL16(const SDValue &Op) {
1488   if (Op.getOpcode() != ISD::SRL)
1489     return false;
1490   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1491     return Const->getZExtValue() == 16;
1492   return false;
1493 }
1494 
1495 static bool isSRA16(const SDValue &Op) {
1496   if (Op.getOpcode() != ISD::SRA)
1497     return false;
1498   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1499     return Const->getZExtValue() == 16;
1500   return false;
1501 }
1502 
1503 static bool isSHL16(const SDValue &Op) {
1504   if (Op.getOpcode() != ISD::SHL)
1505     return false;
1506   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1507     return Const->getZExtValue() == 16;
1508   return false;
1509 }
1510 
1511 // Check for a signed 16-bit value. We special case SRA because it makes it
1512 // more simple when also looking for SRAs that aren't sign extending a
1513 // smaller value. Without the check, we'd need to take extra care with
1514 // checking order for some operations.
1515 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1516   if (isSRA16(Op))
1517     return isSHL16(Op.getOperand(0));
1518   return DAG.ComputeNumSignBits(Op) == 17;
1519 }
1520 
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  // Signed comparisons.
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  // Unsigned comparisons map to the ARM HI/HS/LO/LS conditions.
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}
1537 
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
///
/// Some FP conditions require two ARM condition checks; CondCode2 holds the
/// second one and is ARMCC::AL when unused. InvalidOnQNaN defaults to true
/// and is cleared for the (in)equality-style tests (EQ/OEQ, ONE, UEQ,
/// NE/UNE). NOTE(review): presumably it tells the caller whether the compare
/// may raise an FP Invalid exception on quiet-NaN operands (signaling vs.
/// non-signaling compare selection) -- confirm at the call sites.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
  CondCode2 = ARMCC::AL;
  InvalidOnQNaN = true;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = ARMCC::EQ;
    InvalidOnQNaN = false;
    break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE:
    // "Ordered and not equal" needs both MI and GT to hold the result.
    CondCode = ARMCC::MI;
    CondCode2 = ARMCC::GT;
    InvalidOnQNaN = false;
    break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ:
    // "Unordered or equal" needs both EQ and VS.
    CondCode = ARMCC::EQ;
    CondCode2 = ARMCC::VS;
    InvalidOnQNaN = false;
    break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = ARMCC::NE;
    InvalidOnQNaN = false;
    break;
  }
}
1581 
1582 //===----------------------------------------------------------------------===//
1583 //                      Calling Convention Implementation
1584 //===----------------------------------------------------------------------===//
1585 
1586 #include "ARMGenCallingConv.inc"
1587 
1588 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1589 /// account presence of floating point hardware and calling convention
1590 /// limitations, such as support for variadic functions.
1591 CallingConv::ID
1592 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1593                                            bool isVarArg) const {
1594   switch (CC) {
1595   default:
1596     report_fatal_error("Unsupported calling convention");
1597   case CallingConv::ARM_AAPCS:
1598   case CallingConv::ARM_APCS:
1599   case CallingConv::GHC:
1600     return CC;
1601   case CallingConv::PreserveMost:
1602     return CallingConv::PreserveMost;
1603   case CallingConv::ARM_AAPCS_VFP:
1604   case CallingConv::Swift:
1605     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1606   case CallingConv::C:
1607     if (!Subtarget->isAAPCS_ABI())
1608       return CallingConv::ARM_APCS;
1609     else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
1610              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1611              !isVarArg)
1612       return CallingConv::ARM_AAPCS_VFP;
1613     else
1614       return CallingConv::ARM_AAPCS;
1615   case CallingConv::Fast:
1616   case CallingConv::CXX_FAST_TLS:
1617     if (!Subtarget->isAAPCS_ABI()) {
1618       if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1619         return CallingConv::Fast;
1620       return CallingConv::ARM_APCS;
1621     } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
1622       return CallingConv::ARM_AAPCS_VFP;
1623     else
1624       return CallingConv::ARM_AAPCS;
1625   }
1626 }
1627 
/// Return the CCAssignFn used to lower the *arguments* of a call (or of the
/// incoming formals) under calling convention CC.
CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
}
1632 
/// Return the CCAssignFn used to lower *return values* under calling
/// convention CC.
CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
}
1637 
1638 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1639 /// CallingConvention.
1640 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1641                                                  bool Return,
1642                                                  bool isVarArg) const {
1643   switch (getEffectiveCallingConv(CC, isVarArg)) {
1644   default:
1645     report_fatal_error("Unsupported calling convention");
1646   case CallingConv::ARM_APCS:
1647     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1648   case CallingConv::ARM_AAPCS:
1649     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1650   case CallingConv::ARM_AAPCS_VFP:
1651     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1652   case CallingConv::Fast:
1653     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1654   case CallingConv::GHC:
1655     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1656   case CallingConv::PreserveMost:
1657     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1658   }
1659 }
1660 
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    // Deliberately a copy: VA is reassigned below when a value spans
    // multiple consecutive locations.
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // Handle f64 or half of a v2f64.
      // Read the two i32 halves, threading the chain and glue through
      // both copies so they stay adjacent.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      // On big-endian targets the register order is reversed.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // A v2f64 occupies four i32 locations; build the vector from the
        // two f64 halves, consuming two more locations for the second one.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Simple case: the whole value lives in a single register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    // Undo any promotion the calling convention applied to the value.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
1742 
1743 /// LowerMemOpCallTo - Store the argument to the stack.
1744 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1745                                             SDValue Arg, const SDLoc &dl,
1746                                             SelectionDAG &DAG,
1747                                             const CCValAssign &VA,
1748                                             ISD::ArgFlagsTy Flags) const {
1749   unsigned LocMemOffset = VA.getLocMemOffset();
1750   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1751   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1752                        StackPtr, PtrOff);
1753   return DAG.getStore(
1754       Chain, dl, Arg, PtrOff,
1755       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1756 }
1757 
1758 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1759                                          SDValue Chain, SDValue &Arg,
1760                                          RegsToPassVector &RegsToPass,
1761                                          CCValAssign &VA, CCValAssign &NextVA,
1762                                          SDValue &StackPtr,
1763                                          SmallVectorImpl<SDValue> &MemOpChains,
1764                                          ISD::ArgFlagsTy Flags) const {
1765   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1766                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1767   unsigned id = Subtarget->isLittle() ? 0 : 1;
1768   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1769 
1770   if (NextVA.isRegLoc())
1771     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1772   else {
1773     assert(NextVA.isMemLoc());
1774     if (!StackPtr.getNode())
1775       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1776                                     getPointerTy(DAG.getDataLayout()));
1777 
1778     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1779                                            dl, DAG, NextVA,
1780                                            Flags));
1781   }
1782 }
1783 
/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
1787 SDValue
1788 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1789                              SmallVectorImpl<SDValue> &InVals) const {
1790   SelectionDAG &DAG                     = CLI.DAG;
1791   SDLoc &dl                             = CLI.DL;
1792   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1793   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1794   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1795   SDValue Chain                         = CLI.Chain;
1796   SDValue Callee                        = CLI.Callee;
1797   bool &isTailCall                      = CLI.IsTailCall;
1798   CallingConv::ID CallConv              = CLI.CallConv;
1799   bool doesNotRet                       = CLI.DoesNotReturn;
1800   bool isVarArg                         = CLI.IsVarArg;
1801 
1802   MachineFunction &MF = DAG.getMachineFunction();
1803   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1804   bool isThisReturn   = false;
1805   bool isSibCall      = false;
1806   auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
1807 
1808   // Disable tail calls if they're not supported.
1809   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
1810     isTailCall = false;
1811 
1812   if (isTailCall) {
1813     // Check if it's really possible to do a tail call.
1814     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1815                     isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(),
1816                                                    Outs, OutVals, Ins, DAG);
1817     if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
1818       report_fatal_error("failed to perform tail call elimination on a call "
1819                          "site marked musttail");
1820     // We don't support GuaranteedTailCallOpt for ARM, only automatically
1821     // detected sibcalls.
1822     if (isTailCall) {
1823       ++NumTailCalls;
1824       isSibCall = true;
1825     }
1826   }
1827 
1828   // Analyze operands of the call, assigning locations to each operand.
1829   SmallVector<CCValAssign, 16> ArgLocs;
1830   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1831                  *DAG.getContext());
1832   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
1833 
1834   // Get a count of how many bytes are to be pushed on the stack.
1835   unsigned NumBytes = CCInfo.getNextStackOffset();
1836 
1837   // For tail calls, memory operands are available in our caller's stack.
1838   if (isSibCall)
1839     NumBytes = 0;
1840 
1841   // Adjust the stack pointer for the new arguments...
1842   // These operations are automatically eliminated by the prolog/epilog pass
1843   if (!isSibCall)
1844     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
1845 
1846   SDValue StackPtr =
1847       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
1848 
1849   RegsToPassVector RegsToPass;
1850   SmallVector<SDValue, 8> MemOpChains;
1851 
1852   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1853   // of tail call optimization, arguments are handled later.
1854   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1855        i != e;
1856        ++i, ++realArgIdx) {
1857     CCValAssign &VA = ArgLocs[i];
1858     SDValue Arg = OutVals[realArgIdx];
1859     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1860     bool isByVal = Flags.isByVal();
1861 
1862     // Promote the value if needed.
1863     switch (VA.getLocInfo()) {
1864     default: llvm_unreachable("Unknown loc info!");
1865     case CCValAssign::Full: break;
1866     case CCValAssign::SExt:
1867       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1868       break;
1869     case CCValAssign::ZExt:
1870       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1871       break;
1872     case CCValAssign::AExt:
1873       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1874       break;
1875     case CCValAssign::BCvt:
1876       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1877       break;
1878     }
1879 
1880     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1881     if (VA.needsCustom()) {
1882       if (VA.getLocVT() == MVT::v2f64) {
1883         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1884                                   DAG.getConstant(0, dl, MVT::i32));
1885         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1886                                   DAG.getConstant(1, dl, MVT::i32));
1887 
1888         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1889                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1890 
1891         VA = ArgLocs[++i]; // skip ahead to next loc
1892         if (VA.isRegLoc()) {
1893           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1894                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1895         } else {
1896           assert(VA.isMemLoc());
1897 
1898           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1899                                                  dl, DAG, VA, Flags));
1900         }
1901       } else {
1902         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1903                          StackPtr, MemOpChains, Flags);
1904       }
1905     } else if (VA.isRegLoc()) {
1906       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
1907           Outs[0].VT == MVT::i32) {
1908         assert(VA.getLocVT() == MVT::i32 &&
1909                "unexpected calling convention register assignment");
1910         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1911                "unexpected use of 'returned'");
1912         isThisReturn = true;
1913       }
1914       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1915     } else if (isByVal) {
1916       assert(VA.isMemLoc());
1917       unsigned offset = 0;
1918 
1919       // True if this byval aggregate will be split between registers
1920       // and memory.
1921       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1922       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
1923 
1924       if (CurByValIdx < ByValArgsCount) {
1925 
1926         unsigned RegBegin, RegEnd;
1927         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1928 
1929         EVT PtrVT =
1930             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1931         unsigned int i, j;
1932         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1933           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
1934           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1935           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1936                                      MachinePointerInfo(),
1937                                      DAG.InferPtrAlignment(AddArg));
1938           MemOpChains.push_back(Load.getValue(1));
1939           RegsToPass.push_back(std::make_pair(j, Load));
1940         }
1941 
        // If the parameter extends beyond the register area, the "offset"
        // value lets us compute the stack slot for the remaining part.
1944         offset = RegEnd - RegBegin;
1945 
1946         CCInfo.nextInRegsParam();
1947       }
1948 
1949       if (Flags.getByValSize() > 4*offset) {
1950         auto PtrVT = getPointerTy(DAG.getDataLayout());
1951         unsigned LocMemOffset = VA.getLocMemOffset();
1952         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1953         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
1954         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
1955         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
1956         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
1957                                            MVT::i32);
1958         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
1959                                             MVT::i32);
1960 
1961         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1962         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1963         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1964                                           Ops));
1965       }
1966     } else if (!isSibCall) {
1967       assert(VA.isMemLoc());
1968 
1969       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1970                                              dl, DAG, VA, Flags));
1971     }
1972   }
1973 
1974   if (!MemOpChains.empty())
1975     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
1976 
1977   // Build a sequence of copy-to-reg nodes chained together with token chain
1978   // and flag operands which copy the outgoing args into the appropriate regs.
1979   SDValue InFlag;
1980   // Tail call byval lowering might overwrite argument registers so in case of
1981   // tail call optimization the copies to registers are lowered later.
1982   if (!isTailCall)
1983     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1984       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1985                                RegsToPass[i].second, InFlag);
1986       InFlag = Chain.getValue(1);
1987     }
1988 
1989   // For tail calls lower the arguments to the 'real' stack slot.
1990   if (isTailCall) {
1991     // Force all the incoming stack arguments to be loaded from the stack
1992     // before any new outgoing arguments are stored to the stack, because the
1993     // outgoing stack slots may alias the incoming argument stack slots, and
1994     // the alias isn't otherwise explicit. This is slightly more conservative
1995     // than necessary, because it means that each store effectively depends
1996     // on every argument instead of just those arguments it would clobber.
1997 
1998     // Do not flag preceding copytoreg stuff together with the following stuff.
1999     InFlag = SDValue();
2000     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2001       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2002                                RegsToPass[i].second, InFlag);
2003       InFlag = Chain.getValue(1);
2004     }
2005     InFlag = SDValue();
2006   }
2007 
2008   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2009   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2010   // node so that legalize doesn't hack it.
2011   bool isDirect = false;
2012 
2013   const TargetMachine &TM = getTargetMachine();
2014   const Module *Mod = MF.getFunction().getParent();
2015   const GlobalValue *GV = nullptr;
2016   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2017     GV = G->getGlobal();
2018   bool isStub =
2019       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2020 
2021   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2022   bool isLocalARMFunc = false;
2023   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2024   auto PtrVt = getPointerTy(DAG.getDataLayout());
2025 
2026   if (Subtarget->genLongCalls()) {
2027     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2028            "long-calls codegen is not position independent!");
2029     // Handle a global address or an external symbol. If it's not one of
2030     // those, the target's already in a register, so we don't need to do
2031     // anything extra.
2032     if (isa<GlobalAddressSDNode>(Callee)) {
2033       // Create a constant pool entry for the callee address
2034       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2035       ARMConstantPoolValue *CPV =
2036         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2037 
2038       // Get the address of the callee into a register
2039       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2040       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2041       Callee = DAG.getLoad(
2042           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2043           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2044     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2045       const char *Sym = S->getSymbol();
2046 
2047       // Create a constant pool entry for the callee address
2048       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2049       ARMConstantPoolValue *CPV =
2050         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2051                                       ARMPCLabelIndex, 0);
2052       // Get the address of the callee into a register
2053       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2054       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2055       Callee = DAG.getLoad(
2056           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2057           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2058     }
2059   } else if (isa<GlobalAddressSDNode>(Callee)) {
2060     // If we're optimizing for minimum size and the function is called three or
2061     // more times in this block, we can improve codesize by calling indirectly
2062     // as BLXr has a 16-bit encoding.
2063     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2064     auto *BB = CLI.CS.getParent();
2065     bool PreferIndirect =
2066         Subtarget->isThumb() && MF.getFunction().optForMinSize() &&
2067         count_if(GV->users(), [&BB](const User *U) {
2068           return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
2069         }) > 2;
2070 
2071     if (!PreferIndirect) {
2072       isDirect = true;
2073       bool isDef = GV->isStrongDefinitionForLinker();
2074 
2075       // ARM call to a local ARM function is predicable.
2076       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2077       // tBX takes a register source operand.
2078       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2079         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2080         Callee = DAG.getNode(
2081             ARMISD::WrapperPIC, dl, PtrVt,
2082             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2083         Callee = DAG.getLoad(
2084             PtrVt, dl, DAG.getEntryNode(), Callee,
2085             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2086             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2087                                      MachineMemOperand::MOInvariant);
2088       } else if (Subtarget->isTargetCOFF()) {
2089         assert(Subtarget->isTargetWindows() &&
2090                "Windows is the only supported COFF target");
2091         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2092                                    ? ARMII::MO_DLLIMPORT
2093                                    : ARMII::MO_NO_FLAG;
2094         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
2095                                             TargetFlags);
2096         if (GV->hasDLLImportStorageClass())
2097           Callee =
2098               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2099                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2100                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2101       } else {
2102         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2103       }
2104     }
2105   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2106     isDirect = true;
2107     // tBX takes a register source operand.
2108     const char *Sym = S->getSymbol();
2109     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2110       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2111       ARMConstantPoolValue *CPV =
2112         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2113                                       ARMPCLabelIndex, 4);
2114       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2115       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2116       Callee = DAG.getLoad(
2117           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2118           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2119       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2120       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2121     } else {
2122       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2123     }
2124   }
2125 
2126   // FIXME: handle tail calls differently.
2127   unsigned CallOpc;
2128   if (Subtarget->isThumb()) {
2129     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2130       CallOpc = ARMISD::CALL_NOLINK;
2131     else
2132       CallOpc = ARMISD::CALL;
2133   } else {
2134     if (!isDirect && !Subtarget->hasV5TOps())
2135       CallOpc = ARMISD::CALL_NOLINK;
2136     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2137              // Emit regular call when code size is the priority
2138              !MF.getFunction().optForMinSize())
2139       // "mov lr, pc; b _foo" to avoid confusing the RSP
2140       CallOpc = ARMISD::CALL_NOLINK;
2141     else
2142       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2143   }
2144 
2145   std::vector<SDValue> Ops;
2146   Ops.push_back(Chain);
2147   Ops.push_back(Callee);
2148 
2149   // Add argument registers to the end of the list so that they are known live
2150   // into the call.
2151   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2152     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2153                                   RegsToPass[i].second.getValueType()));
2154 
2155   // Add a register mask operand representing the call-preserved registers.
2156   if (!isTailCall) {
2157     const uint32_t *Mask;
2158     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2159     if (isThisReturn) {
2160       // For 'this' returns, use the R0-preserving mask if applicable
2161       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2162       if (!Mask) {
2163         // Set isThisReturn to false if the calling convention is not one that
2164         // allows 'returned' to be modeled in this way, so LowerCallResult does
2165         // not try to pass 'this' straight through
2166         isThisReturn = false;
2167         Mask = ARI->getCallPreservedMask(MF, CallConv);
2168       }
2169     } else
2170       Mask = ARI->getCallPreservedMask(MF, CallConv);
2171 
2172     assert(Mask && "Missing call preserved mask for calling convention");
2173     Ops.push_back(DAG.getRegisterMask(Mask));
2174   }
2175 
2176   if (InFlag.getNode())
2177     Ops.push_back(InFlag);
2178 
2179   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2180   if (isTailCall) {
2181     MF.getFrameInfo().setHasTailCall();
2182     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2183   }
2184 
2185   // Returns a chain and a flag for retval copy to use.
2186   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2187   InFlag = Chain.getValue(1);
2188 
2189   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2190                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2191   if (!Ins.empty())
2192     InFlag = Chain.getValue(1);
2193 
2194   // Handle result values, copying them out of physregs into vregs that we
2195   // return.
2196   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2197                          InVals, isThisReturn,
2198                          isThisReturn ? OutVals[0] : SDValue());
2199 }
2200 
2201 /// HandleByVal - Every parameter *after* a byval parameter is passed
2202 /// on the stack.  Remember the next parameter register to allocate,
2203 /// and then confiscate the rest of the parameter registers to insure
2204 /// this.
2205 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2206                                     unsigned Align) const {
2207   // Byval (as with any stack) slots are always at least 4 byte aligned.
2208   Align = std::max(Align, 4U);
2209 
2210   unsigned Reg = State->AllocateReg(GPRArgRegs);
2211   if (!Reg)
2212     return;
2213 
2214   unsigned AlignInRegs = Align / 4;
2215   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2216   for (unsigned i = 0; i < Waste; ++i)
2217     Reg = State->AllocateReg(GPRArgRegs);
2218 
2219   if (!Reg)
2220     return;
2221 
2222   unsigned Excess = 4 * (ARM::R4 - Reg);
2223 
2224   // Special case when NSAA != SP and parameter size greater than size of
2225   // all remained GPR regs. In that case we can't split parameter, we must
2226   // send it to stack. We also must set NCRN to R4, so waste all
2227   // remained registers.
2228   const unsigned NSAAOffset = State->getNextStackOffset();
2229   if (NSAAOffset != 0 && Size > Excess) {
2230     while (State->AllocateReg(GPRArgRegs))
2231       ;
2232     return;
2233   }
2234 
2235   // First register for byval parameter is the first register that wasn't
2236   // allocated before this method call, so it would be "reg".
2237   // If parameter is small enough to be saved in range [reg, r4), then
2238   // the end (first after last) register would be reg + param-size-in-regs,
2239   // else parameter would be splitted between registers and stack,
2240   // end register would be r4 in this case.
2241   unsigned ByValRegBegin = Reg;
2242   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2243   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2244   // Note, first register is allocated in the beginning of function already,
2245   // allocate remained amount of registers we need.
2246   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2247     State->AllocateReg(GPRArgRegs);
2248   // A byval parameter that is split between registers and memory needs its
2249   // size truncated here.
2250   // In the case where the entire structure fits in registers, we set the
2251   // size in memory to zero.
2252   Size = std::max<int>(Size - Excess, 0);
2253 }
2254 
/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  // Frame index of the incoming-argument slot; max() acts as a "not found"
  // sentinel until one of the branches below fills it in.
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The outgoing value is a copy out of a vreg: it can only match if that
    // vreg was itself defined by a reload from a stack slot.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      // A byval argument arriving through a register copy is not matched.
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    // A plain load only matches when it loads directly from a frame index.
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  // Only fixed objects (incoming argument slots) can match, and the offset
  // and size must line up exactly with the outgoing argument.
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}
2298 
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// Returns true only when the callee can safely reuse the caller's stack
/// frame: compatible calling conventions, no struct-return, no split
/// byval/vararg save area, and every stack-passed argument already laid out
/// where the callee expects it.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // Callers are expected to have checked subtarget support already.
  assert(Subtarget->supportsTailCall());

  // Tail calls to function pointers cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      !isa<GlobalAddressSDNode>(Callee.getNode()))
      return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForReturn(CalleeCC, isVarArg),
                                  CCAssignFnForReturn(CallerCC, isVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      // Note: i indexes ArgLocs (may advance by more than one for custom
      // locations below), realArgIdx indexes Outs/OutVals.
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          // f64 occupies two consecutive locations; v2f64 occupies four.
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          // Stack-passed argument: it must already sit in the matching
          // incoming-argument slot of the caller.
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // Arguments that land in callee-saved registers must also be compatible
    // with the caller's preserved-register expectations.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  return true;
}
2429 
2430 bool
2431 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2432                                   MachineFunction &MF, bool isVarArg,
2433                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2434                                   LLVMContext &Context) const {
2435   SmallVector<CCValAssign, 16> RVLocs;
2436   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2437   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2438 }
2439 
2440 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2441                                     const SDLoc &DL, SelectionDAG &DAG) {
2442   const MachineFunction &MF = DAG.getMachineFunction();
2443   const Function &F = MF.getFunction();
2444 
2445   StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2446 
2447   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2448   // version of the "preferred return address". These offsets affect the return
2449   // instruction if this is a return from PL1 without hypervisor extensions.
2450   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2451   //    SWI:     0      "subs pc, lr, #0"
2452   //    ABORT:   +4     "subs pc, lr, #4"
2453   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2454   // UNDEF varies depending on where the exception came from ARM or Thumb
2455   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2456 
2457   int64_t LROffset;
2458   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2459       IntKind == "ABORT")
2460     LROffset = 4;
2461   else if (IntKind == "SWI" || IntKind == "UNDEF")
2462     LROffset = 0;
2463   else
2464     report_fatal_error("Unsupported interrupt attribute. If present, value "
2465                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2466 
2467   RetOps.insert(RetOps.begin() + 1,
2468                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2469 
2470   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2471 }
2472 
/// LowerReturn - Lower an outgoing return: copy the return values into the
/// locations assigned by the return calling convention and emit an
/// ARMISD::RET_FLAG (or an interrupt-return sequence for "interrupt"
/// functions on non-M-class CPUs).
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  // Note: i indexes RVLocs (and may advance by extra steps for custom
  // locations below), realRVLocIdx indexes OutVals.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // When returning the f16 producer directly (see above), the bitcast to
      // the location type must be skipped.
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        // Endianness decides which 32-bit half goes into which register.
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
  }
  // Some conventions return callee-saved registers via explicit copies;
  // add those registers to the return so they are treated as live.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}
2621 
/// Return true if the single-result node N is only consumed (through at most
/// one layer of CopyToReg / VMOVRRD / BITCAST) by a return. On success,
/// Chain is updated to the chain value feeding the first of those copies,
/// which allows the caller to fold N into a tail-call-style sequence.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs: expect every use of the VMOVRRD to be a
    // CopyToReg (one per GPR half).
    SmallPtrSet<SDNode*, 2> Copies;
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    // The two copies are chained: find which one is first (its chain operand
    // is not another of the copies) and take its chain, and which is second
    // (the one the final return will actually use).
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // Finally, every user of the (last) copy must be a return node.
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  // Report the chain feeding the copies so the caller can use it directly.
  Chain = TCChain;
  return true;
}
2696 
2697 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2698   if (!Subtarget->supportsTailCall())
2699     return false;
2700 
2701   auto Attr =
2702       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2703   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2704     return false;
2705 
2706   return true;
2707 }
2708 
2709 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2710 // and pass the lower and high parts through.
2711 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2712   SDLoc DL(Op);
2713   SDValue WriteValue = Op->getOperand(2);
2714 
2715   // This function is only supposed to be called for i64 type argument.
2716   assert(WriteValue.getValueType() == MVT::i64
2717           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2718 
2719   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2720                            DAG.getConstant(0, DL, MVT::i32));
2721   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2722                            DAG.getConstant(1, DL, MVT::i32));
2723   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2724   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2725 }
2726 
2727 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2728 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2729 // one of the above mentioned nodes. It has to be wrapped because otherwise
2730 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2731 // be used to form addressing mode. These wrapped nodes will be selected
2732 // into MOVi.
2733 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
2734                                              SelectionDAG &DAG) const {
2735   EVT PtrVT = Op.getValueType();
2736   // FIXME there is no actual debug info here
2737   SDLoc dl(Op);
2738   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2739   SDValue Res;
2740 
2741   // When generating execute-only code Constant Pools must be promoted to the
2742   // global data section. It's a bit ugly that we can't share them across basic
2743   // blocks, but this way we guarantee that execute-only behaves correct with
2744   // position-independent addressing modes.
2745   if (Subtarget->genExecuteOnly()) {
2746     auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2747     auto T = const_cast<Type*>(CP->getType());
2748     auto C = const_cast<Constant*>(CP->getConstVal());
2749     auto M = const_cast<Module*>(DAG.getMachineFunction().
2750                                  getFunction().getParent());
2751     auto GV = new GlobalVariable(
2752                     *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
2753                     Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
2754                     Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
2755                     Twine(AFI->createPICLabelUId())
2756                   );
2757     SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
2758                                             dl, PtrVT);
2759     return LowerGlobalAddress(GA, DAG);
2760   }
2761 
2762   if (CP->isMachineConstantPoolEntry())
2763     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2764                                     CP->getAlignment());
2765   else
2766     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2767                                     CP->getAlignment());
2768   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2769 }
2770 
// ARM always uses the inline jump-table encoding (EK_Inline); jump tables are
// emitted in-line rather than referenced through a separate table section.
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}
2774 
2775 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2776                                              SelectionDAG &DAG) const {
2777   MachineFunction &MF = DAG.getMachineFunction();
2778   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2779   unsigned ARMPCLabelIndex = 0;
2780   SDLoc DL(Op);
2781   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2782   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2783   SDValue CPAddr;
2784   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
2785   if (!IsPositionIndependent) {
2786     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2787   } else {
2788     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2789     ARMPCLabelIndex = AFI->createPICLabelUId();
2790     ARMConstantPoolValue *CPV =
2791       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2792                                       ARMCP::CPBlockAddress, PCAdj);
2793     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2794   }
2795   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2796   SDValue Result = DAG.getLoad(
2797       PtrVT, DL, DAG.getEntryNode(), CPAddr,
2798       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2799   if (!IsPositionIndependent)
2800     return Result;
2801   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2802   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2803 }
2804 
2805 /// Convert a TLS address reference into the correct sequence of loads
2806 /// and calls to compute the variable's address for Darwin, and return an
2807 /// SDValue containing the final node.
2808 
2809 /// Darwin only has one TLS scheme which must be capable of dealing with the
2810 /// fully general situation, in the worst case. This means:
2811 ///     + "extern __thread" declaration.
2812 ///     + Defined in a possibly unknown dynamic library.
2813 ///
2814 /// The general system is that each __thread variable has a [3 x i32] descriptor
2815 /// which contains information used by the runtime to calculate the address. The
2816 /// only part of this the compiler needs to know about is the first word, which
2817 /// contains a function pointer that must be called with the address of the
2818 /// entire descriptor in "r0".
2819 ///
2820 /// Since this descriptor may be in a different unit, in general access must
2821 /// proceed along the usual ARM rules. A common sequence to produce is:
2822 ///
2823 ///     movw rT1, :lower16:_var$non_lazy_ptr
2824 ///     movt rT1, :upper16:_var$non_lazy_ptr
2825 ///     ldr r0, [rT1]
2826 ///     ldr rT2, [r0]
2827 ///     blx rT2
2828 ///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = FuncTLVGet.getValue(1);

  // The call below adjusts the stack; record that so the frame lowering
  // knows about it.
  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and the
  // call returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // The variable's address comes back in r0.
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
2873 
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block).
  // The operands encode "mrc p15, 0, <Rt>, c13, c0, 2" (coproc=15, opc1=0,
  // CRn=13, CRm=0, opc2=2) — the CP15 thread ID register that Windows uses to
  // hold the TEB pointer (presumably TPIDRURW; confirm against the Windows ABI).
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getConstant(15, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(13, DL, MVT::i32),
                   DAG.getConstant(0, DL, MVT::i32),
                   DAG.getConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  // Scale the index by the slot size (shl by 2 == *4) and load the thread's
  // TLS data area pointer out of the array.
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                              DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base); the
  // SECREL constant-pool entry resolves to the variable's section-relative
  // offset.
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                                    DAG.getTargetConstantPool(CPV, PtrVT, 4)),
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  // Final address = TLS data area base + section-relative offset.
  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
2928 
2929 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
2930 SDValue
2931 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2932                                                  SelectionDAG &DAG) const {
2933   SDLoc dl(GA);
2934   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2935   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2936   MachineFunction &MF = DAG.getMachineFunction();
2937   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2938   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2939   ARMConstantPoolValue *CPV =
2940     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2941                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2942   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2943   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2944   Argument = DAG.getLoad(
2945       PtrVT, dl, DAG.getEntryNode(), Argument,
2946       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2947   SDValue Chain = Argument.getValue(1);
2948 
2949   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2950   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2951 
2952   // call __tls_get_addr.
2953   ArgListTy Args;
2954   ArgListEntry Entry;
2955   Entry.Node = Argument;
2956   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2957   Args.push_back(Entry);
2958 
2959   // FIXME: is there useful debug info available here?
2960   TargetLowering::CallLoweringInfo CLI(DAG);
2961   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
2962       CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
2963       DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
2964 
2965   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2966   return CallResult.first;
2967 }
2968 
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model. In both cases the final address is the thread pointer
// plus an offset; the models differ in how that offset is obtained.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    // PC bias: the PC reads 8 bytes ahead in ARM mode, 4 in Thumb mode.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    // First load: fetch the GOTTPOFF constant-pool entry; after the PIC_ADD
    // below it yields the address of the GOT slot holding the offset.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: read the thread-pointer-relative offset out of that slot.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  } else {
    // local exec model
    assert(model == TLSModel::LocalExec);
    // The TPOFF constant-pool entry holds the offset directly, so a single
    // load suffices.
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
3022 
3023 SDValue
3024 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3025   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3026   if (DAG.getTarget().useEmulatedTLS())
3027     return LowerToTLSEmulatedModel(GA, DAG);
3028 
3029   if (Subtarget->isTargetDarwin())
3030     return LowerGlobalTLSAddressDarwin(Op, DAG);
3031 
3032   if (Subtarget->isTargetWindows())
3033     return LowerGlobalTLSAddressWindows(Op, DAG);
3034 
3035   // TODO: implement the "local dynamic" model
3036   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3037   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3038 
3039   switch (model) {
3040     case TLSModel::GeneralDynamic:
3041     case TLSModel::LocalDynamic:
3042       return LowerToTLSGeneralDynamicModel(GA, DAG);
3043     case TLSModel::InitialExec:
3044     case TLSModel::LocalExec:
3045       return LowerToTLSExecModels(GA, DAG, model);
3046   }
3047   llvm_unreachable("bogus TLS model");
3048 }
3049 
3050 /// Return true if all users of V are within function F, looking through
3051 /// ConstantExprs.
3052 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3053   SmallVector<const User*,4> Worklist;
3054   for (auto *U : V->users())
3055     Worklist.push_back(U);
3056   while (!Worklist.empty()) {
3057     auto *U = Worklist.pop_back_val();
3058     if (isa<ConstantExpr>(U)) {
3059       for (auto *UU : U->users())
3060         Worklist.push_back(UU);
3061       continue;
3062     }
3063 
3064     auto *I = dyn_cast<Instruction>(U);
3065     if (!I || I->getParent()->getParent() != F)
3066       return false;
3067   }
3068   return true;
3069 }
3070 
3071 /// Return true if all users of V are within some (any) function, looking through
3072 /// ConstantExprs. In other words, are there any global constant users?
3073 static bool allUsersAreInFunctions(const Value *V) {
3074   SmallVector<const User*,4> Worklist;
3075   for (auto *U : V->users())
3076     Worklist.push_back(U);
3077   while (!Worklist.empty()) {
3078     auto *U = Worklist.pop_back_val();
3079     if (isa<ConstantExpr>(U)) {
3080       for (auto *UU : U->users())
3081         Worklist.push_back(UU);
3082       continue;
3083     }
3084 
3085     if (!isa<Instruction>(U))
3086       return false;
3087   }
3088   return true;
3089 }
3090 
3091 // Return true if T is an integer, float or an array/vector of either.
3092 static bool isSimpleType(Type *T) {
3093   if (T->isIntegerTy() || T->isFloatingPointTy())
3094     return true;
3095   Type *SubT = nullptr;
3096   if (T->isArrayTy())
3097     SubT = T->getArrayElementType();
3098   else if (T->isVectorTy())
3099     SubT = T->getVectorElementType();
3100   else
3101     return false;
3102   return SubT->isIntegerTy() || SubT->isFloatingPointTy();
3103 }
3104 
static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
  if (!EnableConstpoolPromotion ||
      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
      return SDValue();

  // Only internal, constant, unnamed_addr globals with an initializer are
  // candidates for promotion.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // Ensure that we don't try and inline any type that contains pointers. If
  // we inline a value that contains relocations, we move the relocations from
  // .data to .text which is not ideal.
  auto *Init = GVar->getInitializer();
  if (!isSimpleType(Init->getType()))
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  unsigned Align = GVar->getAlignment();
  // NB: RequiredPadding == 4 is the "already 4-byte aligned, no padding
  // needed" case (Size % 4 == 0).
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();

  // This is only valid if all users are in a single function OR it has users
  // in multiple functions but is no larger than a pointer. We also check if
  // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
  // address taken.
  if (!allUsersAreInFunction(GVar, &F) &&
      !(Size <= 4 && allUsersAreInFunctions(GVar)))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  // Record the promotion (once per global) so repeat queries reuse the entry
  // and the pool-size accounting above stays correct.
  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr =
    DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}
3198 
3199 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3200   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3201     GV = GA->getBaseObject();
3202   return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
3203          isa<Function>(GV);
3204 }
3205 
3206 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3207                                               SelectionDAG &DAG) const {
3208   switch (Subtarget->getTargetTriple().getObjectFormat()) {
3209   default: llvm_unreachable("unknown object format");
3210   case Triple::COFF:
3211     return LowerGlobalAddressWindows(Op, DAG);
3212   case Triple::ELF:
3213     return LowerGlobalAddressELF(Op, DAG);
3214   case Triple::MachO:
3215     return LowerGlobalAddressDarwin(Op, DAG);
3216   }
3217 }
3218 
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    // PIC: reference the global PC-relatively; non-DSO-local globals use a
    // GOT_PREL reference and need an extra load through the GOT.
    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                           UseGOT_PREL ? ARMII::MO_GOT : 0);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (UseGOT_PREL)
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative: the address is the static base (R9) plus an SBREL offset,
    // materialized either with movw/movt or via a literal-pool load.
    SDValue RelAddr;
    if (Subtarget->useMovt(DAG.getMachineFunction())) {
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper.
  if (Subtarget->useMovt(DAG.getMachineFunction())) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    // No movw/movt available: load the absolute address from the constant pool.
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }
}
3284 
3285 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3286                                                     SelectionDAG &DAG) const {
3287   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3288          "ROPI/RWPI not currently supported for Darwin");
3289   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3290   SDLoc dl(Op);
3291   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3292 
3293   if (Subtarget->useMovt(DAG.getMachineFunction()))
3294     ++NumMovwMovt;
3295 
3296   // FIXME: Once remat is capable of dealing with instructions with register
3297   // operands, expand this into multiple nodes
3298   unsigned Wrapper =
3299       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3300 
3301   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3302   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3303 
3304   if (Subtarget->isGVIndirectSymbol(GV))
3305     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3306                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3307   return Result;
3308 }
3309 
3310 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3311                                                      SelectionDAG &DAG) const {
3312   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3313   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
3314          "Windows on ARM expects to use movw/movt");
3315   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3316          "ROPI/RWPI not currently supported for Windows");
3317 
3318   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3319   const ARMII::TOF TargetFlags =
3320     (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
3321   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3322   SDValue Result;
3323   SDLoc DL(Op);
3324 
3325   ++NumMovwMovt;
3326 
3327   // FIXME: Once remat is capable of dealing with instructions with register
3328   // operands, expand this into two nodes.
3329   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3330                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
3331                                                   TargetFlags));
3332   if (GV->hasDLLImportStorageClass())
3333     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3334                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3335   return Result;
3336 }
3337 
3338 SDValue
3339 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3340   SDLoc dl(Op);
3341   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3342   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3343                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3344                      Op.getOperand(1), Val);
3345 }
3346 
3347 SDValue
3348 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3349   SDLoc dl(Op);
3350   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3351                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3352 }
3353 
3354 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3355                                                       SelectionDAG &DAG) const {
3356   SDLoc dl(Op);
3357   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3358                      Op.getOperand(0));
3359 }
3360 
/// Lower chain-free intrinsic calls (ISD::INTRINSIC_WO_CHAIN) to target nodes
/// or generic ISD opcodes. Intrinsics not listed here are returned as
/// SDValue() and keep their default handling.
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
  // Operand 0 carries the intrinsic ID; the call arguments start at operand 1.
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::eh_sjlj_lsda: {
    // Materialize the address of this function's LSDA through a
    // constant-pool entry, adding a PIC adjustment when generating
    // position-independent code.
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue CPAddr;
    bool IsPositionIndependent = isPositionIndependent();
    // The PC reads ahead of the current instruction: +8 in ARM mode, +4 in
    // Thumb mode, hence the mode-dependent adjustment baked into the
    // constant-pool entry.
    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

    if (IsPositionIndependent) {
      // PIC: add the PC-label value to turn the PC-relative entry into an
      // absolute address.
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1));
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
      ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    // v{min,max}u only maps to the generic opcodes for integer types.
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
      ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    // Floating-point variant: NEON vmin/vmax propagate NaNs, matching the
    // FMINNAN/FMAXNAN semantics.
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
      ? ISD::FMINNAN : ISD::FMAXNAN;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}
3443 
/// Lower ISD::ATOMIC_FENCE to a DMB barrier intrinsic (or MCR-based barrier
/// on pre-DMB cores). Single-thread fences need no hardware barrier and are
/// dropped entirely.
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  // Operand 2 carries the synchronization scope of the fence.
  ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  if (SSID == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  // Operand 1 carries the atomic ordering; it may allow a weaker barrier
  // domain than the default inner-shareable (ISH) one.
  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  }

  // Emit the fence as a DMB with the chosen domain.
  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}
3480 
/// Lower ISD::PREFETCH to ARMISD::PRELOAD (PLD/PLDW/PLI), or fold it away on
/// subtargets without preload instructions.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  // Operand 2 is the prefetch read/write flag (1 == write), so inverting its
  // low bit yields "is this a read".
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  // Operand 4 distinguishes data (1) from instruction (0) cache prefetch.
  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Invert the bits.
    // NOTE(review): the PRELOAD node apparently expects the opposite polarity
    // for these operands in Thumb mode — confirm against the instruction
    // selection patterns before changing.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}
3507 
3508 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3509   MachineFunction &MF = DAG.getMachineFunction();
3510   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3511 
3512   // vastart just stores the address of the VarArgsFrameIndex slot into the
3513   // memory location argument.
3514   SDLoc dl(Op);
3515   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3516   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3517   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3518   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3519                       MachinePointerInfo(SV));
3520 }
3521 
/// GetF64FormalArgument - Reassemble an f64 formal argument that the calling
/// convention split across two locations. The first half (VA) is always in a
/// GPR; the second half (NextVA) is either in another GPR or was spilled to a
/// stack slot. The two i32 halves are joined with VMOVDRR, swapping halves on
/// big-endian targets.
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Thumb1 can only address the low registers, so restrict the class there.
  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    // Second half lives on the stack: materialize a fixed frame object at
    // the incoming-argument offset and load from it.
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  } else {
    // Second half is in a register too.
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // On big-endian targets the register order of the two halves is reversed.
  if (!Subtarget->isLittle())
    std::swap (ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}
3558 
// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval).  Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases possible:
  // Case #1. Non-var-args function, and we meet first byval parameter.
  //          Setup first unallocated register as first byval register;
  //          eat all remained registers
  //          (these two actions are performed by HandleByVal method).
  //          Then, here, we initialize stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function, that doesn't contain byval parameters.
  //          The same: eat all remained unallocated registers,
  //          initialize stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  // Determine the half-open register range [RBegin, REnd) to spill.
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    // Case #1: use the byval register range recorded by HandleByVal.
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    // Case #2: spill from the first unallocated GPR argument register up to
    // (but not including) r4. An index of 4 means all argument registers
    // were consumed, yielding an empty range.
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  // If there are registers to spill, place them immediately below the CFA:
  // 4 bytes per register counted back from r4.
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  // Copy each register into a virtual register and store it to consecutive
  // 4-byte slots of the frame object.
  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  // Tie all stores into the chain so they are not dropped.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}
3619 
3620 // Setup stack frame, the va_list pointer will start from.
3621 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3622                                              const SDLoc &dl, SDValue &Chain,
3623                                              unsigned ArgOffset,
3624                                              unsigned TotalArgRegsSaveSize,
3625                                              bool ForceMutable) const {
3626   MachineFunction &MF = DAG.getMachineFunction();
3627   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3628 
3629   // Try to store any remaining integer argument regs
3630   // to their spots on the stack so that they may be loaded by dereferencing
3631   // the result of va_next.
3632   // If there is no regs to be stored, just point address after last
3633   // argument passed via stack.
3634   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3635                                   CCInfo.getInRegsParamsCount(),
3636                                   CCInfo.getNextStackOffset(), 4);
3637   AFI->setVarArgsFrameIndex(FrameIndex);
3638 }
3639 
/// Lower the incoming (formal) arguments of a function: assign each argument
/// a register or stack location via the calling convention, copy register
/// arguments into virtual registers, create fixed frame objects for stack
/// arguments, and handle byval aggregates and varargs register spilling.
SDValue ARMTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  SDValue ArgValue;
  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;

  // Initially ArgRegsSaveSize is zero.
  // Then we increase this value each time we meet byval parameter.
  // We also increase this value in case of varargs function.
  AFI->setArgRegsSaveSize(0);

  // Calculate the amount of stack space that we need to allocate to store
  // byval and variadic arguments that are passed in registers.
  // We need to know this before we allocate the first byval or variadic
  // argument, as they will be allocated a stack slot below the CFA (Canonical
  // Frame Address, the stack pointer at entry to the function).
  unsigned ArgRegBegin = ARM::R4;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
      break;

    CCValAssign &VA = ArgLocs[i];
    unsigned Index = VA.getValNo();
    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
    if (!Flags.isByVal())
      continue;

    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
    // Track the lowest register used by any byval argument.
    unsigned RBegin, REnd;
    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
    ArgRegBegin = std::min(ArgRegBegin, RBegin);

    CCInfo.nextInRegsParam();
  }
  // Reset the byval iteration state; the main loop below re-walks the same
  // records.
  CCInfo.rewindByValRegsInfo();

  int lastInsIndex = -1;
  if (isVarArg && MFI.hasVAStart()) {
    // Varargs also spill the remaining GPR argument registers; include them
    // in the save-area size.
    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    if (RegIdx != array_lengthof(GPRArgRegs))
      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  }

  // 4 bytes for every register from the lowest spilled one up to r4.
  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    if (Ins[VA.getValNo()].isOrigArg()) {
      // Keep CurOrigArg in sync with the IR-level argument this Ins entry
      // came from.
      std::advance(CurOrigArg,
                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
    }
    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();

      if (VA.needsCustom()) {
        // f64 and vector types are split up into multiple registers or
        // combinations of registers and stack slots.
        if (VA.getLocVT() == MVT::v2f64) {
          // A v2f64 occupies the next several locations: two f64 halves,
          // each itself possibly split between registers and memory.
          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                   Chain, DAG, dl);
          VA = ArgLocs[++i]; // skip ahead to next loc
          SDValue ArgValue2;
          if (VA.isMemLoc()) {
            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(
                                        DAG.getMachineFunction(), FI));
          } else {
            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                             Chain, DAG, dl);
          }
          // Rebuild the v2f64 by inserting both f64 halves.
          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue1,
                                 DAG.getIntPtrConstant(0, dl));
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue2,
                                 DAG.getIntPtrConstant(1, dl));
        } else
          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
      } else {
        // Pick the register class matching the location type.
        const TargetRegisterClass *RC;


        if (RegVT == MVT::f16)
          RC = &ARM::HPRRegClass;
        else if (RegVT == MVT::f32)
          RC = &ARM::SPRRegClass;
        else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
          RC = &ARM::DPRRegClass;
        else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
          RC = &ARM::QPRRegClass;
        else if (RegVT == MVT::i32)
          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits.  Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);
    } else { // VA.isRegLoc()
      // sanity check
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = VA.getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex)
        {
          ISD::ArgFlagsTy Flags = Ins[index].Flags;
          // FIXME: For now, all byval parameter objects are marked mutable.
          // This can be changed with more analysis.
          // In case of tail call optimization mark all arguments mutable.
          // Since they could be overwritten by lowering of arguments in case of
          // a tail call.
          if (Flags.isByVal()) {
            assert(Ins[index].isOrigArg() &&
                   "Byval arguments cannot be implicit");
            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

            // Spill the register-resident part of the byval aggregate next
            // to its stack-resident part and hand back the frame address.
            int FrameIndex = StoreByValRegs(
                CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
                VA.getLocMemOffset(), Flags.getByValSize());
            InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
            CCInfo.nextInRegsParam();
          } else {
            unsigned FIOffset = VA.getLocMemOffset();
            int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                           FIOffset, true);

            // Create load nodes to retrieve arguments from the stack.
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                         MachinePointerInfo::getFixedStack(
                                             DAG.getMachineFunction(), FI)));
          }
          lastInsIndex = index;
        }
    }
  }

  // varargs
  if (isVarArg && MFI.hasVAStart())
    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                         CCInfo.getNextStackOffset(),
                         TotalArgRegsSaveSize);

  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

  return Chain;
}
3837 
/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    // Pattern: load (ARMISD::Wrapper (constant-pool entry holding +0.0)).
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->getValueAPF().isPosZero();
    }
  } else if (Op->getOpcode() == ISD::BITCAST &&
             Op->getValueType(0) == MVT::f64) {
    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
    // created by LowerConstantFP().
    SDValue BitcastOp = Op->getOperand(0);
    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
        isNullConstant(BitcastOp->getOperand(0)))
      return true;
  }
  return false;
}
3861 
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands. If the RHS is a constant that cannot be encoded as an
/// immediate, the comparison may be rewritten to an equivalent one whose
/// constant can (adjusting the constant by one and the condition to match).
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <=>  x <= C-1   and   x >= C  <=>  x > C-1
        // (invalid when C is INT_MIN, since C-1 would wrap).
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        // Unsigned variant of the above; invalid when C == 0.
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        // x <= C  <=>  x < C+1   and   x > C  <=>  x >= C+1
        // (invalid when C is INT_MAX).
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        // Unsigned variant; invalid when C == UINT_MAX.
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}
3926 
3927 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3928 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
3929                                      SelectionDAG &DAG, const SDLoc &dl,
3930                                      bool InvalidOnQNaN) const {
3931   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
3932   SDValue Cmp;
3933   SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
3934   if (!isFloatingPointZero(RHS))
3935     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
3936   else
3937     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
3938   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3939 }
3940 
/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  // Integer compares can be cloned directly.
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));

  // Floating-point compares are an FMSTAT fed by a CMPFP or CMPFPw0;
  // rebuild both nodes.
  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                      Cmp.getOperand(1), Cmp.getOperand(2));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                      Cmp.getOperand(1));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
3963 
// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    // CMP (LHS+RHS), LHS recomputes RHS; VC (no signed overflow on the
    // subtraction) holds exactly when the addition did not overflow.
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::UADDO:
    // HS (carry set) on CMP (LHS+RHS), LHS means Value >= LHS unsigned,
    // i.e. the addition did not wrap.
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
    // We do not use it in the USUBO case as Value may not be used.
    Value = DAG.getNode(ARMISD::ADDC, dl,
                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
                .getValue(0);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::SSUBO:
    // CMP LHS, RHS performs the same subtraction; VC means no signed
    // overflow.
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::USUBO:
    // HS on CMP LHS, RHS means LHS >= RHS unsigned, i.e. no borrow.
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::UMULO:
    // We generate a UMUL_LOHI and then check if the high word is 0.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::UMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getConstant(0, dl, MVT::i32));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  case ISD::SMULO:
    // We generate a SMUL_LOHI and then check if all the bits of the high word
    // are the same as the sign bit of the low word.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::SMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                                          Value.getValue(0),
                                          DAG.getConstant(31, dl, MVT::i32)));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  } // switch (...)

  return std::make_pair(Value, OverflowCmp);
}
4037 
/// Lower an overflow-producing arithmetic node ([SU]ADDO etc.) by computing
/// the value and overflow comparison via getARMXALUOOp, then materializing
/// the overflow bit as a 0/1 i32 with a conditional move on the flags.
SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue Value, OverflowCmp;
  SDValue ARMcc;
  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  // Select between 1 and 0 based on OverflowCmp's flags; ARMcc is the
  // no-overflow condition produced by getARMXALUOOp.
  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  // Return the pair (result value, overflow flag).
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
4060 
4061 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4062                                               SelectionDAG &DAG) {
4063   SDLoc DL(BoolCarry);
4064   EVT CarryVT = BoolCarry.getValueType();
4065 
4066   // This converts the boolean value carry into the carry flag by doing
4067   // ARMISD::SUBC Carry, 1
4068   SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4069                               DAG.getVTList(CarryVT, MVT::i32),
4070                               BoolCarry, DAG.getConstant(1, DL, CarryVT));
4071   return Carry.getValue(1);
4072 }
4073 
4074 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4075                                               SelectionDAG &DAG) {
4076   SDLoc DL(Flags);
4077 
4078   // Now convert the carry flag into a boolean carry. We do this
4079   // using ARMISD:ADDE 0, 0, Carry
4080   return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4081                      DAG.getConstant(0, DL, MVT::i32),
4082                      DAG.getConstant(0, DL, MVT::i32), Flags);
4083 }
4084 
4085 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4086                                              SelectionDAG &DAG) const {
4087   // Let legalize expand this if it isn't a legal type yet.
4088   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4089     return SDValue();
4090 
4091   SDValue LHS = Op.getOperand(0);
4092   SDValue RHS = Op.getOperand(1);
4093   SDLoc dl(Op);
4094 
4095   EVT VT = Op.getValueType();
4096   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4097   SDValue Value;
4098   SDValue Overflow;
4099   switch (Op.getOpcode()) {
4100   default:
4101     llvm_unreachable("Unknown overflow instruction!");
4102   case ISD::UADDO:
4103     Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4104     // Convert the carry flag into a boolean value.
4105     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4106     break;
4107   case ISD::USUBO: {
4108     Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4109     // Convert the carry flag into a boolean value.
4110     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4111     // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4112     // value. So compute 1 - C.
4113     Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4114                            DAG.getConstant(1, dl, MVT::i32), Overflow);
4115     break;
4116   }
4117   }
4118 
4119   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4120 }
4121 
/// Lower an ISD::SELECT node. Three cases are handled:
///  1. The condition is the overflow result (value #1) of an
///     {s,u}{add,sub}o node: fold the select into a CMOV on the overflow
///     compare, avoiding a materialized boolean.
///  2. The condition is itself a CMOV of the constants 1/0 or 0/1: fold the
///     outer select into the inner CMOV, swapping arms as needed.
///  3. Otherwise: mask the boolean to a single bit and select on a compare
///     against zero.
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  // Case 1: select on the overflow flag of an add/sub-with-overflow node.
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    // Only lower legal XALUO types; legalize expands the rest.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Case 2. Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
      dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        // Inverted boolean: swap the select arms instead of the condition.
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        // The compare feeding the inner CMOV is duplicated because the glued
        // flags result can only feed one user.
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // Case 3.
  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, dl, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, dl, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}
4190 
/// Map the ISD floating-point condition code \p CC onto a condition code
/// that VSEL can encode (it has only two condition bits: GE, GT, VS, EQ).
/// \p swpCmpOps is set when the operands of the preceding compare must be
/// swapped, and \p swpVselOps when the operands of the VSEL itself must be
/// swapped, for the chosen \p CondCode to implement \p CC. Condition codes
/// not covered here leave \p CondCode untouched; the caller verifies that
/// the result is one of the four encodable codes before using it.
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands.
  if (CC == ISD::SETUNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}
4239 
4240 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4241                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4242                                    SDValue Cmp, SelectionDAG &DAG) const {
4243   if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
4244     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4245                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4246     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4247                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4248 
4249     SDValue TrueLow = TrueVal.getValue(0);
4250     SDValue TrueHigh = TrueVal.getValue(1);
4251     SDValue FalseLow = FalseVal.getValue(0);
4252     SDValue FalseHigh = FalseVal.getValue(1);
4253 
4254     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4255                               ARMcc, CCR, Cmp);
4256     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4257                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4258 
4259     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4260   } else {
4261     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4262                        Cmp);
4263   }
4264 }
4265 
4266 static bool isGTorGE(ISD::CondCode CC) {
4267   return CC == ISD::SETGT || CC == ISD::SETGE;
4268 }
4269 
4270 static bool isLTorLE(ISD::CondCode CC) {
4271   return CC == ISD::SETLT || CC == ISD::SETLE;
4272 }
4273 
4274 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4275 // All of these conditions (and their <= and >= counterparts) will do:
4276 //          x < k ? k : x
4277 //          x > k ? x : k
4278 //          k < x ? x : k
4279 //          k > x ? k : x
4280 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4281                             const SDValue TrueVal, const SDValue FalseVal,
4282                             const ISD::CondCode CC, const SDValue K) {
4283   return (isGTorGE(CC) &&
4284           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4285          (isLTorLE(CC) &&
4286           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4287 }
4288 
4289 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4290 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4291                             const SDValue TrueVal, const SDValue FalseVal,
4292                             const ISD::CondCode CC, const SDValue K) {
4293   return (isGTorGE(CC) &&
4294           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4295          (isLTorLE(CC) &&
4296           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4297 }
4298 
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K and
// usat is set to true if the conditional represents an unsigned saturation
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                    uint64_t &K, bool &usat) {
  // Outer SELECT_CC: (LHS1 CC1 RHS1) ? TrueVal1 : FalseVal1.
  SDValue LHS1 = Op.getOperand(0);
  SDValue RHS1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner select must sit on the non-constant arm of the outer select.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return false;

  // Inner SELECT_CC: (LHS2 CC2 RHS2) ? TrueVal2 : FalseVal2.
  SDValue LHS2 = Op2.getOperand(0);
  SDValue RHS2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  // Find out which are the constants and which are the variables
  // in each conditional
  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                        ? &RHS1
                                                        : nullptr;
  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                        ? &RHS2
                                                        : nullptr;
  // K2Tmp: the constant arm of the inner select; V2: its variable arm.
  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

  // We must detect cases where the original operations worked with 16- or
  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
  // must work with sign-extended values but the select operations return
  // the original non-extended value.
  SDValue V2TmpReg = V2Tmp;
  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
    V2TmpReg = V2Tmp->getOperand(0);

  // Check that the registers and the constants have the correct values
  // in both conditionals
  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
      V2TmpReg != V2)
    return false;

  // Figure out which conditional is saturating the lower/upper bound.
  const SDValue *LowerCheckOp =
      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;
  const SDValue *UpperCheckOp =
      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
          ? &Op
          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                ? &Op2
                : nullptr;

  // Exactly one select must bound each side of the interval.
  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
    return false;

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The larger constant must belong to the upper-bound check, and the upper
  // bound k must satisfy that k + 1 is a power of 2.
  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
      isPowerOf2_64(PosVal + 1)) {

    // Handle the difference between USAT (unsigned) and SSAT (signed)
    // saturation: SSAT needs the bounds [~k, k], USAT needs [0, k].
    if (Val1 == ~Val2)
      usat = false;
    else if (NegVal == 0)
      usat = true;
    else
      return false;

    V = V2;
    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

    return true;
  }

  return false;
}
4406 
4407 // Check if a condition of the type x < k ? k : x can be converted into a
4408 // bit operation instead of conditional moves.
4409 // Currently this is allowed given:
4410 // - The conditions and values match up
4411 // - k is 0 or -1 (all ones)
4412 // This function will not check the last condition, thats up to the caller
4413 // It returns true if the transformation can be made, and in such case
4414 // returns x in V, and k in SatK.
4415 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
4416                                          SDValue &SatK)
4417 {
4418   SDValue LHS = Op.getOperand(0);
4419   SDValue RHS = Op.getOperand(1);
4420   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4421   SDValue TrueVal = Op.getOperand(2);
4422   SDValue FalseVal = Op.getOperand(3);
4423 
4424   SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
4425                                                ? &RHS
4426                                                : nullptr;
4427 
4428   // No constant operation in comparison, early out
4429   if (!K)
4430     return false;
4431 
4432   SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
4433   V = (KTmp == TrueVal) ? FalseVal : TrueVal;
4434   SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
4435 
4436   // If the constant on left and right side, or variable on left and right,
4437   // does not match, early out
4438   if (*K != KTmp || V != VTmp)
4439     return false;
4440 
4441   if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
4442     SatK = *K;
4443     return true;
4444   }
4445 
4446   return false;
4447 }
4448 
/// Lower an ISD::SELECT_CC node. Tries, in order:
///  - forming a single SSAT/USAT from a pair of saturating selects,
///  - turning a lower-saturation against 0/-1 into bit operations,
///  - an integer compare + CMOV (preferring VSEL on FPARMv8 for FP results),
///  - a VFP compare + CMOV for floating-point compare operands (emitting two
///    CMOVs when the FP condition needs two ARM condition codes).
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    // SSAT/USAT encode the bit position; SatConstant + 1 is a power of 2, so
    // countTrailingOnes gives that position.
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV = x >> 31 (arithmetic), i.e. 0 if x >= 0 and -1 if x < 0.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0) == x & ~(x >> 31).
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1) == x | (x >> 31).
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  // Without double-precision hardware, f64 compares are softened to libcalls.
  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
                                    TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  // Floating-point compare operands from here on.
  ARMCC::CondCodes CondCode, CondCode2;
  bool InvalidOnQNaN;
  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

  // Normalize the fp compare. If RHS is zero we keep it there so we match
  // CMPFPw0 instead of CMPFP.
  if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
     (TrueVal.getValueType() == MVT::f16 ||
      TrueVal.getValueType() == MVT::f32 ||
      TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the swaps if the condition ended up VSEL-encodable.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  // Some FP condition codes need a second ARM condition; chain a second CMOV
  // that overrides the first result when the second condition holds.
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}
4566 
4567 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4568 /// to morph to an integer compare sequence.
4569 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4570                            const ARMSubtarget *Subtarget) {
4571   SDNode *N = Op.getNode();
4572   if (!N->hasOneUse())
4573     // Otherwise it requires moving the value from fp to integer registers.
4574     return false;
4575   if (!N->getNumValues())
4576     return false;
4577   EVT VT = Op.getValueType();
4578   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4579     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4580     // vmrs are very slow, e.g. cortex-a8.
4581     return false;
4582 
4583   if (isFloatingPointZero(Op)) {
4584     SeenZero = true;
4585     return true;
4586   }
4587   return ISD::isNormalLoad(N);
4588 }
4589 
4590 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4591   if (isFloatingPointZero(Op))
4592     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4593 
4594   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4595     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4596                        Ld->getPointerInfo(), Ld->getAlignment(),
4597                        Ld->getMemOperand()->getFlags());
4598 
4599   llvm_unreachable("Unknown VFP cmp argument!");
4600 }
4601 
/// Split an f64 compare operand into two i32 values. \p Op must be a
/// floating-point zero or a normal load (the forms canChangeToInt accepts);
/// the word at offset 0 is returned in \p RetVal1 and the word at offset 4
/// in \p RetVal2.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  // A floating-point zero expands to two zero words.
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  // A load is split into two i32 loads at offsets 0 and 4.
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    // The offset-4 word may be less aligned than the original f64 access.
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}
4630 
/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones. Returns the replacement
/// branch node, or an empty SDValue when the optimization does not apply.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  // Both operands must be morphable and at least one must be a floating-point
  // zero, so that masking the sign bit below gives a valid equality test
  // (x == +/-0.0 iff (x & 0x7fffffff) == 0).
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask that clears the sign bit of an IEEE single-precision word.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // f64: expand each operand into two i32 words and emit a 64-bit
    // compare-and-branch node. Only the offset-4 words are masked (these
    // hold the sign bit on little-endian ARM).
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}
4683 
/// Lower an ISD::BRCOND whose condition is the overflow result of an
/// overflow-aware arithmetic node into a direct branch on the CPSR flags set
/// by the operation itself, avoiding a materialized boolean. Returns an
/// empty SDValue when the pattern does not match.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  // Overflow multiplies are only handled on non-Thumb1 targets.
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                      !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code: getARMXALUOOp's condition holds on the
    // no-overflow path (e.g. EQ for the SMULO high-word check), while the
    // branch must be taken on overflow.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}
4720 
/// Lower an ISD::BR_CC node: soften f64 compares on single-precision-only
/// subtargets, fold "(xalu-overflow) ==/!= 0/1" compare-and-branch patterns
/// into a direct conditional branch, and otherwise emit an integer or VFP
/// compare followed by ARMISD::BRCOND (two branches when the FP condition
/// needs two ARM condition codes).
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Without double-precision hardware, f64 compares are softened to libcalls.
  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                    dl);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  // Overflow multiplies are only handled on non-Thumb1 targets.
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                      !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    // "overflow != 1" and "overflow == 0" both need the condition inverted;
    // the other two combinations branch on the condition as-is.
    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMCC::CondCodes CondCode =
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  // Plain integer compare-and-branch.
  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  // With unsafe fp math, (in)equality compares may be morphable to integer
  // compares (see OptimizeVFPBrcond).
  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  bool InvalidOnQNaN;
  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  // FP conditions that map to two ARM condition codes need a second branch
  // to the same destination, chained through the first branch's glue.
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}
4804 
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  // Lower a BR_JT (jump through jump table) node.
  // Operands: 0 = chain, 1 = jump table, 2 = index of the selected entry.
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Each jump-table entry is 4 bytes, so the byte offset is Index * 4.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI: entries hold i32 offsets relative to the table base, so load
    // the offset and add the table address back in before branching.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // Non-PIC: entries hold absolute addresses; load and branch directly.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}
4840 
/// Lower a vector FP_TO_SINT/FP_TO_UINT. f32->i32 conversions are natively
/// supported; narrower integer results are produced by converting at the
/// source's lane width and truncating. Unsupported combinations are
/// scalarized via UnrollVectorOp.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;  // fN32 -> iN32 is already legal.
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  // Integer vector type with the same lane count/width as the source.
  // f16 sources require the FullFP16 feature.
  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  // Only i16 results take the convert-then-truncate path; scalarize the rest.
  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  // Convert at the source's width, then truncate down to the result lanes.
  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}
4871 
4872 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
4873   EVT VT = Op.getValueType();
4874   if (VT.isVector())
4875     return LowerVectorFP_TO_INT(Op, DAG);
4876   if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
4877     RTLIB::Libcall LC;
4878     if (Op.getOpcode() == ISD::FP_TO_SINT)
4879       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
4880                               Op.getValueType());
4881     else
4882       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
4883                               Op.getValueType());
4884     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4885                        /*isSigned*/ false, SDLoc(Op)).first;
4886   }
4887 
4888   return Op;
4889 }
4890 
/// Lower a vector SINT_TO_FP/UINT_TO_FP. i32->f32 is natively supported;
/// i16 sources are widened with the matching extension and then converted.
/// Unsupported combinations are scalarized via UnrollVectorOp.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  // Integer type the i16 source is widened to before converting. f16 results
  // need the FullFP16 feature; otherwise fall back to scalarization.
  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  // Sign- or zero-extend to match the signedness of the conversion.
  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    Opc = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    Opc = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}
4935 
4936 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
4937   EVT VT = Op.getValueType();
4938   if (VT.isVector())
4939     return LowerVectorINT_TO_FP(Op, DAG);
4940   if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
4941     RTLIB::Libcall LC;
4942     if (Op.getOpcode() == ISD::SINT_TO_FP)
4943       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
4944                               Op.getValueType());
4945     else
4946       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
4947                               Op.getValueType());
4948     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
4949                        /*isSigned*/ false, SDLoc(Op)).first;
4950   }
4951 
4952   return Op;
4953 }
4954 
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);   // magnitude operand
  SDValue Tmp1 = Op.getOperand(1);   // sign operand
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // If the magnitude already came through GPRs (a bitcast or VMOVDRR), do the
  // sign transfer with integer ops; otherwise prefer NEON when available.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // createNEONModImm(0x6, 0x80) is the per-lane constant 0x80000000, i.e.
    // the f32 sign bit.
    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    // For f64 the sign bit is bit 63, so shift the lane mask up by 32.
    if (VT == MVT::f64)
      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    // Line the sign operand's sign bit up with the mask's bit position.
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // Materialize an all-ones vector so ~Mask can be formed with XOR, then
    // select: (Tmp1 & Mask) | (Tmp0 & ~Mask) — sign bit from Tmp1, the rest
    // from Tmp0 (the VBSL pattern).
    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    // Extract the scalar result back out of the vector.
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  // For an f64 sign operand, only the high word (which holds the sign bit)
  // is needed.
  if (SrcVT == MVT::f64)
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
5036 
5037 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5038   MachineFunction &MF = DAG.getMachineFunction();
5039   MachineFrameInfo &MFI = MF.getFrameInfo();
5040   MFI.setReturnAddressIsTaken(true);
5041 
5042   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
5043     return SDValue();
5044 
5045   EVT VT = Op.getValueType();
5046   SDLoc dl(Op);
5047   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5048   if (Depth) {
5049     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5050     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5051     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5052                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5053                        MachinePointerInfo());
5054   }
5055 
5056   // Return LR, which contains the return address. Mark it an implicit live-in.
5057   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5058   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5059 }
5060 
5061 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5062   const ARMBaseRegisterInfo &ARI =
5063     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5064   MachineFunction &MF = DAG.getMachineFunction();
5065   MachineFrameInfo &MFI = MF.getFrameInfo();
5066   MFI.setFrameAddressIsTaken(true);
5067 
5068   EVT VT = Op.getValueType();
5069   SDLoc dl(Op);  // FIXME probably not meaningful
5070   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5071   unsigned FrameReg = ARI.getFrameRegister(MF);
5072   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5073   while (Depth--)
5074     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5075                             MachinePointerInfo());
5076   return FrameAddr;
5077 }
5078 
5079 // FIXME? Maybe this could be a TableGen attribute on some registers and
5080 // this table could be generated automatically from RegInfo.
5081 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
5082                                               SelectionDAG &DAG) const {
5083   unsigned Reg = StringSwitch<unsigned>(RegName)
5084                        .Case("sp", ARM::SP)
5085                        .Default(0);
5086   if (Reg)
5087     return Reg;
5088   report_fatal_error(Twine("Invalid register name \""
5089                               + StringRef(RegName)  + "\"."));
5090 }
5091 
5092 // Result is 64 bit value so split into two 32 bit values and return as a
5093 // pair of values.
5094 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5095                                 SelectionDAG &DAG) {
5096   SDLoc DL(N);
5097 
5098   // This function is only supposed to be called for i64 type destination.
5099   assert(N->getValueType(0) == MVT::i64
5100           && "ExpandREAD_REGISTER called for non-i64 type result.");
5101 
5102   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5103                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5104                              N->getOperand(0),
5105                              N->getOperand(1));
5106 
5107   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5108                     Read.getValue(1)));
5109   Results.push_back(Read.getOperand(0));
5110 }
5111 
5112 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5113 /// When \p DstVT, the destination type of \p BC, is on the vector
5114 /// register bank and the source of bitcast, \p Op, operates on the same bank,
5115 /// it might be possible to combine them, such that everything stays on the
5116 /// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index: each original i64 element covers DstNumElt
  // elements of the destination type, so scale by that factor.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}
5161 
5162 /// ExpandBITCAST - If the target supports VFP, this function is called to
5163 /// expand a bit convert where either the source or destination type is i64 to
5164 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
5165 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
5166 /// vectors), since the legalizer won't know what to do with that.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i64 types, either as the
  // source or destination of the bit convert.
  // (FullFP16 also routes some f32/i32/f16/i16 bitcast patterns here; those
  // cases are handled first, before the i64 check.)
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);
  const bool HasFullFP16 = Subtarget->hasFullFP16();

  if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
     // FullFP16: half values are passed in S-registers, and we don't
     // need any of the bitcast and moves:
     //
     // t2: f32,ch = CopyFromReg t0, Register:f32 %0
     //   t5: i32 = bitcast t2
     // t18: f16 = ARMISD::VMOVhr t5
     if (Op.getOpcode() != ISD::CopyFromReg ||
         Op.getValueType() != MVT::f32)
       return SDValue();

     auto Move = N->use_begin();
     if (Move->getOpcode() != ARMISD::VMOVhr)
       return SDValue();

     // Re-issue the CopyFromReg at f16 and replace the VMOVhr's uses with it,
     // eliding the bitcast/move pair entirely.
     SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
     SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
     DAG.ReplaceAllUsesWith(*Move, &Copy);
     return Copy;
  }

  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
    if (!HasFullFP16)
      return SDValue();
    // SoftFP: read half-precision arguments:
    //
    // t2: i32,ch = ...
    //        t7: i16 = truncate t2 <~~~~ Op
    //      t8: f16 = bitcast t7    <~~~~ N
    //
    if (Op.getOperand(0).getValueType() == MVT::i32)
      return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
                         MVT::f16, Op.getOperand(0));

    return SDValue();
  }

  // Half-precision return values
  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
    if (!HasFullFP16)
      return SDValue();
    //
    //          t11: f16 = fadd t8, t10
    //        t12: i16 = bitcast t11       <~~~ SDNode N
    //      t13: i32 = zero_extend t12
    //    t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
    //  t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
    //
    // transform this into:
    //
    //    t20: i32 = ARMISD::VMOVrh t11
    //  t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
    //
    // Only fire when the bitcast's single use is the zero_extend feeding a
    // CopyToReg that flows into the function's return.
    auto ZeroExtend = N->use_begin();
    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
        ZeroExtend->getValueType(0) != MVT::i32)
      return SDValue();

    auto Copy = ZeroExtend->use_begin();
    if (Copy->getOpcode() == ISD::CopyToReg &&
        Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
      SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
      DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
      return Cvt;
    }
    return SDValue();
  }

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
      return Val;

    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                             DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
    SDValue Cvt;
    // Big-endian multi-element vectors need a VREV64 first so the element
    // order survives the move to GPRs.
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}
5282 
5283 /// getZeroVector - Returns a vector of specified type with all zero elements.
5284 /// Zero vectors are used to represent vector negation and in those cases
5285 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
5286 /// not support i64 elements, so sometimes the zero vectors will need to be
5287 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
5288 /// zero vector.
5289 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5290   assert(VT.isVector() && "Expected a vector type");
5291   // The canonical modified immediate encoding of a zero vector is....0!
5292   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
5293   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
5294   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
5295   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5296 }
5297 
5298 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
5299 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  // SRA_PARTS propagates the sign into the high part; SRL_PARTS shifts in
  // zeros.
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // Build both the "small shift" (ShAmt < VTBits) and "big shift"
  // (ShAmt >= VTBits) results, then select between them with CMOV on the
  // sign of ExtraShAmt = ShAmt - VTBits.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  // Small shift: low result combines bits shifted out of Hi with Lo >> ShAmt.
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  // Big shift: low result comes entirely from Hi shifted by ShAmt - VTBits.
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  // Big shift: high result is all sign bits (SRA) or zero (SRL).
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
5341 
5342 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
5343 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // Build both the "small shift" (ShAmt < VTBits) and "big shift"
  // (ShAmt >= VTBits) results, then select between them with CMOV on the
  // sign of ExtraShAmt = ShAmt - VTBits.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  // Small shift: high result combines Hi << ShAmt with bits shifted in
  // from Lo.
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  // Big shift: high result comes entirely from Lo shifted by ShAmt - VTBits.
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  // Low result is Lo << ShAmt for small shifts, zero otherwise.
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                          ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
5380 
SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);
  // Read the FPSCR via the llvm.arm.get.fpscr intrinsic (chained, since it
  // reads hardware state).
  SDValue Ops[] = { DAG.getEntryNode(),
                    DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };

  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
  // Adding 1 at bit 22 rotates the two-bit field through the mapping above.
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                     DAG.getConstant(3, dl, MVT::i32));
}
5399 
/// Lower CTTZ/CTTZ_ZERO_UNDEF. Vectors use NEON bit tricks built around
/// LSB = X & -X; scalars use RBIT + CLZ (v6T2 and later).
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector()) {
    assert(ST->hasNEON());

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Since we can only compute the number of bits in a byte with vcnt.8, we
    // have to gather the result with pairwise addition (vpaddl) for i16, i32,
    // and i64.

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // (0x1eff is the NEON modified-immediate encoding of an all-ones i64
      // lane; LSB + all-ones == LSB - 1.)
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }

    // Count #bits with vcnt.8.
    EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
    SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
    SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);

    // Gather the #bits with vpaddl (pairwise add.)
    EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
    SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt8);
    if (ElemTy == MVT::i16)
      return Cnt16;

    EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
    SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt16);
    if (ElemTy == MVT::i32)
      return Cnt32;

    assert(ElemTy == MVT::i64);
    SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
        Cnt32);
    return Cnt64;
  }

  // Scalar path needs RBIT, which is v6T2 and later.
  if (!ST->hasV6T2Ops())
    return SDValue();

  // cttz(x) = ctlz(bitreverse(x))
  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
5485 
5486 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
5487 /// for each 16-bit element from operand, repeated.  The basic idea is to
5488 /// leverage vcnt to get the 8-bit counts, gather and add the results.
5489 ///
5490 /// Trace for v4i16:
5491 /// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
5492 /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
5493 /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
5494 /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
5495 ///            [b0 b1 b2 b3 b4 b5 b6 b7]
5496 ///           +[b1 b0 b3 b2 b5 b4 b7 b6]
5497 /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
5498 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
5499 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
5500   EVT VT = N->getValueType(0);
5501   SDLoc DL(N);
5502 
5503   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5504   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
5505   SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
5506   SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
5507   SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
5508   return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
5509 }
5510 
5511 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
5512 /// bit-count for each 16-bit element from the operand.  We need slightly
5513 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
5514 /// 64/128-bit registers.
5515 ///
5516 /// Trace for v4i16:
5517 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
5518 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
5519 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
5520 /// v4i16:Extracted = [k0    k1    k2    k3    ]
5521 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
5522   EVT VT = N->getValueType(0);
5523   SDLoc DL(N);
5524 
5525   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
5526   if (VT.is64BitVector()) {
5527     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
5528     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
5529                        DAG.getIntPtrConstant(0, DL));
5530   } else {
5531     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
5532                                     BitCounts, DAG.getIntPtrConstant(0, DL));
5533     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
5534   }
5535 }
5536 
5537 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
5538 /// bit-count for each 32-bit element from the operand.  The idea here is
5539 /// to split the vector into 16-bit elements, leverage the 16-bit count
5540 /// routine, and then combine the results.
5541 ///
5542 /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
5543 /// input    = [v0    v1    ] (vi: 32-bit elements)
5544 /// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
5545 /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
5546 /// vrev: N0 = [k1 k0 k3 k2 ]
5547 ///            [k0 k1 k2 k3 ]
5548 ///       N1 =+[k1 k0 k3 k2 ]
5549 ///            [k0 k2 k1 k3 ]
5550 ///       N2 =+[k1 k3 k0 k2 ]
5551 ///            [k0    k2    k1    k3    ]
5552 /// Extended =+[k1    k3    k0    k2    ]
5553 ///            [k0    k2    ]
5554 /// Extracted=+[k1    k3    ]
5555 ///
5556 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
5557   EVT VT = N->getValueType(0);
5558   SDLoc DL(N);
5559 
5560   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
5561 
5562   SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
5563   SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
5564   SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
5565   SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
5566   SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
5567 
5568   if (VT.is64BitVector()) {
5569     SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
5570     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
5571                        DAG.getIntPtrConstant(0, DL));
5572   } else {
5573     SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
5574                                     DAG.getIntPtrConstant(0, DL));
5575     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
5576   }
5577 }
5578 
5579 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5580                           const ARMSubtarget *ST) {
5581   EVT VT = N->getValueType(0);
5582 
5583   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5584   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
5585           VT == MVT::v4i16 || VT == MVT::v8i16) &&
5586          "Unexpected type for custom ctpop lowering");
5587 
5588   if (VT.getVectorElementType() == MVT::i32)
5589     return lowerCTPOP32BitElements(N, DAG);
5590   else
5591     return lowerCTPOP16BitElements(N, DAG);
5592 }
5593 
5594 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5595                           const ARMSubtarget *ST) {
5596   EVT VT = N->getValueType(0);
5597   SDLoc dl(N);
5598 
5599   if (!VT.isVector())
5600     return SDValue();
5601 
5602   // Lower vector shifts on NEON to use VSHL.
5603   assert(ST->hasNEON() && "unexpected vector shift");
5604 
5605   // Left shifts translate directly to the vshiftu intrinsic.
5606   if (N->getOpcode() == ISD::SHL)
5607     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5608                        DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
5609                                        MVT::i32),
5610                        N->getOperand(0), N->getOperand(1));
5611 
5612   assert((N->getOpcode() == ISD::SRA ||
5613           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
5614 
5615   // NEON uses the same intrinsics for both left and right shifts.  For
5616   // right shifts, the shift amounts are negative, so negate the vector of
5617   // shift amounts.
5618   EVT ShiftVT = N->getOperand(1).getValueType();
5619   SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
5620                                      getZeroVector(ShiftVT, DAG, dl),
5621                                      N->getOperand(1));
5622   Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
5623                              Intrinsic::arm_neon_vshifts :
5624                              Intrinsic::arm_neon_vshiftu);
5625   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
5626                      DAG.getConstant(vshiftInt, dl, MVT::i32),
5627                      N->getOperand(0), NegatedCount);
5628 }
5629 
5630 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5631                                 const ARMSubtarget *ST) {
5632   EVT VT = N->getValueType(0);
5633   SDLoc dl(N);
5634 
5635   // We can get here for a node like i32 = ISD::SHL i32, i64
5636   if (VT != MVT::i64)
5637     return SDValue();
5638 
5639   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
5640          "Unknown shift to lower!");
5641 
5642   // We only lower SRA, SRL of 1 here, all others use generic lowering.
5643   if (!isOneConstant(N->getOperand(1)))
5644     return SDValue();
5645 
5646   // If we are in thumb mode, we don't have RRX.
5647   if (ST->isThumb1Only()) return SDValue();
5648 
5649   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
5650   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5651                            DAG.getConstant(0, dl, MVT::i32));
5652   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5653                            DAG.getConstant(1, dl, MVT::i32));
5654 
5655   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
5656   // captures the result into a carry flag.
5657   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
5658   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
5659 
5660   // The low part is an ARMISD::RRX operand, which shifts the carry in.
5661   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
5662 
5663   // Merge the pieces into a single i64 value.
5664  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5665 }
5666 
/// LowerVSETCC - Lower a vector ISD::SETCC to the ARM NEON compare nodes
/// (VCEQ/VCGE/VCGT and their unsigned and compare-against-zero variants),
/// swapping or inverting operands for predicates NEON does not implement
/// directly, and expanding the ordered/unordered FP predicates that need a
/// pair of compares.
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  SDValue TmpOp0, TmpOp1;
  bool Invert = false;
  bool Swap = false;
  unsigned Opc = 0;

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  // The compare itself is done in the integer vector type of the same width;
  // the result is sign-extended/truncated to the requested type at the end.
  EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDLoc dl(Op);

  if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
      (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
    // Special-case integer 64-bit equality comparisons. They aren't legal,
    // but they can be lowered with a few vector instructions.
    unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
    EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
    SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
    SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
    // Compare the i32 halves pairwise, then AND each lane with its partner
    // (via VREV64) so a 64-bit lane is all-ones only if both halves matched.
    SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
                              DAG.getCondCode(ISD::SETEQ));
    SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
    SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
    Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
    if (SetCCOpcode == ISD::SETNE)
      Merged = DAG.getNOT(dl, Merged, CmpVT);
    Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
    return Merged;
  }

  if (CmpVT.getVectorElementType() == MVT::i64)
    // 64-bit comparisons are not legal in general.
    return SDValue();

  if (Op1.getValueType().isFloatingPoint()) {
    // Map each FP predicate onto VCEQ/VCGT/VCGE, recording whether the
    // operands must be swapped and/or the result inverted.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal FP comparison");
    case ISD::SETUNE:
    case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETOEQ:
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETOLT:
    case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETOGT:
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETOLE:
    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETOGE:
    case ISD::SETGE: Opc = ARMISD::VCGE; break;
    case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
    case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
    case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETONE:
      // Expand this to (OLT | OGT).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
      break;
    case ISD::SETUO:
      Invert = true;
      LLVM_FALLTHROUGH;
    case ISD::SETO:
      // Expand this to (OLT | OGE).
      TmpOp0 = Op0;
      TmpOp1 = Op1;
      Opc = ISD::OR;
      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
      Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
      break;
    }
  } else {
    // Integer comparisons.
    switch (SetCCOpcode) {
    default: llvm_unreachable("Illegal integer comparison");
    case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
    case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
    case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
    case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
    }

    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
    if (Opc == ARMISD::VCEQ) {
      SDValue AndOp;
      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
        AndOp = Op0;
      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
        AndOp = Op1;

      // Ignore bitconvert.
      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
        AndOp = AndOp.getOperand(0);

      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
        // VTST gives (and != 0), i.e. the inverse of (and == 0), so flip the
        // pending inversion.
        Opc = ARMISD::VTST;
        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
        Invert = !Invert;
      }
    }
  }

  // Apply the operand swap requested by the predicate mapping above.
  if (Swap)
    std::swap(Op0, Op1);

  // If one of the operands is a constant vector zero, attempt to fold the
  // comparison to a specialized compare-against-zero form.
  SDValue SingleOp;
  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
    SingleOp = Op0;
  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
    // Zero on the left: (0 >= x) is (x <= 0), (0 > x) is (x < 0).
    if (Opc == ARMISD::VCGE)
      Opc = ARMISD::VCLEZ;
    else if (Opc == ARMISD::VCGT)
      Opc = ARMISD::VCLTZ;
    SingleOp = Op1;
  }

  SDValue Result;
  if (SingleOp.getNode()) {
    switch (Opc) {
    case ARMISD::VCEQ:
      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCGE:
      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCLEZ:
      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCGT:
      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
    case ARMISD::VCLTZ:
      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
    default:
      // No compare-against-zero form for this opcode; emit the two-operand
      // compare unchanged.
      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
    }
  } else {
     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
  }

  // Convert the mask result to the type the caller expects.
  Result = DAG.getSExtOrTrunc(Result, dl, VT);

  // Apply any pending logical inversion (e.g. NE lowered via EQ).
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}
5824 
/// LowerSETCCCARRY - Lower an ISD::SETCCCARRY node (a comparison that also
/// consumes the borrow of a wider subtraction) to an ARMISD::SUBE whose flag
/// output feeds a CMOV selecting 0 or 1.
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  // have to invert the carry first.
  Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                      DAG.getConstant(1, DL, MVT::i32), Carry);
  // This converts the boolean value carry into the carry flag.
  Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

  // Subtract-with-carry; the second result is the flags value.
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);

  // CMOV selects 1 when the translated ARM condition holds, else 0.
  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  SDValue ARMcc = DAG.getConstant(
      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  // Copy the flags produced by the SUBE into CPSR so the CMOV can read them.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                   Cmp.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
                     CCR, Chain.getValue(1));
}
5854 
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON instruction with a "modified immediate"
/// operand (e.g., VMOV).  If so, return the encoded value.
///
/// On success, VT is set to the vector type the immediate should be built
/// with (a Q-register type when is128Bits, otherwise a D-register type), and
/// the result is the combined OpCmode/Imm encoding as an i32 target constant.
/// 'type' restricts the allowed encodings: the 8-bit and 64-bit forms exist
/// only for VMOV, and the cmode 0b110x forms are rejected for OtherModImm
/// (VORR/VBIC).
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, bool is128Bits,
                                 NEONModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8.  However, NEON modified
  // immediate instructions other than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK.  Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    // The "ones-filled" forms: undefined low bits are allowed to count as
    // ones since they can take any value.
    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32.  A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat.  But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Accumulate one Imm bit per byte: a byte must be all-ones (undef bits
    // may count as ones) or all-zeros, otherwise the value is not encodable.
    uint64_t BitMask = 0xff;
    uint64_t Val = 0;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Val |= BitMask;
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    if (DAG.getDataLayout().isBigEndian())
      // swap higher and lower 32 bit word
      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isNEONModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
5993 
/// LowerConstantFP - Lower an f32/f64 ConstantFP, preferring encodings that
/// avoid a constant-pool literal load: an integer-register move when
/// execute-only is enabled, a directly-encodable VFP immediate, or a NEON
/// VMOV/VMVN modified immediate (splatted, then bitcast/extracted back to a
/// scalar).  Returns SDValue() to fall back to the default lowering.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) const {
  EVT VT = Op.getValueType();
  bool IsDouble = (VT == MVT::f64);
  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  const APFloat &FPVal = CFP->getValueAPF();

  // Prevent floating-point constants from using literal loads
  // when execute-only is enabled.
  if (ST->genExecuteOnly()) {
    // If we can represent the constant as an immediate, don't lower it
    if (isFPImmLegal(FPVal, VT))
      return Op;
    // Otherwise, construct as integer, and move to float register
    APInt INTVal = FPVal.bitcastToAPInt();
    SDLoc DL(CFP);
    switch (VT.getSimpleVT().SimpleTy) {
      default:
        llvm_unreachable("Unknown floating point type!");
        break;
      case MVT::f64: {
        // Build the double from two i32 halves via VMOVDRR, swapping the
        // halves on big-endian targets.
        SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
        SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
        if (!ST->isLittle())
          std::swap(Lo, Hi);
        return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
      }
      case MVT::f32:
          return DAG.getNode(ARMISD::VMOVSR, DL, VT,
              DAG.getConstant(INTVal, DL, MVT::i32));
    }
  }

  // Immediate-form VMOV.f32/f64 requires VFPv3.
  if (!ST->hasVFP3())
    return SDValue();

  // Use the default (constant pool) lowering for double constants when we have
  // an SP-only FPU
  if (IsDouble && Subtarget->isFPOnlySP())
    return SDValue();

  // Try splatting with a VMOV.f32...
  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);

  if (ImmVal != -1) {
    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
      // We have code in place to select a valid ConstantFP already, no need to
      // do any mangling.
      return Op;
    }

    // It's a float and we are trying to use NEON operations where
    // possible. Lower it to a splat followed by an extract.
    SDLoc DL(Op);
    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                      NewVal);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // The rest of our options are NEON only, make sure that's allowed before
  // proceeding..
  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
    return SDValue();

  EVT VMovVT;
  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();

  // It wouldn't really be worth bothering for doubles except for one very
  // important value, which does happen to match: 0.0. So make sure we don't do
  // anything stupid.
  // (The NEON modified immediate is at most 32 bits wide, so a double only
  // works when both halves of its bit pattern are identical.)
  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
    return SDValue();

  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
                                     VMovVT, false, VMOVModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                      NewVal);
    // A double is just the 64-bit vector reinterpreted as f64.
    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // Finally, try a VMVN.i32
  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                             false, VMVNModImm);
  if (NewVal != SDValue()) {
    SDLoc DL(Op);
    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);

    if (IsDouble)
      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);

    // It's a float: cast and extract a vector element.
    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                       VecConstant);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  // No compact encoding found; use the default (constant pool) lowering.
  return SDValue();
}
6105 
6106 // check if an VEXT instruction can handle the shuffle mask when the
6107 // vector sources of the shuffle are the same.
6108 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6109   unsigned NumElts = VT.getVectorNumElements();
6110 
6111   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6112   if (M[0] < 0)
6113     return false;
6114 
6115   Imm = M[0];
6116 
6117   // If this is a VEXT shuffle, the immediate value is the index of the first
6118   // element.  The other shuffle indices must be the successive elements after
6119   // the first one.
6120   unsigned ExpectedElt = Imm;
6121   for (unsigned i = 1; i < NumElts; ++i) {
6122     // Increment the expected index.  If it wraps around, just follow it
6123     // back to index zero and keep going.
6124     ++ExpectedElt;
6125     if (ExpectedElt == NumElts)
6126       ExpectedElt = 0;
6127 
6128     if (M[i] < 0) continue; // ignore UNDEF indices
6129     if (ExpectedElt != static_cast<unsigned>(M[i]))
6130       return false;
6131   }
6132 
6133   return true;
6134 }
6135 
6136 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6137                        bool &ReverseVEXT, unsigned &Imm) {
6138   unsigned NumElts = VT.getVectorNumElements();
6139   ReverseVEXT = false;
6140 
6141   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6142   if (M[0] < 0)
6143     return false;
6144 
6145   Imm = M[0];
6146 
6147   // If this is a VEXT shuffle, the immediate value is the index of the first
6148   // element.  The other shuffle indices must be the successive elements after
6149   // the first one.
6150   unsigned ExpectedElt = Imm;
6151   for (unsigned i = 1; i < NumElts; ++i) {
6152     // Increment the expected index.  If it wraps around, it may still be
6153     // a VEXT but the source vectors must be swapped.
6154     ExpectedElt += 1;
6155     if (ExpectedElt == NumElts * 2) {
6156       ExpectedElt = 0;
6157       ReverseVEXT = true;
6158     }
6159 
6160     if (M[i] < 0) continue; // ignore UNDEF indices
6161     if (ExpectedElt != static_cast<unsigned>(M[i]))
6162       return false;
6163   }
6164 
6165   // Adjust the index value if the source operands will be swapped.
6166   if (ReverseVEXT)
6167     Imm -= NumElts;
6168 
6169   return true;
6170 }
6171 
6172 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
6173 /// instruction with the specified blocksize.  (The order of the elements
6174 /// within each block of the vector is reversed.)
6175 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
6176   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
6177          "Only possible block sizes for VREV are: 16, 32, 64");
6178 
6179   unsigned EltSz = VT.getScalarSizeInBits();
6180   if (EltSz == 64)
6181     return false;
6182 
6183   unsigned NumElts = VT.getVectorNumElements();
6184   unsigned BlockElts = M[0] + 1;
6185   // If the first shuffle index is UNDEF, be optimistic.
6186   if (M[0] < 0)
6187     BlockElts = BlockSize / EltSz;
6188 
6189   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
6190     return false;
6191 
6192   for (unsigned i = 0; i < NumElts; ++i) {
6193     if (M[i] < 0) continue; // ignore UNDEF indices
6194     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
6195       return false;
6196   }
6197 
6198   return true;
6199 }
6200 
6201 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
6202   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
6203   // range, then 0 is placed into the resulting vector. So pretty much any mask
6204   // of 8 elements can work here.
6205   return VT == MVT::v8i8 && M.size() == 8;
6206 }
6207 
6208 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
6209                                unsigned Index) {
6210   if (Mask.size() == Elements * 2)
6211     return Index / Elements;
6212   return Mask[Index] == 0 ? 0 : 1;
6213 }
6214 
6215 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
6216 // checking that pairs of elements in the shuffle mask represent the same index
6217 // in each vector, incrementing the expected index by 2 at each step.
6218 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
6219 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
6220 //  v2={e,f,g,h}
6221 // WhichResult gives the offset for each element in the mask based on which
6222 // of the two results it belongs to.
6223 //
6224 // The transpose can be represented either as:
6225 // result1 = shufflevector v1, v2, result1_shuffle_mask
6226 // result2 = shufflevector v1, v2, result2_shuffle_mask
6227 // where v1/v2 and the shuffle masks have the same number of elements
6228 // (here WhichResult (see below) indicates which result is being checked)
6229 //
6230 // or as:
6231 // results = shufflevector v1, v2, shuffle_mask
6232 // where both results are returned in one vector and the shuffle mask has twice
6233 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
6234 // want to check the low half and high half of the shuffle mask as if it were
6235 // the other case
6236 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6237   unsigned EltSz = VT.getScalarSizeInBits();
6238   if (EltSz == 64)
6239     return false;
6240 
6241   unsigned NumElts = VT.getVectorNumElements();
6242   if (M.size() != NumElts && M.size() != NumElts*2)
6243     return false;
6244 
6245   // If the mask is twice as long as the input vector then we need to check the
6246   // upper and lower parts of the mask with a matching value for WhichResult
6247   // FIXME: A mask with only even values will be rejected in case the first
6248   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
6249   // M[0] is used to determine WhichResult
6250   for (unsigned i = 0; i < M.size(); i += NumElts) {
6251     WhichResult = SelectPairHalf(NumElts, M, i);
6252     for (unsigned j = 0; j < NumElts; j += 2) {
6253       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6254           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
6255         return false;
6256     }
6257   }
6258 
6259   if (M.size() == NumElts*2)
6260     WhichResult = 0;
6261 
6262   return true;
6263 }
6264 
6265 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
6266 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6267 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
6268 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6269   unsigned EltSz = VT.getScalarSizeInBits();
6270   if (EltSz == 64)
6271     return false;
6272 
6273   unsigned NumElts = VT.getVectorNumElements();
6274   if (M.size() != NumElts && M.size() != NumElts*2)
6275     return false;
6276 
6277   for (unsigned i = 0; i < M.size(); i += NumElts) {
6278     WhichResult = SelectPairHalf(NumElts, M, i);
6279     for (unsigned j = 0; j < NumElts; j += 2) {
6280       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6281           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
6282         return false;
6283     }
6284   }
6285 
6286   if (M.size() == NumElts*2)
6287     WhichResult = 0;
6288 
6289   return true;
6290 }
6291 
6292 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
6293 // that the mask elements are either all even and in steps of size 2 or all odd
6294 // and in steps of size 2.
6295 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
6296 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
6297 //  v2={e,f,g,h}
6298 // Requires similar checks to that of isVTRNMask with
6299 // respect the how results are returned.
6300 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6301   unsigned EltSz = VT.getScalarSizeInBits();
6302   if (EltSz == 64)
6303     return false;
6304 
6305   unsigned NumElts = VT.getVectorNumElements();
6306   if (M.size() != NumElts && M.size() != NumElts*2)
6307     return false;
6308 
6309   for (unsigned i = 0; i < M.size(); i += NumElts) {
6310     WhichResult = SelectPairHalf(NumElts, M, i);
6311     for (unsigned j = 0; j < NumElts; ++j) {
6312       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
6313         return false;
6314     }
6315   }
6316 
6317   if (M.size() == NumElts*2)
6318     WhichResult = 0;
6319 
6320   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6321   if (VT.is64BitVector() && EltSz == 32)
6322     return false;
6323 
6324   return true;
6325 }
6326 
6327 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
6328 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6329 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
6330 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6331   unsigned EltSz = VT.getScalarSizeInBits();
6332   if (EltSz == 64)
6333     return false;
6334 
6335   unsigned NumElts = VT.getVectorNumElements();
6336   if (M.size() != NumElts && M.size() != NumElts*2)
6337     return false;
6338 
6339   unsigned Half = NumElts / 2;
6340   for (unsigned i = 0; i < M.size(); i += NumElts) {
6341     WhichResult = SelectPairHalf(NumElts, M, i);
6342     for (unsigned j = 0; j < NumElts; j += Half) {
6343       unsigned Idx = WhichResult;
6344       for (unsigned k = 0; k < Half; ++k) {
6345         int MIdx = M[i + j + k];
6346         if (MIdx >= 0 && (unsigned) MIdx != Idx)
6347           return false;
6348         Idx += 2;
6349       }
6350     }
6351   }
6352 
6353   if (M.size() == NumElts*2)
6354     WhichResult = 0;
6355 
6356   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6357   if (VT.is64BitVector() && EltSz == 32)
6358     return false;
6359 
6360   return true;
6361 }
6362 
6363 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
6364 // that pairs of elements of the shufflemask represent the same index in each
6365 // vector incrementing sequentially through the vectors.
6366 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
6367 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
6368 //  v2={e,f,g,h}
6369 // Requires similar checks to that of isVTRNMask with respect the how results
6370 // are returned.
6371 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6372   unsigned EltSz = VT.getScalarSizeInBits();
6373   if (EltSz == 64)
6374     return false;
6375 
6376   unsigned NumElts = VT.getVectorNumElements();
6377   if (M.size() != NumElts && M.size() != NumElts*2)
6378     return false;
6379 
6380   for (unsigned i = 0; i < M.size(); i += NumElts) {
6381     WhichResult = SelectPairHalf(NumElts, M, i);
6382     unsigned Idx = WhichResult * NumElts / 2;
6383     for (unsigned j = 0; j < NumElts; j += 2) {
6384       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6385           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
6386         return false;
6387       Idx += 1;
6388     }
6389   }
6390 
6391   if (M.size() == NumElts*2)
6392     WhichResult = 0;
6393 
6394   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6395   if (VT.is64BitVector() && EltSz == 32)
6396     return false;
6397 
6398   return true;
6399 }
6400 
6401 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
6402 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6403 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
6404 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6405   unsigned EltSz = VT.getScalarSizeInBits();
6406   if (EltSz == 64)
6407     return false;
6408 
6409   unsigned NumElts = VT.getVectorNumElements();
6410   if (M.size() != NumElts && M.size() != NumElts*2)
6411     return false;
6412 
6413   for (unsigned i = 0; i < M.size(); i += NumElts) {
6414     WhichResult = SelectPairHalf(NumElts, M, i);
6415     unsigned Idx = WhichResult * NumElts / 2;
6416     for (unsigned j = 0; j < NumElts; j += 2) {
6417       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6418           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
6419         return false;
6420       Idx += 1;
6421     }
6422   }
6423 
6424   if (M.size() == NumElts*2)
6425     WhichResult = 0;
6426 
6427   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6428   if (VT.is64BitVector() && EltSz == 32)
6429     return false;
6430 
6431   return true;
6432 }
6433 
6434 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
6435 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
6436 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
6437                                            unsigned &WhichResult,
6438                                            bool &isV_UNDEF) {
6439   isV_UNDEF = false;
6440   if (isVTRNMask(ShuffleMask, VT, WhichResult))
6441     return ARMISD::VTRN;
6442   if (isVUZPMask(ShuffleMask, VT, WhichResult))
6443     return ARMISD::VUZP;
6444   if (isVZIPMask(ShuffleMask, VT, WhichResult))
6445     return ARMISD::VZIP;
6446 
6447   isV_UNDEF = true;
6448   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
6449     return ARMISD::VTRN;
6450   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6451     return ARMISD::VUZP;
6452   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6453     return ARMISD::VZIP;
6454 
6455   return 0;
6456 }
6457 
6458 /// \return true if this is a reverse operation on an vector.
6459 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
6460   unsigned NumElts = VT.getVectorNumElements();
6461   // Make sure the mask has the right size.
6462   if (NumElts != M.size())
6463       return false;
6464 
6465   // Look for <15, ..., 3, -1, 1, 0>.
6466   for (unsigned i = 0; i != NumElts; ++i)
6467     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
6468       return false;
6469 
6470   return true;
6471 }
6472 
6473 // If N is an integer constant that can be moved into a register in one
6474 // instruction, return an SDValue of such a constant (will become a MOV
6475 // instruction).  Otherwise return null.
6476 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
6477                                      const ARMSubtarget *ST, const SDLoc &dl) {
6478   uint64_t Val;
6479   if (!isa<ConstantSDNode>(N))
6480     return SDValue();
6481   Val = cast<ConstantSDNode>(N)->getZExtValue();
6482 
6483   if (ST->isThumb1Only()) {
6484     if (Val <= 255 || ~Val <= 255)
6485       return DAG.getConstant(Val, dl, MVT::i32);
6486   } else {
6487     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
6488       return DAG.getConstant(Val, dl, MVT::i32);
6489   }
6490   return SDValue();
6491 }
6492 
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
//
// Strategy, in order of preference:
//   1. constant splats as immediate VMOV / VMVN / VMOV.f32;
//   2. splat the dominant value with VDUP/VDUPLANE, patching other lanes;
//   3. all-constant vectors -> default expansion (constant-pool load);
//   4. rebuild as a shuffle of the source vectors (ReconstructShuffle);
//   5. split a 128-bit vector into two 64-bit halves and recurse;
//   6. assemble wide (>=32-bit) elements via ARMISD::BUILD_VECTOR;
//   7. last resort: a chain of INSERT_VECTOR_ELT nodes.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    // Every bit undef: the whole vector is undef.
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    if (SplatBitSize <= 64) {
      // Check if an immediate VMOV works.
      EVT VmovVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                      VMOVModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isNEONModifiedImm(NegatedImm,
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                      VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool hasDominantValue = false;
  bool isConstant = true;

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  // No dominant value: pick an arbitrary (the first counted) value so the
  // single-element and single-value paths below still have something to use.
  if (!Value.getNode() && !ValueCounts.empty())
    Value = ValueCounts.begin()->first;

  // All operands were undef.
  if (ValueCounts.empty())
    return DAG.getUNDEF(VT);

  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we are hitting this case.
  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getScalarSizeInBits();

  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE. We can only do this if the lane being extracted
      // is at a constant index, as the VDUP from lane instructions only have
      // constant-index forms.
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create. We will insert the element
        // such that the register coalescer will remove unnecessary copies.
        if (VT != Value->getOperand(0).getValueType()) {
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                             VT.getVectorNumElements();
          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                        Value, DAG.getConstant(index, dl, MVT::i32)),
                           DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                        Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    // Constant FP elements: bitcast each lane to i32 and recurse, giving the
    // integer VMOV/VMVN immediate paths above another chance.
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    // A constant splat that fits in a single MOV/MVN: materialize the scalar
    // and VDUP it.
    if (usesOnlyOneValue) {
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
    // If we haven't found an efficient lowering, try splitting a 128-bit vector
    // into two 64-bit vectors; we might discover a better way to lower it.
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
    EVT ExtVT = VT.getVectorElementType();
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
    // Only profitable if both halves found a non-default lowering.
    if (Lower && Upper)
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}
6722 
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
//
// Walks the BUILD_VECTOR's operands (which must all be EXTRACT_VECTOR_ELT
// with constant indices), collects the distinct source vectors, massages each
// source to the output width (CONCAT with undef, EXTRACT_SUBVECTOR, or VEXT)
// and to a common element type (BITCAST), then emits a single
// VECTOR_SHUFFLE of at most two sources. Returns a null SDValue when the
// operands don't fit this shape or the resulting mask is not legal.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Per-source bookkeeping: one entry for each distinct vector the
  // BUILD_VECTOR extracts elements from.
  struct ShuffleSourceInfo {
    SDValue Vec;
    // Lowest and highest source lane referenced by any operand.
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    // Allows llvm::find(Sources, SourceVec) below to match on Vec.
    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  // NOTE: from here on NumElts is the lane count of the shuffle type, not of
  // the original BUILD_VECTOR result.
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      // Only exactly half-width sources can be handled here.
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    // Likewise only exactly double-width sources can be narrowed.
    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    // One original lane now spans WindowScale shuffle lanes.
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    // Every non-undef operand was registered in Sources by the first loop,
    // so this find cannot fail.
    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT))
    return SDValue();

  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], Mask);
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
6923 
6924 /// isShuffleMaskLegal - Targets can use this to indicate that they only
6925 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6926 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6927 /// are assumed to be legal.
6928 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
6929   if (VT.getVectorNumElements() == 4 &&
6930       (VT.is128BitVector() || VT.is64BitVector())) {
6931     unsigned PFIndexes[4];
6932     for (unsigned i = 0; i != 4; ++i) {
6933       if (M[i] < 0)
6934         PFIndexes[i] = 8;
6935       else
6936         PFIndexes[i] = M[i];
6937     }
6938 
6939     // Compute the index in the perfect shuffle table.
6940     unsigned PFTableIndex =
6941       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
6942     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6943     unsigned Cost = (PFEntry >> 30);
6944 
6945     if (Cost <= 4)
6946       return true;
6947   }
6948 
6949   bool ReverseVEXT, isV_UNDEF;
6950   unsigned Imm, WhichResult;
6951 
6952   unsigned EltSize = VT.getScalarSizeInBits();
6953   return (EltSize >= 32 ||
6954           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
6955           isVREVMask(M, VT, 64) ||
6956           isVREVMask(M, VT, 32) ||
6957           isVREVMask(M, VT, 16) ||
6958           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
6959           isVTBLMask(M, VT) ||
6960           isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
6961           ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
6962 }
6963 
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
///
/// As decoded below, PFEntry packs an operation in bits [29:26] and two
/// 13-bit operand IDs in bits [25:13] and [12:0]; each operand ID is itself
/// an index back into PerfectShuffleTable, so the shuffle is built by
/// recursing on both operands and then combining them with the decoded op.
/// (Bits [31:30] hold the table's cost field and are not used here.)
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL, // VUZP, left result
    OP_VUZPR, // VUZP, right result
    OP_VZIPL, // VZIP, left result
    OP_VZIPR, // VZIP, right result
    OP_VTRNL, // VTRN, left result
    OP_VTRNR  // VTRN, right result
  };

  if (OpNum == OP_COPY) {
    // The two recursion-terminating IDs: lanes <0,1,2,3> mean "LHS as-is",
    // lanes <4,5,6,7> mean "RHS as-is" (IDs are base-9 encoded lane lists).
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize both operands before applying this node's op.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    // The lane to duplicate is encoded by the opcode's offset from OP_VDUP0.
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    // The extract amount (1-3) is encoded by the offset from OP_VEXT1.
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    // These nodes produce two results; pick left (0) or right (1).
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}
7041 
7042 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
7043                                        ArrayRef<int> ShuffleMask,
7044                                        SelectionDAG &DAG) {
7045   // Check to see if we can use the VTBL instruction.
7046   SDValue V1 = Op.getOperand(0);
7047   SDValue V2 = Op.getOperand(1);
7048   SDLoc DL(Op);
7049 
7050   SmallVector<SDValue, 8> VTBLMask;
7051   for (ArrayRef<int>::iterator
7052          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
7053     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
7054 
7055   if (V2.getNode()->isUndef())
7056     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
7057                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7058 
7059   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
7060                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7061 }
7062 
7063 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
7064                                                       SelectionDAG &DAG) {
7065   SDLoc DL(Op);
7066   SDValue OpLHS = Op.getOperand(0);
7067   EVT VT = OpLHS.getValueType();
7068 
7069   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
7070          "Expect an v8i16/v16i8 type");
7071   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
7072   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
7073   // extract the first 8 bytes into the top double word and the last 8 bytes
7074   // into the bottom double word. The v8i16 case is similar.
7075   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
7076   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
7077                      DAG.getConstant(ExtractNum, DL, MVT::i32));
7078 }
7079 
/// Lower an ISD::VECTOR_SHUFFLE by trying progressively more general NEON
/// patterns: splats (VDUP/VDUPLANE), VEXT, VREV, the two-result in-place ops
/// (VUZP/VZIP/VTRN), the perfect-shuffle table for 4-element vectors,
/// element-wise BUILD_VECTOR expansion for wide elements, full reversals,
/// and finally VTBL for v8i8. Returns SDValue() if no pattern applies.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection.  This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  unsigned EltSize = VT.getScalarSizeInBits();
  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        // All operands past the first must be undef for the equivalence
        // to hold.
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // General splat: duplicate the selected lane across the result.
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT;
    unsigned Imm;
    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // Single-source rotation: VEXT with both operands the same register.
    if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult;
    bool isV_UNDEF;
    if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
            ShuffleMask, VT, WhichResult, isV_UNDEF)) {
      if (isV_UNDEF)
        V2 = V1;
      return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
          .getValue(WhichResult);
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(llvm::all_of(ShuffleMask, [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        // Emit the two-result op on the sub-vectors and concat both results.
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    // Undef mask elements are encoded as 8 in the perfect-shuffle table.
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    // Only use the table entry if it is cheap enough (cost in the top bits).
    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    // Extract each requested element from the appropriate source vector;
    // indices >= NumElts refer to V2.
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

  if (VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  // Nothing matched: let the generic legalizer deal with it.
  return SDValue();
}
7252 
7253 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
7254   // INSERT_VECTOR_ELT is legal only for immediate indexes.
7255   SDValue Lane = Op.getOperand(2);
7256   if (!isa<ConstantSDNode>(Lane))
7257     return SDValue();
7258 
7259   return Op;
7260 }
7261 
7262 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
7263   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
7264   SDValue Lane = Op.getOperand(1);
7265   if (!isa<ConstantSDNode>(Lane))
7266     return SDValue();
7267 
7268   SDValue Vec = Op.getOperand(0);
7269   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
7270     SDLoc dl(Op);
7271     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
7272   }
7273 
7274   return Op;
7275 }
7276 
7277 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7278   // The only time a CONCAT_VECTORS operation can have legal types is when
7279   // two 64-bit vectors are concatenated to a 128-bit vector.
7280   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
7281          "unexpected CONCAT_VECTORS");
7282   SDLoc dl(Op);
7283   SDValue Val = DAG.getUNDEF(MVT::v2f64);
7284   SDValue Op0 = Op.getOperand(0);
7285   SDValue Op1 = Op.getOperand(1);
7286   if (!Op0.isUndef())
7287     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
7288                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
7289                       DAG.getIntPtrConstant(0, dl));
7290   if (!Op1.isUndef())
7291     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
7292                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
7293                       DAG.getIntPtrConstant(1, dl));
7294   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
7295 }
7296 
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    // Each i64 element is split across two i32 BUILD_VECTOR operands; which
    // one holds the low half depends on endianness.
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      // For sign extension the high word must be the sign-extension of the
      // low word.
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      // For zero extension the high word must be zero.
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // For the non-v2i64 case, every constant element must fit in half the
  // element bit-width (signed or unsigned, as requested).
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    // Non-constant element: not an extended constant BUILD_VECTOR.
    return false;
  }

  return true;
}
7350 
7351 /// isSignExtended - Check if a node is a vector value that is sign-extended
7352 /// or a constant BUILD_VECTOR with sign-extended elements.
7353 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
7354   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
7355     return true;
7356   if (isExtendedBUILD_VECTOR(N, DAG, true))
7357     return true;
7358   return false;
7359 }
7360 
7361 /// isZeroExtended - Check if a node is a vector value that is zero-extended
7362 /// or a constant BUILD_VECTOR with zero-extended elements.
7363 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
7364   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
7365     return true;
7366   if (isExtendedBUILD_VECTOR(N, DAG, false))
7367     return true;
7368   return false;
7369 }
7370 
7371 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
7372   if (OrigVT.getSizeInBits() >= 64)
7373     return OrigVT;
7374 
7375   assert(OrigVT.isSimple() && "Expecting a simple value type");
7376 
7377   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
7378   switch (OrigSimpleTy) {
7379   default: llvm_unreachable("Unexpected Vector Type");
7380   case MVT::v2i8:
7381   case MVT::v2i16:
7382      return MVT::v2i32;
7383   case MVT::v4i8:
7384     return  MVT::v4i16;
7385   }
7386 }
7387 
7388 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
7389 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
7390 /// We insert the required extension here to get the vector to fill a D register.
7391 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
7392                                             const EVT &OrigTy,
7393                                             const EVT &ExtTy,
7394                                             unsigned ExtOpcode) {
7395   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
7396   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
7397   // 64-bits we need to insert a new extension so that it will be 64-bits.
7398   assert(ExtTy.is128BitVector() && "Unexpected extension size");
7399   if (OrigTy.getSizeInBits() >= 64)
7400     return N;
7401 
7402   // Must extend size to at least 64 bits to be used as an operand for VMULL.
7403   EVT NewVT = getExtensionTo64Bits(OrigTy);
7404 
7405   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
7406 }
7407 
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(),
                       LD->getAlignment(), LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/zext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  // The extending load keeps the original memory type but produces a value
  // already widened to the 64-bit-capable ExtendedTy.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlignment(),
                        LD->getMemOperand()->getFlags());
}
7430 
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add a an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  // Explicit extension node: strip it, re-extending only as far as needed
  // to fill a D register.
  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    // Replace the extending load with a plain (or minimally-extending) load
    // plus a separate extend, rewiring both the chain and the value so that
    // other users of the original load keep working.
    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    // Keep only the low (endianness-dependent) half of each i64 element.
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
7486 
7487 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
7488   unsigned Opcode = N->getOpcode();
7489   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7490     SDNode *N0 = N->getOperand(0).getNode();
7491     SDNode *N1 = N->getOperand(1).getNode();
7492     return N0->hasOneUse() && N1->hasOneUse() &&
7493       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
7494   }
7495   return false;
7496 }
7497 
7498 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
7499   unsigned Opcode = N->getOpcode();
7500   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
7501     SDNode *N0 = N->getOperand(0).getNode();
7502     SDNode *N1 = N->getOperand(1).getNode();
7503     return N0->hasOneUse() && N1->hasOneUse() &&
7504       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
7505   }
7506   return false;
7507 }
7508 
/// Lower a 128-bit integer vector multiply, recognizing widening-multiply
/// patterns so they can be emitted as VMULL (and, for (a +/- b) * c shapes,
/// as a VMULL/VMLAL-style pair).
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  // isMLA is set when one operand is an add/sub of extended values, so the
  // multiply is distributed over it (see the comment before the MLA code).
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    // Both operands sign-extended: signed widening multiply.
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      // Both operands zero-extended: unsigned widening multiply.
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // The add/sub is on the other side; swap so N0 is the add/sub.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this.  It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    // Plain widening multiply: strip both extensions and emit one VMULL.
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Rebuild the add/sub (N0's opcode) over the two widening multiplies.
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
7583 
/// Lower a signed division of 4 lanes holding i8-range values, producing a
/// v4i16 result, using the NEON reciprocal-estimate float recipe. Because
/// the inputs only span the i8 range, no Newton refinement step is needed.
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps.  This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  // The bias is added to the float's bit pattern, nudging the estimate so
  // truncation rounds to the exact quotient.
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}
7614 
/// Lower a signed v4i16 division using the NEON reciprocal-estimate float
/// recipe with one Newton-Raphson refinement step.
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step.  This requires that we use a weird bias
  // of 89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  // Nudge the float's bit pattern so truncation yields the exact quotient.
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
7653 
/// Custom-lower ISD::SDIV for v4i16 and v8i8. The v8i8 case is split into
/// two v4i16 halves, each divided via the reciprocal-estimate helper, then
/// reassembled and truncated back to v8i8.
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Widen both operands to v8i16 so each half fits the v4i16 helper.
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    // Split each operand into its high (N2/N3) and low (N0/N1) halves.
    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    // Values came from i8, so the cheaper no-refinement recipe is enough.
    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    // Stitch the halves back together and narrow to the original type.
    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
7688 
/// Custom-lower ISD::UDIV for the NEON vector types v8i8 and v4i16.
///
/// NEON has no integer divide instruction, so the division is performed in
/// floating point using a VRECPE reciprocal estimate refined with VRECPS
/// Newton steps; v8i8 is first widened and handled as two v4i16 halves.
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    // Zero-extend to v8i16; the lane values are then at most 255, so the
    // signed v4i16 division helper yields the correct unsigned quotient.
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    // Split into high (lanes 4-7, N2/N3) and low (lanes 0-3, N0/N1) halves.
    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG);

    // Narrow the v8i16 quotients back to v8i8 with a saturating move.
    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 udiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
7764 
/// Lower ISD::ADDCARRY / ISD::SUBCARRY into ARMISD::ADDE / ARMISD::SUBE.
///
/// The generic nodes use a boolean (0/1) carry operand and produce a boolean
/// carry result, while the ARM nodes consume and produce the CPSR carry
/// flag, so the carry value is converted on the way in and back out.  For
/// subtraction the ARM flag is a carry rather than the borrow ISD::SUBCARRY
/// expects, so it is additionally inverted (1 - C) on both sides of SUBE.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  EVT VT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  // Incoming boolean carry/borrow (operand 2 of ADDCARRY/SUBCARRY).
  SDValue Carry = Op.getOperand(2);

  SDLoc DL(Op);

  SDValue Result;
  if (Op.getOpcode() == ISD::ADDCARRY) {
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the addition proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  } else {
    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
    // have to invert the carry first.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
    // This converts the boolean value carry into the carry flag.
    Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);

    // Do the subtraction proper using the carry flag we wanted.
    Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
                         Op.getOperand(1), Carry);

    // Now convert the carry flag into a boolean value.
    Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
    // But the carry returned by ARMISD::SUBE is not a borrow as expected
    // by ISD::SUBCARRY, so compute 1 - C.
    Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Carry);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
}
7808 
/// Lower ISD::FSINCOS (Darwin only) into a single call to __sincos_stret,
/// which computes sin and cos together.  Under the APCS ABI the {sin, cos}
/// pair is returned through an sret stack slot and loaded back out; other
/// ABIs return the pair directly from the call.
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(ArgTy, ArgTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy Args;
  // APCS returns the struct via an sret pointer argument.
  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  SDValue SRet;
  if (ShouldUseSRet) {
    // Create stack object for sret.
    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

    // The sret pointer is the first (hidden) argument.
    ArgListEntry Entry;
    Entry.Node = SRet;
    Entry.Ty = RetTy->getPointerTo();
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Entry.IsSRet = true;
    Args.push_back(Entry);
    // With sret the call itself returns nothing.
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  // The actual floating-point argument.
  ArgListEntry Entry;
  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC =
      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(Args))
      .setDiscardResult(ShouldUseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Without sret the call returns the {sin, cos} pair directly.
  if (!ShouldUseSRet)
    return CallResult.first;

  // Load sin from the first field of the sret slot.
  SDValue LoadSin =
      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

  // Address of cos field.
  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  SDValue LoadCos =
      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
                     LoadSin.getValue(0), LoadCos.getValue(0));
}
7883 
7884 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
7885                                                   bool Signed,
7886                                                   SDValue &Chain) const {
7887   EVT VT = Op.getValueType();
7888   assert((VT == MVT::i32 || VT == MVT::i64) &&
7889          "unexpected type for custom lowering DIV");
7890   SDLoc dl(Op);
7891 
7892   const auto &DL = DAG.getDataLayout();
7893   const auto &TLI = DAG.getTargetLoweringInfo();
7894 
7895   const char *Name = nullptr;
7896   if (Signed)
7897     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
7898   else
7899     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
7900 
7901   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
7902 
7903   ARMTargetLowering::ArgListTy Args;
7904 
7905   for (auto AI : {1, 0}) {
7906     ArgListEntry Arg;
7907     Arg.Node = Op.getOperand(AI);
7908     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
7909     Args.push_back(Arg);
7910   }
7911 
7912   CallLoweringInfo CLI(DAG);
7913   CLI.setDebugLoc(dl)
7914     .setChain(Chain)
7915     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
7916                ES, std::move(Args));
7917 
7918   return LowerCallTo(CLI).first;
7919 }
7920 
7921 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
7922                                             bool Signed) const {
7923   assert(Op.getValueType() == MVT::i32 &&
7924          "unexpected type for custom lowering DIV");
7925   SDLoc dl(Op);
7926 
7927   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
7928                                DAG.getEntryNode(), Op.getOperand(1));
7929 
7930   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7931 }
7932 
7933 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
7934   SDLoc DL(N);
7935   SDValue Op = N->getOperand(1);
7936   if (N->getValueType(0) == MVT::i32)
7937     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
7938   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7939                            DAG.getConstant(0, DL, MVT::i32));
7940   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
7941                            DAG.getConstant(1, DL, MVT::i32));
7942   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
7943                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
7944 }
7945 
7946 void ARMTargetLowering::ExpandDIV_Windows(
7947     SDValue Op, SelectionDAG &DAG, bool Signed,
7948     SmallVectorImpl<SDValue> &Results) const {
7949   const auto &DL = DAG.getDataLayout();
7950   const auto &TLI = DAG.getTargetLoweringInfo();
7951 
7952   assert(Op.getValueType() == MVT::i64 &&
7953          "unexpected type for custom lowering DIV");
7954   SDLoc dl(Op);
7955 
7956   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
7957 
7958   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
7959 
7960   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
7961   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
7962                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
7963   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
7964 
7965   Results.push_back(Lower);
7966   Results.push_back(Upper);
7967 }
7968 
7969 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
7970   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
7971     // Acquire/Release load/store is not legal for targets without a dmb or
7972     // equivalent available.
7973     return SDValue();
7974 
7975   // Monotonic load/store is legal for all targets.
7976   return Op;
7977 }
7978 
/// Expand READCYCLECOUNTER into an MRC read of the 32-bit cycle counter,
/// widened to the i64 result type with a zero high half.
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  // Under Power Management extensions, the cycle-count is:
  //    mrc p15, #0, <Rt>, c9, c13, #0
  SDValue Ops[] = { N->getOperand(0), // Chain
                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getConstant(15, DL, MVT::i32), // coprocessor p15
                    DAG.getConstant(0, DL, MVT::i32),  // opc1 = #0
                    DAG.getConstant(9, DL, MVT::i32),  // CRn  = c9
                    DAG.getConstant(13, DL, MVT::i32), // CRm  = c13
                    DAG.getConstant(0, DL, MVT::i32)   // opc2 = #0
  };

  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
  // The counter is only 32 bits wide; build the i64 result with the counter
  // in the low half and zero in the high half, plus the output chain.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
                                DAG.getConstant(0, DL, MVT::i32)));
  Results.push_back(Cycles32.getValue(1));
}
8001 
8002 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
8003   SDLoc dl(V.getNode());
8004   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
8005   SDValue VHi = DAG.getAnyExtOrTrunc(
8006       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
8007       dl, MVT::i32);
8008   bool isBigEndian = DAG.getDataLayout().isBigEndian();
8009   if (isBigEndian)
8010     std::swap (VLo, VHi);
8011   SDValue RegClass =
8012       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
8013   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
8014   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
8015   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
8016   return SDValue(
8017       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
8018 }
8019 
/// Expand a 64-bit ATOMIC_CMP_SWAP into the ARM::CMP_SWAP_64 pseudo.
///
/// The expected and new values are packed into GPRPair operands; the
/// Untyped pair result is split back into two i32 values (low half first,
/// accounting for endianness), followed by the output chain.
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                       SmallVectorImpl<SDValue> & Results,
                                       SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");
  // Operands: pointer, expected value (GPRPair), new value (GPRPair), chain.
  SDValue Ops[] = {N->getOperand(1),
                   createGPRPairNode(DAG, N->getOperand(2)),
                   createGPRPairNode(DAG, N->getOperand(3)),
                   N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);

  // Carry the memory operand over so later passes know what this accesses.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  bool isBigEndian = DAG.getDataLayout().isBigEndian();

  // Low half first, then high half; on big-endian the subregisters swap.
  Results.push_back(
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
  Results.push_back(
      DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
                                 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
  Results.push_back(SDValue(CmpSwap, 2));
}
8046 
/// Lower ISD::FPOWI for MSVCRT targets by calling pow/powf, converting the
/// i32 exponent to floating point first.  The call is emitted as a tail
/// call when the node is in tail-call position and the types line up.
static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
                          SelectionDAG &DAG) {
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
         "Custom lowering is MSVCRT specific!");

  SDLoc dl(Op);
  SDValue Val = Op.getOperand(0);
  MVT Ty = Val->getSimpleValueType(0);
  // pow/powf take a floating-point exponent, so convert the integer one.
  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
  SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
                                         TLI.getPointerTy(DAG.getDataLayout()));

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Val;
  Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Entry.Node = Exponent;
  Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
  Entry.IsZExt = true;
  Args.push_back(Entry);

  Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());

  // The in-chain to the call is the entry node.  If we are emitting a
  // tailcall, the chain will be mutated if the node has a non-entry input
  // chain.
  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;

  const Function &F = DAG.getMachineFunction().getFunction();
  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
              F.getReturnType() == LCRTy;
  if (IsTC)
    InChain = TCChain;

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
      .setTailCall(IsTC);
  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);

  // Return the chain (the DAG root) if it is a tail call
  return !CI.second.getNode() ? DAG.getRoot() : CI.first;
}
8098 
/// Central dispatch for every operation this target marked as Custom:
/// route each opcode to its dedicated lowering helper.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Don't know how to custom lower this!");
  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT:        return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
  case ISD::VASTART:       return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                               Subtarget);
  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
  case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
  case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL:           return LowerMUL(Op, DAG);
  case ISD::SDIV:
    // Scalar division on Windows goes through the runtime helpers.
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG);
  case ISD::UDIV:
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:      return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    if (Subtarget->isTargetWindows())
      return LowerDYNAMIC_STACKALLOC(Op, DAG);
    llvm_unreachable("Don't know how to custom lower this!");
  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
  case ARMISD::WIN__DBZCHK: return SDValue();
  }
}
8181 
8182 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
8183                                  SelectionDAG &DAG) {
8184   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
8185   unsigned Opc = 0;
8186   if (IntNo == Intrinsic::arm_smlald)
8187     Opc = ARMISD::SMLALD;
8188   else if (IntNo == Intrinsic::arm_smlaldx)
8189     Opc = ARMISD::SMLALDX;
8190   else if (IntNo == Intrinsic::arm_smlsld)
8191     Opc = ARMISD::SMLSLD;
8192   else if (IntNo == Intrinsic::arm_smlsldx)
8193     Opc = ARMISD::SMLSLDX;
8194   else
8195     return;
8196 
8197   SDLoc dl(N);
8198   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
8199                            N->getOperand(3),
8200                            DAG.getConstant(0, dl, MVT::i32));
8201   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
8202                            N->getOperand(3),
8203                            DAG.getConstant(1, dl, MVT::i32));
8204 
8205   SDValue LongMul = DAG.getNode(Opc, dl,
8206                                 DAG.getVTList(MVT::i32, MVT::i32),
8207                                 N->getOperand(1), N->getOperand(2),
8208                                 Lo, Hi);
8209   Results.push_back(LongMul.getValue(0));
8210   Results.push_back(LongMul.getValue(1));
8211 }
8212 
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
/// Single-result cases set Res and fall through to the common push at the
/// bottom; multi-result cases fill in Results themselves and return early.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Res;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this!");
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::BITCAST:
    Res = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
    Res = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Res = LowerREM(N, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                             Results);
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    return ReplaceLongIntrinsic(N, Results, DAG);
  }
  if (Res.getNode())
    Results.push_back(Res);
}
8260 
8261 //===----------------------------------------------------------------------===//
8262 //                           ARM Scheduler Hooks
8263 //===----------------------------------------------------------------------===//
8264 
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.  Materializes the PC-relative address of
/// DispatchBB (via a constant-pool entry and a PIC add) and stores it into
/// the jump-buffer slot of the function context at frame index FI, emitting
/// the appropriate Thumb2, Thumb1, or ARM instruction sequence.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported with SjLj");
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineConstantPool *MCP = MF->getConstantPool();
  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  const Function &F = MF->getFunction();

  bool isThumb = Subtarget->isThumb();
  bool isThumb2 = Subtarget->isThumb2();

  unsigned PCLabelId = AFI->createPICLabelUId();
  // PC-relative adjustment: the PC reads 4 bytes ahead in Thumb mode and 8
  // in ARM mode.
  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  ARMConstantPoolValue *CPV =
    ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  // 4-byte-aligned constant-pool entry holding the DispatchBB address.
  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                           : &ARM::GPRRegClass;

  // Grab constant pool and fixed stack memory operands.
  MachineMemOperand *CPMMO =
      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                               MachineMemOperand::MOLoad, 4, 4);

  MachineMemOperand *FIMMOSt =
      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                               MachineMemOperand::MOStore, 4, 4);

  // Load the address of the dispatch MBB into the jump buffer.
  if (isThumb2) {
    // Incoming value: jbuf
    //   ldr.n  r5, LCPI1_1
    //   orr    r5, r5, #1
    //   add    r5, pc
    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    // Set the low bit because of thumb mode.
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(0x01)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
      .addReg(NewVReg2, RegState::Kill)
      .addImm(PCLabelId);
    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
        .addReg(NewVReg3, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else if (isThumb) {
    // Incoming value: jbuf
    //   ldr.n  r1, LCPI1_4
    //   add    r1, pc
    //   mov    r2, #1
    //   orrs   r1, r2
    //   add    r2, $jbuf, #+4 ; &jbuf[1]
    //   str    r1, [r2]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
      .addReg(NewVReg1, RegState::Kill)
      .addImm(PCLabelId);
    // Set the low bit because of thumb mode.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
        .addReg(ARM::CPSR, RegState::Define)
        .addImm(1)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3, RegState::Kill)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
            .addFrameIndex(FI)
            .addImm(36); // &jbuf[1] :: pc
    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg5, RegState::Kill)
        .addImm(0)
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  } else {
    // Incoming value: jbuf
    //   ldr  r1, LCPI1_1
    //   add  r1, pc, r1
    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addConstantPoolIndex(CPI)
        .addImm(0)
        .addMemOperand(CPMMO)
        .add(predOps(ARMCC::AL));
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
        .addReg(NewVReg1, RegState::Kill)
        .addImm(PCLabelId)
        .add(predOps(ARMCC::AL));
    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
        .addReg(NewVReg2, RegState::Kill)
        .addFrameIndex(FI)
        .addImm(36) // &jbuf[1] :: pc
        .addMemOperand(FIMMOSt)
        .add(predOps(ARMCC::AL));
  }
}
8394 
/// Emit the SjLj exception dispatch block: load the call-site index stored in
/// the function context on the stack, range-check it against the number of
/// landing pads (trapping when out of range), and branch through an inline
/// jump table to the matching landing pad. All invoke blocks are rewired so
/// the dispatch block becomes the only EH pad in the function.
void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  // Frame index of the SjLj function context; the call-site index is loaded
  // from within this slot below.
  int FI = MFI.getFunctionContextIndex();

  // Register class for the scratch registers created below.
  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                        : &ARM::GPRnopcRegClass;

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
       ++BB) {
    if (!BB->isEHPad()) continue;

    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
    // pad.
    for (MachineBasicBlock::iterator
           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
      if (!II->isEHLabel()) continue;

      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
      if (!MF->hasCallSiteLandingPad(Sym)) continue;

      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
      for (SmallVectorImpl<unsigned>::iterator
             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
           CSI != CSE; ++CSI) {
        CallSiteNumToLPad[*CSI].push_back(&*BB);
        MaxCSNum = std::max(MaxCSNum, *CSI);
      }
      break;
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  // Call-site numbers start at 1; index I of the jump table corresponds to
  // call-site number I+1.
  std::vector<MachineBasicBlock*> LPadList;
  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());
  for (unsigned I = 1; I <= MaxCSNum; ++I) {
    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
    for (SmallVectorImpl<MachineBasicBlock*>::iterator
           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
      LPadList.push_back(*II);
      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the jump table and associated information.
  MachineJumpTableInfo *JTI =
    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad();

  // TrapBB is reached when the loaded call-site index exceeds the number of
  // landing pads, which should never happen at runtime.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  unsigned trap_opcode;
  if (Subtarget->isThumb())
    trap_opcode = ARM::tTRAP;
  else
    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

  BuildMI(TrapBB, dl, TII->get(trap_opcode));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert the new MBBs at the end of the function.
  MF->insert(MF->end(), DispatchBB);
  MF->insert(MF->end(), DispContBB);
  MF->insert(MF->end(), TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI),
      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

  MachineInstrBuilder MIB;
  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

  // Add a register mask with no preserved registers.  This results in all
  // registers being marked as clobbered. This can't work if the dispatch block
  // is in a Thumb1 function and is linked with ARM code which uses the FP
  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));

  bool IsPositionIndependent = isPositionIndependent();
  unsigned NumLPads = LPadList.size();
  // Emit the load/compare/trap-branch/jump-table sequence in the form
  // appropriate for the current ISA (Thumb2, Thumb1, or ARM).
  if (Subtarget->isThumb2()) {
    // Load the call-site index from the function context.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      // Small pad count: compare against an immediate.
      // (LPadList.size() == NumLPads.)
      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
          .addReg(NewVReg1)
          .addImm(LPadList.size())
          .add(predOps(ARMCC::AL));
    } else {
      // Materialize NumLPads with MOVW (+ MOVT if it has high bits set), then
      // do a register-register compare.
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    }

    // Trap if the index is out of range (unsigned higher).
    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    // Compute the jump table base and index into it (index * 4).
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
      .addReg(NewVReg4, RegState::Kill)
      .addReg(NewVReg1)
      .addJumpTableIndex(MJTI);
  } else if (Subtarget->isThumb()) {
    // Thumb1: tLDRspi scales its immediate by 4, so offset 1 == byte offset 4.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
        .addFrameIndex(FI)
        .addImm(1)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else {
      // Thumb1 has no MOVW/MOVT; load NumLPads from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
          .addReg(NewVReg1)
          .addReg(VReg1)
          .add(predOps(ARMCC::AL));
    }

    // Trap if the index is out of range (unsigned higher).
    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    // Scale the index by 4 (jump table entry size).
    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg1)
        .addImm(2)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    // Address of the jump table entry = base + scaled index.
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
        .addReg(ARM::CPSR, RegState::Define)
        .addReg(NewVReg2, RegState::Kill)
        .addReg(NewVReg3)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);

    // Load the destination from the jump table entry.
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
        .addReg(NewVReg4, RegState::Kill)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    unsigned NewVReg6 = NewVReg5;
    if (IsPositionIndependent) {
      // PIC jump tables hold offsets relative to the table base; add it back.
      NewVReg6 = MRI->createVirtualRegister(TRC);
      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
          .addReg(ARM::CPSR, RegState::Define)
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg3)
          .add(predOps(ARMCC::AL));
    }

    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
      .addReg(NewVReg6, RegState::Kill)
      .addJumpTableIndex(MJTI);
  } else {
    // ARM mode: load the call-site index from the function context.
    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
        .addFrameIndex(FI)
        .addImm(4)
        .addMemOperand(FIMMOLd)
        .add(predOps(ARMCC::AL));

    if (NumLPads < 256) {
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
          .addReg(NewVReg1)
          .addImm(NumLPads)
          .add(predOps(ARMCC::AL));
    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
      // Materialize NumLPads with MOVW (+ MOVT if needed).
      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
          .addImm(NumLPads & 0xFFFF)
          .add(predOps(ARMCC::AL));

      unsigned VReg2 = VReg1;
      if ((NumLPads & 0xFFFF0000) != 0) {
        VReg2 = MRI->createVirtualRegister(TRC);
        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
            .addReg(VReg1)
            .addImm(NumLPads >> 16)
            .add(predOps(ARMCC::AL));
      }

      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg2)
          .add(predOps(ARMCC::AL));
    } else {
      // No MOVW available; load NumLPads from the constant pool.
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

      // MachineConstantPool wants an explicit alignment.
      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
      if (Align == 0)
        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

      unsigned VReg1 = MRI->createVirtualRegister(TRC);
      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
          .addReg(VReg1, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
          .addReg(NewVReg1)
          .addReg(VReg1, RegState::Kill)
          .add(predOps(ARMCC::AL));
    }

    // Trap if the index is out of range (unsigned higher).
    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::HI)
      .addReg(ARM::CPSR);

    // Scale the index by 4 and compute the jump table base.
    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
        .addReg(NewVReg1)
        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
        .addJumpTableIndex(MJTI)
        .add(predOps(ARMCC::AL));

    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
    // Load the destination from the jump table entry.
    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
        .addReg(NewVReg3, RegState::Kill)
        .addReg(NewVReg4)
        .addImm(0)
        .addMemOperand(JTMMOLd)
        .add(predOps(ARMCC::AL));

    if (IsPositionIndependent) {
      // PIC entries are table-relative; BR_JTadd adds the base back.
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
        .addReg(NewVReg5, RegState::Kill)
        .addReg(NewVReg4)
        .addJumpTableIndex(MJTI);
    } else {
      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
        .addReg(NewVReg5, RegState::Kill)
        .addJumpTableIndex(MJTI);
    }
  }

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  for (std::vector<MachineBasicBlock*>::iterator
         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
    MachineBasicBlock *CurMBB = *I;
    if (SeenMBBs.insert(CurMBB).second)
      DispContBB->addSuccessor(CurMBB);
  }

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  SmallVector<MachineBasicBlock*, 64> MBBLPads;
  for (MachineBasicBlock *BB : InvokeBBs) {

    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
                                                  BB->succ_end());
    while (!Successors.empty()) {
      MachineBasicBlock *SMBB = Successors.pop_back_val();
      if (SMBB->isEHPad()) {
        BB->removeSuccessor(SMBB);
        MBBLPads.push_back(SMBB);
      }
    }

    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
    BB->normalizeSuccProbs();

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      // Collect the registers already mentioned on the call so we don't add
      // duplicate implicit defs.
      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        // Only add registers the current ISA mode can actually save/restore.
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (SmallVectorImpl<MachineBasicBlock*>::iterator
         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
    (*I)->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}
8807 
8808 static
8809 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
8810   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
8811        E = MBB->succ_end(); I != E; ++I)
8812     if (*I != Succ)
8813       return *I;
8814   llvm_unreachable("Expecting a BB with two successors!");
8815 }
8816 
8817 /// Return the load opcode for a given load size. If load size >= 8,
8818 /// neon opcode will be returned.
8819 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
8820   if (LdSize >= 8)
8821     return LdSize == 16 ? ARM::VLD1q32wb_fixed
8822                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
8823   if (IsThumb1)
8824     return LdSize == 4 ? ARM::tLDRi
8825                        : LdSize == 2 ? ARM::tLDRHi
8826                                      : LdSize == 1 ? ARM::tLDRBi : 0;
8827   if (IsThumb2)
8828     return LdSize == 4 ? ARM::t2LDR_POST
8829                        : LdSize == 2 ? ARM::t2LDRH_POST
8830                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
8831   return LdSize == 4 ? ARM::LDR_POST_IMM
8832                      : LdSize == 2 ? ARM::LDRH_POST
8833                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
8834 }
8835 
8836 /// Return the store opcode for a given store size. If store size >= 8,
8837 /// neon opcode will be returned.
8838 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
8839   if (StSize >= 8)
8840     return StSize == 16 ? ARM::VST1q32wb_fixed
8841                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
8842   if (IsThumb1)
8843     return StSize == 4 ? ARM::tSTRi
8844                        : StSize == 2 ? ARM::tSTRHi
8845                                      : StSize == 1 ? ARM::tSTRBi : 0;
8846   if (IsThumb2)
8847     return StSize == 4 ? ARM::t2STR_POST
8848                        : StSize == 2 ? ARM::t2STRH_POST
8849                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
8850   return StSize == 4 ? ARM::STR_POST_IMM
8851                      : StSize == 2 ? ARM::STRH_POST
8852                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
8853 }
8854 
/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos. Loads LdSize bytes into Data from the address
/// in AddrIn and defines AddrOut as the incremented address. The operand
/// layout differs per encoding, hence the four cases below.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    // NEON load with base writeback (fixed post-increment).
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // load + update AddrIn
    // Thumb1 has no post-indexed load, so emit a plain load followed by a
    // separate address increment (tADDi8 sets flags via t1CondCodeOp).
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    // Thumb2 post-indexed load: immediate offset equals the unit size.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed load: extra zero register operand for the (unused)
    // register-offset form.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}
8895 
/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos. Stores StSize bytes from Data to the address
/// in AddrIn and defines AddrOut as the incremented address. Mirrors
/// emitPostLd; the operand layout differs per encoding.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    // NEON store with base writeback; AddrOut is the instruction's def.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // store + update AddrIn
    // Thumb1 has no post-indexed store, so emit a plain store followed by a
    // separate address increment (tADDi8 sets flags via t1CondCodeOp).
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    // Thumb2 post-indexed store: immediate offset equals the unit size.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    // ARM post-indexed store: extra zero register operand for the (unused)
    // register-offset form.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  }
}
8937 
/// Expand the struct-byval copy pseudo into either a fully unrolled sequence
/// of post-increment load/store pairs (for small sizes) or a copy loop plus
/// an unrolled byte-copy epilogue for the remainder.
MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  unsigned dest = MI.getOperand(0).getReg();
  unsigned src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Align = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  // Pick the largest unit size the alignment permits.
  if (Align & 1) {
    UnitSize = 1;
  } else if (Align & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Align % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Align % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                            : UnitSize == 8 ? &ARM::DPRRegClass
                                            : nullptr;

  // BytesLeft is copied byte-by-byte after the unit-sized portion.
  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Small copy: fully unroll, no loop needed.
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      unsigned srcOut = MRI.createVirtualRegister(TRC);
      unsigned destOut = MRI.createVirtualRegister(TRC);
      unsigned scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI.eraseFromParent(); // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  //   ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Load an immediate to varEnd.
  unsigned varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt(*MF)) {
    // Materialize LoopSize with MOVW (+ MOVT for the high half if needed).
    unsigned Vtmp = varEnd;
    if ((LoopSize & 0xFFFF0000) != 0)
      Vtmp = MRI.createVirtualRegister(TRC);
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
        .addImm(LoopSize & 0xFFFF)
        .add(predOps(ARMCC::AL));

    if ((LoopSize & 0xFFFF0000) != 0)
      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
          .addReg(Vtmp)
          .addImm(LoopSize >> 16)
          .add(predOps(ARMCC::AL));
  } else {
    // No MOVW/MOVT available: load LoopSize from the constant pool.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
    if (Align == 0)
      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addConstantPoolIndex(Idx)
          .addImm(0)
          .add(predOps(ARMCC::AL));
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  unsigned varLoop = MRI.createVirtualRegister(TRC);
  unsigned varPhi = MRI.createVirtualRegister(TRC);
  unsigned srcLoop = MRI.createVirtualRegister(TRC);
  unsigned srcPhi = MRI.createVirtualRegister(TRC);
  unsigned destLoop = MRI.createVirtualRegister(TRC);
  unsigned destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
    .addReg(varLoop).addMBB(loopMBB)
    .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
    .addReg(srcLoop).addMBB(loopMBB)
    .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
    .addReg(destLoop).addMBB(loopMBB)
    .addReg(dest).addMBB(entryBB);

  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL));
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
        .addImm(UnitSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
    // Turn the SUB into SUBS: operand 5 is the optional cc_out added by
    // condCodeOp() above; point it at CPSR so the BNE below can test flags.
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  // Loop back while the counter is non-zero.
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    unsigned srcOut = MRI.createVirtualRegister(TRC);
    unsigned destOut = MRI.createVirtualRegister(TRC);
    unsigned scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}
9179 
/// Lower the WIN__CHKSTK pseudo: emit a call to the Windows stack probe
/// routine __chkstk and then subtract the byte count it returns (in R4)
/// from SP. Only valid for Windows/Thumb-2 targets (asserted below).
MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4.  This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it.  Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required.  As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Direct BL to __chkstk; R4 is both the argument (killed) and the result
    // (redefined). IP (R12) and CPSR are modeled as clobbered-but-dead.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .add(predOps(ARMCC::AL))
        .addExternalSymbol("__chkstk")
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  case CodeModel::Large: {
    // Large code model: materialize the address of __chkstk into a vreg and
    // call indirectly, so the call is not limited by branch range.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
      .addExternalSymbol("__chkstk");
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
        .add(predOps(ARMCC::AL))
        .addReg(Reg, RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
    break;
  }
  }

  // SP -= R4: apply the stack adjustment (in bytes) returned by __chkstk.
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}
9254 
/// Lower the WIN__DBZCHK pseudo: compare the divisor (operand 0) against
/// zero and branch to a trap block that executes __brkdiv0 when it is zero,
/// otherwise fall through to the continuation block.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  // Split MBB after MI: everything following MI moves into ContBB, which
  // inherits MBB's successors; MBB then falls through to ContBB.
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);

  // Trap block: a single __brkdiv0 (no terminator needed; it does not return
  // normally).
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  // if (divisor == 0) goto TrapBB;
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}
9286 
/// Expand pseudo-instructions that require basic-block-level insertion:
/// control-flow diamonds (select, abs, 64-bit compare-and-branch), indexed
/// load/store pseudos, SjLj EH setup, struct-byval copies, and the Windows
/// __chkstk / __dbzchk helpers. Returns the block in which subsequent
/// instructions should be inserted.
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  DebugLoc dl = MI.getDebugLoc();
  bool isThumb2 = Subtarget->isThumb2();
  switch (MI.getOpcode()) {
  default: {
    // Dump the offending instruction before dying to ease debugging.
    MI.print(errs());
    llvm_unreachable("Unexpected instr type to insert");
  }

  // Thumb1 post-indexed loads are really just single-register LDMs.
  case ARM::tLDR_postidx: {
    MachineOperand Def(MI.getOperand(1));
    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
        .add(Def)  // Rn_wb
        .add(MI.getOperand(2))  // Rn
        .add(MI.getOperand(3))  // PredImm
        .add(MI.getOperand(4))  // PredReg
        .add(MI.getOperand(0)); // Rt
    MI.eraseFromParent();
    return BB;
  }

  // The Thumb2 pre-indexed stores have the same MI operands, they just
  // define them differently in the .td files from the isel patterns, so
  // they need pseudos.
  case ARM::t2STR_preidx:
    MI.setDesc(TII->get(ARM::t2STR_PRE));
    return BB;
  case ARM::t2STRB_preidx:
    MI.setDesc(TII->get(ARM::t2STRB_PRE));
    return BB;
  case ARM::t2STRH_preidx:
    MI.setDesc(TII->get(ARM::t2STRH_PRE));
    return BB;

  case ARM::STRi_preidx:
  case ARM::STRBi_preidx: {
    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
                                                         : ARM::STRB_PRE_IMM;
    // Decode the offset.
    unsigned Offset = MI.getOperand(4).getImm();
    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
    Offset = ARM_AM::getAM2Offset(Offset);
    if (isSub)
      Offset = -Offset;

    // Carry the memory operand over so later passes keep alias information.
    MachineMemOperand *MMO = *MI.memoperands_begin();
    BuildMI(*BB, MI, dl, TII->get(NewOpc))
        .add(MI.getOperand(0)) // Rn_wb
        .add(MI.getOperand(1)) // Rt
        .add(MI.getOperand(2)) // Rn
        .addImm(Offset)        // offset (skip GPR==zero_reg)
        .add(MI.getOperand(5)) // pred
        .add(MI.getOperand(6))
        .addMemOperand(MMO);
    MI.eraseFromParent();
    return BB;
  }
  case ARM::STRr_preidx:
  case ARM::STRBr_preidx:
  case ARM::STRH_preidx: {
    unsigned NewOpc;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("unexpected opcode!");
    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
    }
    // Register-offset forms have identical operand lists; copy them verbatim.
    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
    for (unsigned i = 0; i < MI.getNumOperands(); ++i)
      MIB.add(MI.getOperand(i));
    MI.eraseFromParent();
    return BB;
  }

  case ARM::tMOVCCr_pseudo: {
    // To "insert" a SELECT_CC instruction, we actually have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // destination vreg to set, the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = ++BB->getIterator();

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB  = BB;
    MachineFunction *F = BB->getParent();
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    // Conditional branch straight to the sink keeps the true value;
    // falling through copy0MBB selects the false value.
    BuildMI(BB, dl, TII->get(ARM::tBcc))
        .addMBB(sinkMBB)
        .addImm(MI.getOperand(3).getImm())
        .addReg(MI.getOperand(4).getReg());

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
        .addReg(MI.getOperand(1).getReg())
        .addMBB(copy0MBB)
        .addReg(MI.getOperand(2).getReg())
        .addMBB(thisMBB);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  case ARM::BCCi64:
  case ARM::BCCZi64: {
    // If there is an unconditional branch to the other successor, remove it.
    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());

    // Compare both parts that make up the double comparison separately for
    // equality.
    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;

    unsigned LHS1 = MI.getOperand(1).getReg();
    unsigned LHS2 = MI.getOperand(2).getReg();
    if (RHSisZero) {
      // CMP lo, 0; then a second CMP of the hi half predicated on EQ, so the
      // final flags are EQ only if both halves compared equal.
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
          .addReg(LHS1)
          .addImm(0)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(LHS2).addImm(0)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    } else {
      unsigned RHS1 = MI.getOperand(3).getReg();
      unsigned RHS2 = MI.getOperand(4).getReg();
      // Same chained-compare trick with a register RHS.
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
          .addReg(LHS1)
          .addReg(RHS1)
          .add(predOps(ARMCC::AL));
      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
        .addReg(LHS2).addReg(RHS2)
        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
    }

    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
    // For NE we branch to the other successor on EQ instead of inverting the
    // chained compare above.
    if (MI.getOperand(0).getImm() == ARMCC::NE)
      std::swap(destMBB, exitMBB);

    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
    if (isThumb2)
      BuildMI(BB, dl, TII->get(ARM::t2B))
          .addMBB(exitMBB)
          .add(predOps(ARMCC::AL));
    else
      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }

  // The setjmp pseudos are expanded elsewhere; nothing to insert here.
  case ARM::Int_eh_sjlj_setjmp:
  case ARM::Int_eh_sjlj_setjmp_nofp:
  case ARM::tInt_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp:
  case ARM::t2Int_eh_sjlj_setjmp_nofp:
    return BB;

  case ARM::Int_eh_sjlj_setup_dispatch:
    EmitSjLjDispatchBlock(MI, BB);
    return BB;

  case ARM::ABS:
  case ARM::t2ABS: {
    // To insert an ABS instruction, we have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // source vreg to test against 0, the destination vreg to set,
    // the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    // It transforms
    //     V1 = ABS V0
    // into
    //     V2 = MOVS V0
    //     BCC                      (branch to SinkBB if V0 >= 0)
    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
    //     SinkBB: V1 = PHI(V2, V3)
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator BBI = ++BB->getIterator();
    MachineFunction *Fn = BB->getParent();
    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
    Fn->insert(BBI, RSBBB);
    Fn->insert(BBI, SinkBB);

    unsigned int ABSSrcReg = MI.getOperand(1).getReg();
    unsigned int ABSDstReg = MI.getOperand(0).getReg();
    bool ABSSrcKIll = MI.getOperand(1).isKill();
    bool isThumb2 = Subtarget->isThumb2();
    MachineRegisterInfo &MRI = Fn->getRegInfo();
    // In Thumb mode S must not be specified if source register is the SP or
    // PC and if destination register is the SP, so restrict register class
    unsigned NewRsbDstReg =
      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    SinkBB->splice(SinkBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(RSBBB);
    BB->addSuccessor(SinkBB);

    // fall through to SinkMBB
    RSBBB->addSuccessor(SinkBB);

    // insert a cmp at the end of BB
    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
        .addReg(ABSSrcReg)
        .addImm(0)
        .add(predOps(ARMCC::AL));

    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
    BuildMI(BB, dl,
      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

    // insert rsbri in RSBBB
    // Note: BCC and rsbri will be converted into predicated rsbmi
    // by if-conversion pass
    BuildMI(*RSBBB, RSBBB->begin(), dl,
            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
        .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
        .addImm(0)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());

    // insert PHI in SinkBB,
    // reuse ABSDstReg to not change uses of ABS instruction
    BuildMI(*SinkBB, SinkBB->begin(), dl,
      TII->get(ARM::PHI), ABSDstReg)
      .addReg(NewRsbDstReg).addMBB(RSBBB)
      .addReg(ABSSrcReg).addMBB(BB);

    // remove ABS instruction
    MI.eraseFromParent();

    // return last added BB
    return SinkBB;
  }
  case ARM::COPY_STRUCT_BYVAL_I32:
    ++NumLoopByVals;
    return EmitStructByval(MI, BB);
  case ARM::WIN__CHKSTK:
    return EmitLowered__chkstk(MI, BB);
  case ARM::WIN__DBZCHK:
    return EmitLowered__dbzchk(MI, BB);
  }
}
9567 
9568 /// Attaches vregs to MEMCPY that it will use as scratch registers
9569 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
9570 /// instead of as a custom inserter because we need the use list from the SDNode.
9571 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
9572                                     MachineInstr &MI, const SDNode *Node) {
9573   bool isThumb1 = Subtarget->isThumb1Only();
9574 
9575   DebugLoc DL = MI.getDebugLoc();
9576   MachineFunction *MF = MI.getParent()->getParent();
9577   MachineRegisterInfo &MRI = MF->getRegInfo();
9578   MachineInstrBuilder MIB(*MF, MI);
9579 
9580   // If the new dst/src is unused mark it as dead.
9581   if (!Node->hasAnyUseOfValue(0)) {
9582     MI.getOperand(0).setIsDead(true);
9583   }
9584   if (!Node->hasAnyUseOfValue(1)) {
9585     MI.getOperand(1).setIsDead(true);
9586   }
9587 
9588   // The MEMCPY both defines and kills the scratch registers.
9589   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
9590     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
9591                                                          : &ARM::GPRRegClass);
9592     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
9593   }
9594 }
9595 
/// Post-isel hook: attach scratch registers to MEMCPY, and convert the
/// pseudo flag-setting add/sub opcodes (ADCS, SBCS, RSBS, RSCS, ...) into
/// their real forms, moving the implicit CPSR def into the optional cc_out
/// operand when it is live.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                      SDNode *Node) const {
  if (MI.getOpcode() == ARM::MEMCPY) {
    attachMEMCPYScratchRegs(Subtarget, MI, Node);
    return;
  }

  const MCInstrDesc *MCID = &MI.getDesc();
  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  // operand is still set to noreg. If needed, set the optional operand's
  // register to CPSR, and remove the redundant implicit def.
  //
  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).

  // Rename pseudo opcodes.
  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  unsigned ccOutIdx;
  if (NewOpc) {
    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
    MCID = &TII->get(NewOpc);

    assert(MCID->getNumOperands() ==
           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
        && "converted opcode should be the same except for cc_out"
           " (and, on Thumb1, pred)");

    MI.setDesc(*MCID);

    // Add the optional cc_out operand
    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));

    // On Thumb1, move all input operands to the end, then add the predicate
    if (Subtarget->isThumb1Only()) {
      // Rotate the inputs past the defs by repeatedly re-appending operand 1,
      // so the operand order matches the Thumb1 encoding (defs, cc_out,
      // inputs, pred).
      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
        MI.addOperand(MI.getOperand(1));
        MI.RemoveOperand(1);
      }

      // Restore the ties
      // (the rotation above dropped any TIED_TO links, so re-tie each use to
      // the def the new descriptor's constraints point at).
      for (unsigned i = MI.getNumOperands(); i--;) {
        const MachineOperand& op = MI.getOperand(i);
        if (op.isReg() && op.isUse()) {
          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
          if (DefIdx != -1)
            MI.tieOperands(DefIdx, i);
        }
      }

      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
      ccOutIdx = 1;
    } else
      ccOutIdx = MCID->getNumOperands() - 1;
  } else
    ccOutIdx = MCID->getNumOperands() - 1;

  // Any ARM instruction that sets the 's' bit should specify an optional
  // "cc_out" operand in the last operand position.
  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  // since we already have an optional CPSR def.
  bool definesCPSR = false;
  bool deadCPSR = false;
  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
       ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
      definesCPSR = true;
      if (MO.isDead())
        deadCPSR = true;
      MI.RemoveOperand(i);
      break;
    }
  }
  if (!definesCPSR) {
    assert(!NewOpc && "Optional cc_out operand required");
    return;
  }
  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  if (deadCPSR) {
    assert(!MI.getOperand(ccOutIdx).getReg() &&
           "expect uninitialized optional cc_out operand");
    // Thumb1 instructions must have the S bit even if the CPSR is dead.
    if (!Subtarget->isThumb1Only())
      return;
  }

  // If this instruction was defined with an optional CPSR def and its dag node
  // had a live implicit CPSR def, then activate the optional CPSR def.
  MachineOperand &MO = MI.getOperand(ccOutIdx);
  MO.setReg(ARM::CPSR);
  MO.setIsDef(true);
}
9693 
9694 //===----------------------------------------------------------------------===//
9695 //                           ARM Optimization Hooks
9696 //===----------------------------------------------------------------------===//
9697 
9698 // Helper function that checks if N is a null or all ones constant.
9699 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
9700   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
9701 }
9702 
// Return true if N is conditionally 0 or all ones.
// Detects these expressions where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// Invert is set when N is the null/all ones constant when CC is false.
// OtherOp is set to the alternative value of N.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  switch (N->getOpcode()) {
  default: return false;
  case ISD::SELECT: {
    CC = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    // The identity constant may be in either select arm; Invert records
    // which one (true => N is the constant when CC is false).
    if (isZeroOrAllOnes(N1, AllOnes)) {
      Invert = false;
      OtherOp = N2;
      return true;
    }
    if (isZeroOrAllOnes(N2, AllOnes)) {
      Invert = true;
      OtherOp = N1;
      return true;
    }
    return false;
  }
  case ISD::ZERO_EXTEND:
    // (zext cc) can never be the all ones value.
    if (AllOnes)
      return false;
    LLVM_FALLTHROUGH;
  case ISD::SIGN_EXTEND: {
    SDLoc dl(N);
    EVT VT = N->getValueType(0);
    CC = N->getOperand(0);
    // Only accept a SETCC of i1 as the condition being extended.
    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
      return false;
    // ext(cc) is the zero constant when cc is false, so when searching for
    // a zero (AllOnes==false) the match is on the false side: Invert=true.
    Invert = !AllOnes;
    if (AllOnes)
      // When looking for an AllOnes constant, N is an sext, and the 'other'
      // value is 0.
      OtherOp = DAG.getConstant(0, dl, VT);
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      // When looking for a 0 constant, N can be zext or sext.
      OtherOp = DAG.getConstant(1, dl, VT);
    else
      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                                VT);
    return true;
  }
  }
}
9763 
9764 // Combine a constant select operand into its use:
9765 //
9766 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
9767 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
9768 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
9769 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
9770 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
9771 //
9772 // The transform is rejected if the select doesn't have a constant operand that
9773 // is null, or all ones when AllOnes is set.
9774 //
9775 // Also recognize sext/zext from i1:
9776 //
9777 //   (add (zext cc), x) -> (select cc (add x, 1), x)
9778 //   (add (sext cc), x) -> (select cc (add x, -1), x)
9779 //
9780 // These transformations eventually create predicated instructions.
9781 //
9782 // @param N       The node to transform.
9783 // @param Slct    The N operand that is a select.
9784 // @param OtherOp The other N operand (x above).
9785 // @param DCI     Context.
9786 // @param AllOnes Require the select constant to be all ones instead of null.
9787 // @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue NonConstantVal;
  SDValue CCOp;
  bool SwapSelectOps;
  // Bail out unless Slct really is a select/ext of an i1 with the required
  // identity constant (0, or -1 when AllOnes).
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
                                  NonConstantVal, DAG))
    return SDValue();

  // Slct is now known to be the desired identity constant when CC is true.
  // In that arm the operation is a no-op, so the result is just OtherOp;
  // in the other arm we apply N's opcode to OtherOp and the non-constant.
  SDValue TrueVal = OtherOp;
  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                 OtherOp, NonConstantVal);
  // Unless SwapSelectOps says CC should be false.
  if (SwapSelectOps)
    std::swap(TrueVal, FalseVal);

  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                     CCOp, TrueVal, FalseVal);
}
9812 
9813 // Attempt combineSelectAndUse on each operand of a commutative operator N.
9814 static
9815 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
9816                                        TargetLowering::DAGCombinerInfo &DCI) {
9817   SDValue N0 = N->getOperand(0);
9818   SDValue N1 = N->getOperand(1);
9819   if (N0.getNode()->hasOneUse())
9820     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
9821       return Result;
9822   if (N1.getNode()->hasOneUse())
9823     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
9824       return Result;
9825   return SDValue();
9826 }
9827 
9828 static bool IsVUZPShuffleNode(SDNode *N) {
9829   // VUZP shuffle node.
9830   if (N->getOpcode() == ARMISD::VUZP)
9831     return true;
9832 
9833   // "VUZP" on i32 is an alias for VTRN.
9834   if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
9835     return true;
9836 
9837   return false;
9838 }
9839 
9840 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
9841                                  TargetLowering::DAGCombinerInfo &DCI,
9842                                  const ARMSubtarget *Subtarget) {
9843   // Look for ADD(VUZP.0, VUZP.1).
9844   if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
9845       N0 == N1)
9846    return SDValue();
9847 
9848   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
9849   if (!N->getValueType(0).is64BitVector())
9850     return SDValue();
9851 
9852   // Generate vpadd.
9853   SelectionDAG &DAG = DCI.DAG;
9854   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9855   SDLoc dl(N);
9856   SDNode *Unzip = N0.getNode();
9857   EVT VT = N->getValueType(0);
9858 
9859   SmallVector<SDValue, 8> Ops;
9860   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
9861                                 TLI.getPointerTy(DAG.getDataLayout())));
9862   Ops.push_back(Unzip->getOperand(0));
9863   Ops.push_back(Unzip->getOperand(1));
9864 
9865   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
9866 }
9867 
// Fold ADD(ext(VUZP.0), ext(VUZP.1)) — both operands sign- or both
// zero-extended halves of the same unzip — into a single vpaddl.sN/.uN
// on the concatenation of the unzipped inputs.
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  // (they must both be the same kind of extension; mixed sext/zext cannot
  // map onto a single vpaddl).
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  // (same VUZP node, but different result numbers).
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  // Feed vpaddl the concatenation of the two vectors being unzipped;
  // pairwise-adding that is equivalent to adding the unzipped halves.
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
9916 
9917 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
9918 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
9919 // much easier to match.
9920 static SDValue
9921 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
9922                                TargetLowering::DAGCombinerInfo &DCI,
9923                                const ARMSubtarget *Subtarget) {
9924   // Only perform optimization if after legalize, and if NEON is available. We
9925   // also expected both operands to be BUILD_VECTORs.
9926   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
9927       || N0.getOpcode() != ISD::BUILD_VECTOR
9928       || N1.getOpcode() != ISD::BUILD_VECTOR)
9929     return SDValue();
9930 
9931   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
9932   EVT VT = N->getValueType(0);
9933   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
9934     return SDValue();
9935 
9936   // Check that the vector operands are of the right form.
9937   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
9938   // operands, where N is the size of the formed vector.
9939   // Each EXTRACT_VECTOR should have the same input vector and odd or even
9940   // index such that we have a pair wise add pattern.
9941 
9942   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
9943   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9944     return SDValue();
9945   SDValue Vec = N0->getOperand(0)->getOperand(0);
9946   SDNode *V = Vec.getNode();
9947   unsigned nextIndex = 0;
9948 
9949   // For each operands to the ADD which are BUILD_VECTORs,
9950   // check to see if each of their operands are an EXTRACT_VECTOR with
9951   // the same vector and appropriate index.
9952   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
9953     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
9954         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9955 
9956       SDValue ExtVec0 = N0->getOperand(i);
9957       SDValue ExtVec1 = N1->getOperand(i);
9958 
9959       // First operand is the vector, verify its the same.
9960       if (V != ExtVec0->getOperand(0).getNode() ||
9961           V != ExtVec1->getOperand(0).getNode())
9962         return SDValue();
9963 
9964       // Second is the constant, verify its correct.
9965       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
9966       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
9967 
9968       // For the constant, we want to see all the even or all the odd.
9969       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
9970           || C1->getZExtValue() != nextIndex+1)
9971         return SDValue();
9972 
9973       // Increment index.
9974       nextIndex+=2;
9975     } else
9976       return SDValue();
9977   }
9978 
9979   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
9980   // we're using the entire input vector, otherwise there's a size/legality
9981   // mismatch somewhere.
9982   if (nextIndex != Vec.getValueType().getVectorNumElements() ||
9983       Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
9984     return SDValue();
9985 
9986   // Create VPADDL node.
9987   SelectionDAG &DAG = DCI.DAG;
9988   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9989 
9990   SDLoc dl(N);
9991 
9992   // Build operand list.
9993   SmallVector<SDValue, 8> Ops;
9994   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
9995                                 TLI.getPointerTy(DAG.getDataLayout())));
9996 
9997   // Input is the vector.
9998   Ops.push_back(Vec);
9999 
10000   // Get widened type and narrowed type.
10001   MVT widenType;
10002   unsigned numElem = VT.getVectorNumElements();
10003 
10004   EVT inputLaneType = Vec.getValueType().getVectorElementType();
10005   switch (inputLaneType.getSimpleVT().SimpleTy) {
10006     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
10007     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
10008     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
10009     default:
10010       llvm_unreachable("Invalid vector element type for padd optimization.");
10011   }
10012 
10013   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
10014   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
10015   return DAG.getNode(ExtOp, dl, VT, tmp);
10016 }
10017 
10018 static SDValue findMUL_LOHI(SDValue V) {
10019   if (V->getOpcode() == ISD::UMUL_LOHI ||
10020       V->getOpcode() == ISD::SMUL_LOHI)
10021     return V;
10022   return SDValue();
10023 }
10024 
/// Try to fold an ADDC/ADDE pair that accumulates a 32-bit product of two
/// 16-bit values into one of the 64-bit multiply-accumulate instructions
/// SMLALBB/SMLALBT/SMLALTB/SMLALTT. Returns the original ADDC (to stop the
/// combiner) on success, or an empty SDValue if the pattern doesn't match.
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  // SMLALxy needs the DSP extension in Thumb mode, or ARMv5TE+ in ARM mode.
  if (Subtarget->isThumb()) {
    if (!Subtarget->hasDSP())
      return SDValue();
  } else if (!Subtarget->hasV5TEOps())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulate the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values
  // (addc (adde (mul 16bit, 16bit), lo), hi)
  // Locate the MUL among the ADDC operands; it may appear on either side.
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  // The high-half accumulation must add the sign bits of the product, i.e.
  // an SRA of the MUL; again, it may be either ADDE operand.
  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  // Only an arithmetic shift by 31 replicates the product's sign bit into
  // the entire high word.
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  // The SRA must be shifting the very same MUL that feeds the low half.
  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  // Pick the SMLAL variant (B = bottom half, T = top half) based on how each
  // multiply operand carries its 16-bit value. isS16/isSRA16 are file-local
  // helpers; presumably isS16 recognizes a value sign-extended from 16 bits
  // and isSRA16 an sra-by-16 (top half) — confirm against their definitions.
  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  // No variant matched (Opcode/Op0/Op1 were left unset).
  if (!Op0 || !Op1)
    return SDValue();

  // SMLALxy produces {Lo, Hi} as two i32 results.
  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}
10104 
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //                  UMUL_LOHI
  //                 / :lo    \ :hi
  //                V          \          [no multiline comment]
  //    loAdd ->  ADDC         |
  //                 \ :carry /
  //                  V      V
  //                    ADDE   <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
  // a constant with the exact value of 0x80000000 (rounding), we recognize we
  // are dealing with a "rounded multiply and add" (or subtract) and transform
  // it into either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  // Operand 2 is the incoming carry/borrow.
  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node (ADDE must be
  // paired with ADDC, SUBE with SUBC).
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // maybe a SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode: signed multiply -> SMLAL, else UMLAL.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI (result 1).
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  // The non-MUL operand of the ADDE/SUBE is the high-word addend.
  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
  // whose low result is fed to the ADDC/SUBC we are checking.

  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL, furthermore the LowAddSub must be a constant
  // addition or subtraction with the value of 0x80000000 (the rounding bias).
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function can not handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}
10270 
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.

  // UMAAL requires ARMv6 with the DSP extension; otherwise fall back to
  // the generic MLAL combine.
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist. The UMLAL may be
  // either operand of the ADDC; the other operand is the extra high addend.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero. The UMLAL's own high addend (operand 3) must
  // be zero for the fold to be valid.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  // The ADDE must add the UMLAL's high result to zero (either operand order).
  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {
    SelectionDAG &DAG = DCI.DAG;
    // UMAAL(a, b, lo, hi) = a * b + lo + hi; reuse the UMLAL's multiplicands
    // and low addend, and fold in the extra high addend.
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADDs' nodes uses by the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}
10325 
10326 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
10327                                    const ARMSubtarget *Subtarget) {
10328   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
10329     return SDValue();
10330 
10331   // Check that we have a pair of ADDC and ADDE as operands.
10332   // Both addends of the ADDE must be zero.
10333   SDNode* AddcNode = N->getOperand(2).getNode();
10334   SDNode* AddeNode = N->getOperand(3).getNode();
10335   if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
10336       (AddeNode->getOpcode() == ARMISD::ADDE) &&
10337       isNullConstant(AddeNode->getOperand(0)) &&
10338       isNullConstant(AddeNode->getOperand(1)) &&
10339       (AddeNode->getOperand(2).getNode() == AddcNode))
10340     return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
10341                        DAG.getVTList(MVT::i32, MVT::i32),
10342                        {N->getOperand(0), N->getOperand(1),
10343                         AddcNode->getOperand(0), AddcNode->getOperand(1)});
10344   else
10345     return SDValue();
10346 }
10347 
10348 static SDValue PerformAddcSubcCombine(SDNode *N,
10349                                       TargetLowering::DAGCombinerInfo &DCI,
10350                                       const ARMSubtarget *Subtarget) {
10351   SelectionDAG &DAG(DCI.DAG);
10352 
10353   if (N->getOpcode() == ARMISD::SUBC) {
10354     // (SUBC (ADDE 0, 0, C), 1) -> C
10355     SDValue LHS = N->getOperand(0);
10356     SDValue RHS = N->getOperand(1);
10357     if (LHS->getOpcode() == ARMISD::ADDE &&
10358         isNullConstant(LHS->getOperand(0)) &&
10359         isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
10360       return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
10361     }
10362   }
10363 
10364   if (Subtarget->isThumb1Only()) {
10365     SDValue RHS = N->getOperand(1);
10366     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
10367       int32_t imm = C->getSExtValue();
10368       if (imm < 0 && imm > std::numeric_limits<int>::min()) {
10369         SDLoc DL(N);
10370         RHS = DAG.getConstant(-imm, DL, MVT::i32);
10371         unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
10372                                                            : ARMISD::ADDC;
10373         return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
10374       }
10375     }
10376   }
10377 
10378   return SDValue();
10379 }
10380 
10381 static SDValue PerformAddeSubeCombine(SDNode *N,
10382                                       TargetLowering::DAGCombinerInfo &DCI,
10383                                       const ARMSubtarget *Subtarget) {
10384   if (Subtarget->isThumb1Only()) {
10385     SelectionDAG &DAG = DCI.DAG;
10386     SDValue RHS = N->getOperand(1);
10387     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
10388       int64_t imm = C->getSExtValue();
10389       if (imm < 0) {
10390         SDLoc DL(N);
10391 
10392         // The with-carry-in form matches bitwise not instead of the negation.
10393         // Effectively, the inverse interpretation of the carry flag already
10394         // accounts for part of the negation.
10395         RHS = DAG.getConstant(~imm, DL, MVT::i32);
10396 
10397         unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
10398                                                            : ARMISD::ADDE;
10399         return DAG.getNode(Opcode, DL, N->getVTList(),
10400                            N->getOperand(0), RHS, N->getOperand(2));
10401       }
10402     }
10403   } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
10404     return AddCombineTo64bitMLAL(N, DCI, Subtarget);
10405   }
10406   return SDValue();
10407 }
10408 
10409 /// PerformADDECombine - Target-specific dag combine transform from
10410 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
10411 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
10412 static SDValue PerformADDECombine(SDNode *N,
10413                                   TargetLowering::DAGCombinerInfo &DCI,
10414                                   const ARMSubtarget *Subtarget) {
10415   // Only ARM and Thumb2 support UMLAL/SMLAL.
10416   if (Subtarget->isThumb1Only())
10417     return PerformAddeSubeCombine(N, DCI, Subtarget);
10418 
10419   // Only perform the checks after legalize when the pattern is available.
10420   if (DCI.isBeforeLegalize()) return SDValue();
10421 
10422   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
10423 }
10424 
10425 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
10426 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
10427 /// called with the default operands, and if that fails, with commuted
10428 /// operands.
10429 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
10430                                           TargetLowering::DAGCombinerInfo &DCI,
10431                                           const ARMSubtarget *Subtarget){
10432   // Attempt to create vpadd for this add.
10433   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
10434     return Result;
10435 
10436   // Attempt to create vpaddl for this add.
10437   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
10438     return Result;
10439   if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
10440                                                       Subtarget))
10441     return Result;
10442 
10443   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
10444   if (N0.getNode()->hasOneUse())
10445     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
10446       return Result;
10447   return SDValue();
10448 }
10449 
10450 bool
10451 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
10452                                                  CombineLevel Level) const {
10453   if (Level == BeforeLegalizeTypes)
10454     return true;
10455 
10456   if (Subtarget->isThumb() && Subtarget->isThumb1Only())
10457     return true;
10458 
10459   if (N->getOpcode() != ISD::SHL)
10460     return true;
10461 
10462   // Turn off commute-with-shift transform after legalization, so it doesn't
10463   // conflict with PerformSHLSimplify.  (We could try to detect when
10464   // PerformSHLSimplify would trigger more precisely, but it isn't
10465   // really necessary.)
10466   return false;
10467 }
10468 
/// Re-fold (binop (shl x, c2), c1<<c2) back into (shl (binop x, c1), c2)
/// when both constants fit ARM's rotated 8-bit immediate encoding and every
/// user of the node can fold the shift into its shifted-operand form, so the
/// shift itself becomes free.
static SDValue PerformSHLSimplify(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
  // Other code patterns that can be also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can  perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will needed. So, unfold back to the original pattern if:
  // - if c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  // Only the commutative-style binops listed in the folded patterns above
  // are handled.
  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  // C1ShlC2 is the already-shifted constant (c1 << c2); C2 is the shift
  // amount.
  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information: all set bits
  // of c1<<c2 must lie above the low c2 bits.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant back down to recover c1.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  // Approximate that check: more than 8 significant bits between the first
  // and last set bit cannot be encoded.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  // Emit (shl (binop x, c1), c2).
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
10570 
10571 
10572 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
10573 ///
10574 static SDValue PerformADDCombine(SDNode *N,
10575                                  TargetLowering::DAGCombinerInfo &DCI,
10576                                  const ARMSubtarget *Subtarget) {
10577   SDValue N0 = N->getOperand(0);
10578   SDValue N1 = N->getOperand(1);
10579 
10580   // Only works one way, because it needs an immediate operand.
10581   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
10582     return Result;
10583 
10584   // First try with the default operand order.
10585   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
10586     return Result;
10587 
10588   // If that didn't work, try again with the operands commuted.
10589   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
10590 }
10591 
10592 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
10593 ///
10594 static SDValue PerformSUBCombine(SDNode *N,
10595                                  TargetLowering::DAGCombinerInfo &DCI) {
10596   SDValue N0 = N->getOperand(0);
10597   SDValue N1 = N->getOperand(1);
10598 
10599   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
10600   if (N1.getNode()->hasOneUse())
10601     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
10602       return Result;
10603 
10604   return SDValue();
10605 }
10606 
10607 /// PerformVMULCombine
10608 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
10609 /// special multiplier accumulator forwarding.
10610 ///   vmul d3, d0, d2
10611 ///   vmla d3, d1, d2
10612 /// is faster than
10613 ///   vadd d3, d0, d1
10614 ///   vmul d3, d3, d2
10615 //  However, for (A + B) * (A + B),
10616 //    vadd d2, d0, d1
10617 //    vmul d3, d0, d2
10618 //    vmla d3, d1, d2
10619 //  is slower than
10620 //    vadd d2, d0, d1
10621 //    vmul d3, d2, d2
10622 static SDValue PerformVMULCombine(SDNode *N,
10623                                   TargetLowering::DAGCombinerInfo &DCI,
10624                                   const ARMSubtarget *Subtarget) {
10625   if (!Subtarget->hasVMLxForwarding())
10626     return SDValue();
10627 
10628   SelectionDAG &DAG = DCI.DAG;
10629   SDValue N0 = N->getOperand(0);
10630   SDValue N1 = N->getOperand(1);
10631   unsigned Opcode = N0.getOpcode();
10632   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
10633       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
10634     Opcode = N1.getOpcode();
10635     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
10636         Opcode != ISD::FADD && Opcode != ISD::FSUB)
10637       return SDValue();
10638     std::swap(N0, N1);
10639   }
10640 
10641   if (N0 == N1)
10642     return SDValue();
10643 
10644   EVT VT = N->getValueType(0);
10645   SDLoc DL(N);
10646   SDValue N00 = N0->getOperand(0);
10647   SDValue N01 = N0->getOperand(1);
10648   return DAG.getNode(Opcode, DL, VT,
10649                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
10650                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
10651 }
10652 
/// Target-specific combine for ISD::MUL: rewrite multiplication by a
/// constant of the form +/-(2^N +/- 1) times a power of two into
/// shift/add/sub sequences, which are cheaper than a mul on ARM.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  // Thumb1 lacks the shifted-operand forms this transform relies on.
  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  // Vector multiplies get the VMUL distribution combine instead; scalar
  // handling below is i32 only.
  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  // Factor out the power-of-two part of the multiplier; it is applied as a
  // final left shift after the add/sub rewrite.
  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      // Negate via (sub 0, Res).
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  // Re-apply the power-of-two factor that was divided out above.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
10735 
// CombineANDShift - Rewrite "(and (shl x, c2), c1)" / "(and (srl x, c2), c1)"
// as a pair of plain shifts when the mask c1 permits it, so the constant c1
// never needs to be materialized in a register. Only invoked for Thumb1-only
// subtargets (see PerformANDCombine).
static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  // Only scalar i32 is handled.
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  // The AND mask (c1) must be a constant.
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  // The shift feeding the AND must be single-use, otherwise we'd keep both
  // the old and the new node chains alive.
  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  // The inner shift amount (c2) must be a constant in (0, 32).
  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, and c1+1 is a power of two.
  // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
  // of two).
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = countLeadingZeros(C1);
    // The combined shifts only cover the original SRL when c2 < c3;
    // result: clear the top c3 bits of x via "(srl (shl x, c3-c2), c3)".
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
  // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
  // is a power of two).
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t C3 = countLeadingZeros(C1);
    // c1 must be exactly the bits kept by "(srl (shl x, c2+c3), c3)";
    // if so, the AND folds into the widened shift pair.
    if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                        DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
  // c1.
  return SDValue();
}
10812 
10813 static SDValue PerformANDCombine(SDNode *N,
10814                                  TargetLowering::DAGCombinerInfo &DCI,
10815                                  const ARMSubtarget *Subtarget) {
10816   // Attempt to use immediate-form VBIC
10817   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
10818   SDLoc dl(N);
10819   EVT VT = N->getValueType(0);
10820   SelectionDAG &DAG = DCI.DAG;
10821 
10822   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
10823     return SDValue();
10824 
10825   APInt SplatBits, SplatUndef;
10826   unsigned SplatBitSize;
10827   bool HasAnyUndefs;
10828   if (BVN &&
10829       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
10830     if (SplatBitSize <= 64) {
10831       EVT VbicVT;
10832       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
10833                                       SplatUndef.getZExtValue(), SplatBitSize,
10834                                       DAG, dl, VbicVT, VT.is128BitVector(),
10835                                       OtherModImm);
10836       if (Val.getNode()) {
10837         SDValue Input =
10838           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
10839         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
10840         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
10841       }
10842     }
10843   }
10844 
10845   if (!Subtarget->isThumb1Only()) {
10846     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
10847     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
10848       return Result;
10849 
10850     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
10851       return Result;
10852   }
10853 
10854   if (Subtarget->isThumb1Only())
10855     if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
10856       return Result;
10857 
10858   return SDValue();
10859 }
10860 
// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  // SMULW[B|T] require ARMv6+ with DSP (Thumb needs Thumb2 + DSP).
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // The OR is commutative; try both operand orders.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  // The SRL must consume the low half (result 0) and the SHL the high half
  // (result 1) of the multiply.
  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  // If operand 0 isn't the 16-bit value, try the operands swapped.
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    // SMULWT multiplies by the top half, so the explicit SRA is folded away.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  // Return the original node so the combiner knows it is now dead.
  return SDValue(OR, 0);
}
10921 
/// PerformORCombineToBFI - Try to turn an OR of masked values into an
/// ARMISD::BFI (bitfield insert). Callers guarantee operand 0 is an AND.
static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The inserted value must lie entirely in the bits cleared by the AND.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // BFI takes the field value right-justified; shift it down to bit 0.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}
11047 
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VORR
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VorrVT;
      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                      OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    // NOTE(review): this early return also skips the BFI/SHL combines below
    // for vector ORs whose LHS is not a single-use AND.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;

    APInt SplatBits0, SplatBits1;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
    // Ensure that the second operand of both ands are constants
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) && !HasAnyUndefs) {
        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                          HasAnyUndefs) && !HasAnyUndefs) {
            // Ensure that the bit width of the constants are the same and that
            // the splat arguments are logical inverses as per the pattern we
            // are trying to simplify.
            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
                SplatBits0 == ~SplatBits1) {
                // Canonicalize the vector type to make instruction selection
                // simpler.
                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                             N0->getOperand(1),
                                             N0->getOperand(0),
                                             N1->getOperand(0));
                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
            }
        }
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;
  }

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}
11144 
11145 static SDValue PerformXORCombine(SDNode *N,
11146                                  TargetLowering::DAGCombinerInfo &DCI,
11147                                  const ARMSubtarget *Subtarget) {
11148   EVT VT = N->getValueType(0);
11149   SelectionDAG &DAG = DCI.DAG;
11150 
11151   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11152     return SDValue();
11153 
11154   if (!Subtarget->isThumb1Only()) {
11155     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
11156     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
11157       return Result;
11158 
11159     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
11160       return Result;
11161   }
11162 
11163   return SDValue();
11164 }
11165 
11166 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
11167 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
11168 // their position in "to" (Rd).
11169 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
11170   assert(N->getOpcode() == ARMISD::BFI);
11171 
11172   SDValue From = N->getOperand(1);
11173   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
11174   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
11175 
11176   // If the Base came from a SHR #C, we can deduce that it is really testing bit
11177   // #C in the base of the SHR.
11178   if (From->getOpcode() == ISD::SRL &&
11179       isa<ConstantSDNode>(From->getOperand(1))) {
11180     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
11181     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
11182     FromMask <<= Shift.getLimitedValue(31);
11183     From = From->getOperand(0);
11184   }
11185 
11186   return From;
11187 }
11188 
11189 // If A and B contain one contiguous set of bits, does A | B == A . B?
11190 //
11191 // Neither A nor B must be zero.
11192 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
11193   unsigned LastActiveBitInA =  A.countTrailingZeros();
11194   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
11195   return LastActiveBitInA - 1 == FirstActiveBitInB;
11196 }
11197 
/// FindBFIToCombineWith - We have a BFI in N. Follow a possible chain of BFIs
/// feeding its destination operand and return one that inserts contiguous
/// bits from the same base value, so the two can be merged into a single BFI.
/// Returns an empty SDValue if no safe candidate exists.
static SDValue FindBFIToCombineWith(SDNode *N) {
  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
  // if one exists.
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);
  SDValue To = N->getOperand(0);

  // Now check for a compatible BFI to merge with. We can pass through BFIs that
  // aren't compatible, but not if they set the same bit in their destination as
  // we do (or that of any BFI we're going to combine with).
  SDValue V = To;
  // CombinedToMask accumulates every destination bit written by N and by the
  // BFIs we have skipped over so far.
  APInt CombinedToMask = ToMask;
  while (V.getOpcode() == ARMISD::BFI) {
    APInt NewToMask, NewFromMask;
    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
    if (NewFrom != From) {
      // This BFI has a different base. Keep going.
      CombinedToMask |= NewToMask;
      V = V.getOperand(0);
      continue;
    }

    // Do the written bits conflict with any we've seen so far?
    if ((NewToMask & CombinedToMask).getBoolValue())
      // Conflicting bits - bail out because going further is unsafe.
      return SDValue();

    // Are the new bits contiguous when combined with the old bits?
    // (Both the destination masks and the source masks must concatenate,
    // in either order.)
    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
        BitsProperlyConcatenate(FromMask, NewFromMask))
      return V;
    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
        BitsProperlyConcatenate(NewFromMask, FromMask))
      return V;

    // We've seen a write to some bits, so track it.
    CombinedToMask |= NewToMask;
    // Keep going...
    V = V.getOperand(0);
  }

  return SDValue();
}
11241 
/// PerformBFICombine - Target-specific dag combine xforms for ARMISD::BFI.
static SDValue PerformBFICombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() == ISD::AND) {
    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
    // the bits being cleared by the AND are not demanded by the BFI.
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    // Operand 2 is the inverted mask of the bits the BFI writes; recover the
    // position (LSB) and width of the inserted field from it.
    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned LSB = countTrailingZeros(~InvMask);
    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
    assert(Width <
               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
           "undefined behavior");
    // Mask of the field bits the BFI actually reads from its insert operand.
    unsigned Mask = (1u << Width) - 1;
    unsigned Mask2 = N11C->getZExtValue();
    // If the AND preserves every bit the BFI demands, drop the AND.
    if ((Mask & (~Mask2)) == 0)
      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                             N->getOperand(0), N1.getOperand(0),
                             N->getOperand(2));
  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
    // Keep track of any consecutive bits set that all come from the same base
    // value. We can combine these together into a single BFI.
    SDValue CombineBFI = FindBFIToCombineWith(N);
    if (CombineBFI == SDValue())
      return SDValue();

    // We've found a BFI.
    APInt ToMask1, FromMask1;
    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);

    APInt ToMask2, FromMask2;
    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
    assert(From1 == From2);
    (void)From2;

    // First, unlink CombineBFI.
    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
    // Then create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    // If the merged source field does not start at bit 0, shift the base
    // right so the inserted bits are right-justified as BFI expects.
    if (NewFromMask[0] == 0)
      From1 = DCI.DAG.getNode(
        ISD::SRL, dl, VT, From1,
        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
                           DCI.DAG.getConstant(~NewToMask, dl, VT));
  }
  return SDValue();
}
11298 
/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    // First half: i32 load at the original address/alignment.
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlignment(), LD->getMemOperand()->getFlags());

    // Second half: i32 load at offset +4; its alignment is capped at 4 and
    // derived from the original (offset) alignment.
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    SDValue NewLD2 = DAG.getLoad(
        MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
        std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());

    // Re-route the original load's chain users through the second load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    // On big-endian targets the high word comes first in memory.
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap (NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  return SDValue();
}
11340 
11341 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
11342 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
11343 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
11344   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
11345   SDValue Op0 = N->getOperand(0);
11346   SDValue Op1 = N->getOperand(1);
11347   if (Op0.getOpcode() == ISD::BITCAST)
11348     Op0 = Op0.getOperand(0);
11349   if (Op1.getOpcode() == ISD::BITCAST)
11350     Op1 = Op1.getOperand(0);
11351   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
11352       Op0.getNode() == Op1.getNode() &&
11353       Op0.getResNo() == 0 && Op1.getResNo() == 1)
11354     return DAG.getNode(ISD::BITCAST, SDLoc(N),
11355                        N->getValueType(0), Op0.getOperand(0));
11356   return SDValue();
11357 }
11358 
11359 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
11360 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
11361 /// i64 vector to have f64 elements, since the value can then be loaded
11362 /// directly into a VFP register.
11363 static bool hasNormalLoadOperand(SDNode *N) {
11364   unsigned NumElts = N->getValueType(0).getVectorNumElements();
11365   for (unsigned i = 0; i < NumElts; ++i) {
11366     SDNode *Elt = N->getOperand(i).getNode();
11367     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
11368       return true;
11369   }
11370   return false;
11371 }
11372 
/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
  // into a pair of GPRs, which is fine when the value is used as a scalar,
  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  SelectionDAG &DAG = DCI.DAG;
  if (N->getNumOperands() == 2)
    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
      return RV;

  // Load i64 elements as f64 values so that type legalization does not split
  // them up into i32 values.
  EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
    return SDValue();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  unsigned NumElts = VT.getVectorNumElements();
  // Bitcast each i64 element to f64 and build an f64 vector instead.
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
    Ops.push_back(V);
    // Make the DAGCombiner fold the bitcast.
    DCI.AddToWorklist(V.getNode());
  }
  // Finally bitcast the f64 vector back to the original i64 vector type.
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
11405 
/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
///
/// Rewrites an f32 BUILD_VECTOR whose only use is a bitcast to an integer
/// type into a chain of i32 INSERT_VECTOR_ELTs, removing the int->float->int
/// round trips introduced during legalization.  Returns the replacement node,
/// or an empty SDValue when the transformation does not apply or is not
/// profitable.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  // At that time, we may have inserted bitcasts from integer to float.
  // If these bitcasts have survived DAGCombine, change the lowering of this
  // BUILD_VECTOR in something more vector friendly, i.e., that does not
  // force to use floating point types.

  // Make sure we can change the type of the vector.
  // This is possible iff:
  // 1. The vector is only used in a bitcast to a integer type. I.e.,
  //    1.1. Vector is used only once.
  //    1.2. Use is a bit convert to an integer type.
  // 2. The size of its operands are 32-bits (64-bits are not legal).
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Check 1.1. and 2.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // Check 1.2.
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() != ISD::BITCAST ||
      Use->getValueType(0).isFloatingPoint())
    return SDValue();

  // Check profitability.
  // Model is, if more than half of the relevant operands are bitcast from
  // i32, turn the build_vector into a sequence of insert_vector_elt.
  // Relevant operands are everything that is not statically
  // (i.e., at compile time) bitcasted.
  unsigned NumOfBitCastedElts = 0;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumOfRelevantElts = NumElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only bit cast to i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumOfBitCastedElts;
    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      // Constants are statically casted, thus do not count them as
      // relevant operands.
      --NumOfRelevantElts;
  }

  // Check if more than half of the elements require a non-free bitcast.
  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  // Create the new vector type.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  // Check if the type is legal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //                      (BITCAST EN), N.
  SDValue Vec = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
    SDValue V = N->getOperand(Idx);
    // Undef lanes can be left as undef in the accumulated vector.
    if (V.isUndef())
      continue;
    if (V.getOpcode() == ISD::BITCAST &&
        V->getOperand(0).getValueType() == MVT::i32)
      // Fold obvious case.
      V = V.getOperand(0);
    else {
      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
      // Make the DAGCombiner fold the bitcasts.
      DCI.AddToWorklist(V.getNode());
    }
    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  }
  // Cast the i32 vector back to the original (float) type expected by the
  // single bitcast user.
  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  return Vec;
}
11497 
11498 /// PerformInsertEltCombine - Target-specific dag combine xforms for
11499 /// ISD::INSERT_VECTOR_ELT.
11500 static SDValue PerformInsertEltCombine(SDNode *N,
11501                                        TargetLowering::DAGCombinerInfo &DCI) {
11502   // Bitcast an i64 load inserted into a vector to f64.
11503   // Otherwise, the i64 value will be legalized to a pair of i32 values.
11504   EVT VT = N->getValueType(0);
11505   SDNode *Elt = N->getOperand(1).getNode();
11506   if (VT.getVectorElementType() != MVT::i64 ||
11507       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
11508     return SDValue();
11509 
11510   SelectionDAG &DAG = DCI.DAG;
11511   SDLoc dl(N);
11512   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
11513                                  VT.getVectorNumElements());
11514   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
11515   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
11516   // Make the DAGCombiner fold the bitcasts.
11517   DCI.AddToWorklist(Vec.getNode());
11518   DCI.AddToWorklist(V.getNode());
11519   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
11520                                Vec, V, N->getOperand(2));
11521   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
11522 }
11523 
11524 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
11525 /// ISD::VECTOR_SHUFFLE.
11526 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
11527   // The LLVM shufflevector instruction does not require the shuffle mask
11528   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
11529   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
11530   // operands do not match the mask length, they are extended by concatenating
11531   // them with undef vectors.  That is probably the right thing for other
11532   // targets, but for NEON it is better to concatenate two double-register
11533   // size vector operands into a single quad-register size vector.  Do that
11534   // transformation here:
11535   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
11536   //   shuffle(concat(v1, v2), undef)
11537   SDValue Op0 = N->getOperand(0);
11538   SDValue Op1 = N->getOperand(1);
11539   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
11540       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
11541       Op0.getNumOperands() != 2 ||
11542       Op1.getNumOperands() != 2)
11543     return SDValue();
11544   SDValue Concat0Op1 = Op0.getOperand(1);
11545   SDValue Concat1Op1 = Op1.getOperand(1);
11546   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
11547     return SDValue();
11548   // Skip the transformation if any of the types are illegal.
11549   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11550   EVT VT = N->getValueType(0);
11551   if (!TLI.isTypeLegal(VT) ||
11552       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
11553       !TLI.isTypeLegal(Concat1Op1.getValueType()))
11554     return SDValue();
11555 
11556   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
11557                                   Op0.getOperand(0), Op1.getOperand(0));
11558   // Translate the shuffle mask.
11559   SmallVector<int, 16> NewMask;
11560   unsigned NumElts = VT.getVectorNumElements();
11561   unsigned HalfElts = NumElts/2;
11562   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
11563   for (unsigned n = 0; n < NumElts; ++n) {
11564     int MaskElt = SVN->getMaskElt(n);
11565     int NewElt = -1;
11566     if (MaskElt < (int)HalfElts)
11567       NewElt = MaskElt;
11568     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
11569       NewElt = HalfElts + MaskElt - NumElts;
11570     NewMask.push_back(NewElt);
11571   }
11572   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
11573                               DAG.getUNDEF(VT), NewMask);
11574 }
11575 
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
///
/// Looks for an ADD of the address operand and, when it matches the memory
/// access size (or the access is small enough to allow a register update),
/// replaces both the memory node and the ADD with a single _UPD node that
/// produces the updated base address as an extra result.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  // Operand layout: intrinsics are (chain, intrinsic-id, addr, ...), stores
  // are (chain, value, addr, ...), loads and VLDxDUP are (chain, addr, ...).
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  SDValue Addr = N->getOperand(AddrOpIdx);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle.
    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    bool isLaneOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    if (isIntrinsic) {
      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      switch (IntNo) {
      default: llvm_unreachable("unexpected intrinsic for Neon base update");
      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; break;
      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
        NumVecs = 2; break;
      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
        NumVecs = 3; break;
      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
        NumVecs = 4; break;
      case Intrinsic::arm_neon_vld2dup:
      case Intrinsic::arm_neon_vld3dup:
      case Intrinsic::arm_neon_vld4dup:
        // TODO: Support updating VLDxDUP nodes. For now, we just skip
        // combining base updates for such intrinsics.
        continue;
      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
        NumVecs = 2; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
        NumVecs = 3; isLaneOp = true; break;
      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
        NumVecs = 4; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
        NumVecs = 2; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
        NumVecs = 3; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
        NumVecs = 4; isLoadOp = false; break;
      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
      }
    } else {
      isLaneOp = true;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode for Neon base update");
      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
        NumVecs = 1; isLaneOp = false; break;
      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
      }
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else if (isIntrinsic) {
      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
    } else {
      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
      VecTy = N->getOperand(1).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
    // Lane operations access only a single element per vector.
    if (isLaneOp)
      NumBytes /= VecTy.getVectorNumElements();

    // If the increment is a constant, it must match the memory ref size.
    // The increment is whichever ADD operand is not the address itself.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
      // separate instructions that make it harder to use a non-constant update.
      continue;
    }

    // OK, we found an ADD we can fold into the base update.
    // Now, create a _UPD node, taking care of not breaking alignment.

    EVT AlignedVecTy = VecTy;
    unsigned Alignment = MemN->getAlignment();

    // If this is a less-than-standard-aligned load/store, change the type to
    // match the standard alignment.
    // The alignment is overlooked when selecting _UPD variants; and it's
    // easier to introduce bitcasts here than fix that.
    // There are 3 ways to get to this base-update combine:
    // - intrinsics: they are assumed to be properly aligned (to the standard
    //   alignment of the memory type), so we don't need to do anything.
    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
    //   intrinsics, so, likewise, there's nothing to do.
    // - generic load/store instructions: the alignment is specified as an
    //   explicit operand, rather than implicitly as the standard alignment
    //   of the memory type (like the intrisics).  We need to change the
    //   memory type to match the explicit alignment.  That way, we don't
    //   generate non-standard-aligned ARMISD::VLDx nodes.
    if (isa<LSBaseSDNode>(N)) {
      if (Alignment == 0)
        Alignment = 1;
      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
        assert(!isLaneOp && "Unexpected generic load/store lane.");
        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
      }
      // Don't set an explicit alignment on regular load/stores that we want
      // to transform to VLD/VST 1_UPD nodes.
      // This matches the behavior of regular load/stores, which only get an
      // explicit alignment if the MMO alignment is larger than the standard
      // alignment of the memory type.
      // Intrinsics, however, always get an explicit alignment, set to the
      // alignment of the MMO.
      Alignment = 1;
    }

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    // Results: up to NumVecs loaded vectors, the updated base address (i32),
    // and the output chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = AlignedVecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(AddrOpIdx));
    Ops.push_back(Inc);

    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
      // Try to match the intrinsic's signature
      Ops.push_back(StN->getValue());
    } else {
      // Loads (and of course intrinsics) match the intrinsics' signature,
      // so just add all but the alignment operand.
      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
        Ops.push_back(N->getOperand(i));
    }

    // For all node types, the alignment operand is always the last one.
    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

    // If this is a non-standard-aligned STORE, the penultimate operand is the
    // stored value.  Bitcast it to the aligned type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
      SDValue &StVal = Ops[Ops.size()-2];
      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
    }

    // Lane operations transfer a single element's worth of memory.
    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    // If this is an non-standard-aligned LOAD, the first result is the loaded
    // value.  Bitcast it to the expected result type.
    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    // Replace the ADD with the _UPD node's updated-address result.
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    // Stop after folding the first suitable increment.
    break;
  }
  return SDValue();
}
11789 
11790 static SDValue PerformVLDCombine(SDNode *N,
11791                                  TargetLowering::DAGCombinerInfo &DCI) {
11792   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11793     return SDValue();
11794 
11795   return CombineBaseUpdate(N, DCI);
11796 }
11797 
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.
///
/// On success, all VDUPLANE users and the original intrinsic are rewritten
/// through DCI.CombineTo, so the caller only needs to signal that a change
/// happened.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  // The lane number sits at operand index NumVecs+3 (after the chain, the
  // intrinsic id, the address, and the NumVecs source vectors).
  unsigned VLDLaneNo =
    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  // Result types: NumVecs vectors of type VT, plus the chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  // The new node takes only the chain and the address operand.
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    // Each VDUPLANE user is replaced by the corresponding vldN-dup result.
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}
11878 
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
///
/// Two rewrites: fold vldN-lane sources into vldN-dup operations (see
/// CombineVLDDUP), and drop VDUPLANEs of VMOVIMM/VMVNIMM splats, which are
/// already splats and only need a bitcast to the result type.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op = N->getOperand(0);

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    // The uses were rewritten via DCI; returning N reports that a combine
    // took place.
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant.  Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  // A decoded value of 0 (zero splat) is treated as the smallest element
  // size, so the size check below never rejects it.
  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
    EltSize = 8;
  EVT VT = N->getValueType(0);
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  // The splat already has the right value in every lane; a bitcast to the
  // VDUPLANE's result type suffices.
  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
11910 
11911 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
11912 static SDValue PerformVDUPCombine(SDNode *N,
11913                                   TargetLowering::DAGCombinerInfo &DCI) {
11914   SelectionDAG &DAG = DCI.DAG;
11915   SDValue Op = N->getOperand(0);
11916 
11917   // Match VDUP(LOAD) -> VLD1DUP.
11918   // We match this pattern here rather than waiting for isel because the
11919   // transform is only legal for unindexed loads.
11920   LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
11921   if (LD && Op.hasOneUse() && LD->isUnindexed() &&
11922       LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
11923     SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
11924                       DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
11925     SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
11926     SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
11927                                              Ops, LD->getMemoryVT(),
11928                                              LD->getMemOperand());
11929     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
11930     return VLDDup;
11931   }
11932 
11933   return SDValue();
11934 }
11935 
11936 static SDValue PerformLOADCombine(SDNode *N,
11937                                   TargetLowering::DAGCombinerInfo &DCI) {
11938   EVT VT = N->getValueType(0);
11939 
11940   // If this is a legal vector load, try to combine it into a VLD1_UPD.
11941   if (ISD::isNormalLoad(N) && VT.isVector() &&
11942       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
11943     return CombineBaseUpdate(N, DCI);
11944 
11945   return SDValue();
11946 }
11947 
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
///
/// Handles three transformations:
///  - truncating vector stores: shuffle the narrowed elements to the bottom
///    of the register, then emit one or more full-width scalar stores;
///  - stores of a VMOVDRR: split into two i32 stores to avoid mixing NEON
///    and ARM stores of arguments in the same cache line;
///  - i64 stores of a vector-extracted element: go through f64 so the i64 is
///    not legalized into a pair of i32 values.
/// Finally, legal vector stores are considered for base-update combining.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
  // pack all of the elements in one place.  Next, store to memory in fewer
  // chunks.
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getScalarSizeInBits();
    unsigned ToEltSz = StVT.getScalarSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

    unsigned SizeRatio  = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDLoc DL(St);
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    // Pick the narrow subelement that holds the truncated value: the last
    // subelement of each wide element on big-endian, the first otherwise.
    for (unsigned i = 0; i < NumElems; ++i)
      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                          ? (i + 1) * SizeRatio - 1
                          : i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                DAG.getUNDEF(WideVec.getValueType()),
                                ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                        TLI.getPointerTy(DAG.getDataLayout()));
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I, DL));
      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                St->getPointerInfo(), St->getAlignment(),
                                St->getMemOperand()->getFlags());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    // Tie the independent stores together with a TokenFactor.
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG  &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    // The second half lives at offset 4; its alignment can be at most 4 and
    // at most half the original store's alignment.
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo(),
                        std::min(4U, St->getAlignment() / 2),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}
12094 
12095 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
12096 /// can replace combinations of VMUL and VCVT (floating-point to integer)
12097 /// when the VMUL has a constant operand that is a power of 2.
12098 ///
12099 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
12100 ///  vmul.f32        d16, d17, d16
12101 ///  vcvt.s32.f32    d16, d16
12102 /// becomes:
12103 ///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  // The fixed-point VCVT forms are NEON-only.
  if (!Subtarget->hasNEON())
    return SDValue();

  // The conversion's operand must be a simple-typed vector FMUL.
  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  // The multiplier must be a build_vector (checked for a power-of-2 splat
  // below).
  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  // The splat must be an FP power of two, 2^C with 1 <= C <= 32;
  // getConstantFPSplatPow2ToLog2Int returns -1 for anything else.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
    Intrinsic::arm_neon_vcvtfp2fxu;
  // Build the fixed-point conversion intrinsic on the FMUL's other operand,
  // always producing a 32-bit integer element type.
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  // Narrow back down if the original destination had sub-32-bit elements.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}
12151 
12152 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
12153 /// can replace combinations of VCVT (integer to floating-point) and VDIV
12154 /// when the VDIV has a constant operand that is a power of 2.
12155 ///
12156 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
12157 ///  vcvt.f32.s32    d16, d16
12158 ///  vdiv.f32        d16, d17, d16
12159 /// becomes:
12160 ///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  // The fixed-point VCVT forms are NEON-only.
  if (!Subtarget->hasNEON())
    return SDValue();

  // The dividend must be an int-to-FP conversion, and the FDIV's result a
  // simple-typed vector.
  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  // The divisor must be a build_vector (checked for a power-of-2 splat
  // below).
  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle more than 4 lanes, since these
    // instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  // The splat must be an FP power of two, 2^C with 1 <= C <= 32;
  // getConstantFPSplatPow2ToLog2Int returns -1 for anything else.
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  // Widen sub-32-bit integer sources to i32 first, matching the signedness
  // of the original conversion.
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
    Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
12210 
/// getVShiftImm - Check if this is a valid build_vector for the immediate
12212 /// operand of a vector shift operation, where all the elements of the
12213 /// build_vector must have the same constant integer value.
12214 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
12215   // Ignore bit_converts.
12216   while (Op.getOpcode() == ISD::BITCAST)
12217     Op = Op.getOperand(0);
12218   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
12219   APInt SplatBits, SplatUndef;
12220   unsigned SplatBitSize;
12221   bool HasAnyUndefs;
12222   if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
12223                                       HasAnyUndefs, ElementBits) ||
12224       SplatBitSize > ElementBits)
12225     return false;
12226   Cnt = SplatBits.getSExtValue();
12227   return true;
12228 }
12229 
12230 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
12231 /// operand of a vector shift left operation.  That value must be in the range:
12232 ///   0 <= Value < ElementBits for a left shift; or
12233 ///   0 <= Value <= ElementBits for a long left shift.
12234 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
12235   assert(VT.isVector() && "vector shift count is not a vector type");
12236   int64_t ElementBits = VT.getScalarSizeInBits();
12237   if (! getVShiftImm(Op, ElementBits, Cnt))
12238     return false;
12239   return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
12240 }
12241 
12242 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
12243 /// operand of a vector shift right operation.  For a shift opcode, the value
12244 /// is positive, but for an intrinsic the value count must be negative. The
12245 /// absolute value must be in the range:
12246 ///   1 <= |Value| <= ElementBits for a right shift; or
12247 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
12248 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
12249                          int64_t &Cnt) {
12250   assert(VT.isVector() && "vector shift count is not a vector type");
12251   int64_t ElementBits = VT.getScalarSizeInBits();
12252   if (! getVShiftImm(Op, ElementBits, Cnt))
12253     return false;
12254   if (!isIntrinsic)
12255     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
12256   if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
12257     Cnt = -Cnt;
12258     return true;
12259   }
12260   return false;
12261 }
12262 
12263 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First pass: validate the immediate shift amount.  For vshifts/vshiftu
    // the opcode is also chosen here, since those intrinsics cover both
    // directions (left for non-negative counts, right for negative ones).
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHL;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                     ARMISD::VSHRs : ARMISD::VSHRu);
        break;
      }
      // Not an immediate shift; leave the intrinsic as-is.
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      // vqshlu only exists with an immediate; anything else is a frontend
      // bug.
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second pass: map each remaining intrinsic to its ARMISD node.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRs; break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRu; break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRN; break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLs; break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLu; break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsu; break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNs; break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNu; break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsu; break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNs; break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNu; break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsu; break;
    }

    // Replace the intrinsic with the target node carrying the immediate.
    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // Shift-insert: left immediate selects VSLI, right immediate VSRI.
    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLI;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRI;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}
12405 
12406 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
12407 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
12408 /// combining instead of DAG legalizing because the build_vectors for 64-bit
12409 /// vector element shift counts are generally not legal, and it is hard to see
12410 /// their values after they get legalized to loads from a constant pool.
12411 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
12412                                    const ARMSubtarget *ST) {
12413   EVT VT = N->getValueType(0);
12414   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
12415     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
12416     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
12417     SDValue N1 = N->getOperand(1);
12418     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
12419       SDValue N0 = N->getOperand(0);
12420       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
12421           DAG.MaskedValueIsZero(N0.getOperand(0),
12422                                 APInt::getHighBitsSet(32, 16)))
12423         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
12424     }
12425   }
12426 
12427   // Nothing to be done for scalar shifts.
12428   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12429   if (!VT.isVector() || !TLI.isTypeLegal(VT))
12430     return SDValue();
12431 
12432   assert(ST->hasNEON() && "unexpected vector shift");
12433   int64_t Cnt;
12434 
12435   switch (N->getOpcode()) {
12436   default: llvm_unreachable("unexpected shift opcode");
12437 
12438   case ISD::SHL:
12439     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
12440       SDLoc dl(N);
12441       return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
12442                          DAG.getConstant(Cnt, dl, MVT::i32));
12443     }
12444     break;
12445 
12446   case ISD::SRA:
12447   case ISD::SRL:
12448     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
12449       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
12450                             ARMISD::VSHRs : ARMISD::VSHRu);
12451       SDLoc dl(N);
12452       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
12453                          DAG.getConstant(Cnt, dl, MVT::i32));
12454     }
12455   }
12456   return SDValue();
12457 }
12458 
12459 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
12460 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
12461 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
12462                                     const ARMSubtarget *ST) {
12463   SDValue N0 = N->getOperand(0);
12464 
12465   // Check for sign- and zero-extensions of vector extract operations of 8-
12466   // and 16-bit vector elements.  NEON supports these directly.  They are
12467   // handled during DAG combining because type legalization will promote them
12468   // to 32-bit types and it is messy to recognize the operations after that.
12469   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12470     SDValue Vec = N0.getOperand(0);
12471     SDValue Lane = N0.getOperand(1);
12472     EVT VT = N->getValueType(0);
12473     EVT EltVT = N0.getValueType();
12474     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12475 
12476     if (VT == MVT::i32 &&
12477         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
12478         TLI.isTypeLegal(Vec.getValueType()) &&
12479         isa<ConstantSDNode>(Lane)) {
12480 
12481       unsigned Opc = 0;
12482       switch (N->getOpcode()) {
12483       default: llvm_unreachable("unexpected opcode");
12484       case ISD::SIGN_EXTEND:
12485         Opc = ARMISD::VGETLANEs;
12486         break;
12487       case ISD::ZERO_EXTEND:
12488       case ISD::ANY_EXTEND:
12489         Opc = ARMISD::VGETLANEu;
12490         break;
12491       }
12492       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
12493     }
12494   }
12495 
12496   return SDValue();
12497 }
12498 
12499 static const APInt *isPowerOf2Constant(SDValue V) {
12500   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
12501   if (!C)
12502     return nullptr;
12503   const APInt *CV = &C->getAPIntValue();
12504   return CV->isPowerOf2() ? CV : nullptr;
12505 }
12506 
SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  // CMOV operands used here: 0/1 = selected values, 2 = condition code,
  // 4 = the compare feeding the select.
  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  // The condition must test a single bit: (and x, CN) with CN a power of 2.
  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  // The taken value must be (or y, CM) with CM constant, and the not-taken
  // value must be y itself, matching the "y |= CM" shape above.
  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  // One BFI is emitted per set bit of CM; Thumb tolerates one more (see the
  // IT-instruction note above).
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known;
  DAG.computeKnownBits(Y, Known);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first so the tested bit sits at bit 0.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  // Insert bit 0 of the shifted X into each set-bit position of CM in Y,
  // one BFI node per bit.
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}
12597 
12598 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
12599 SDValue
12600 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
12601   SDValue Cmp = N->getOperand(4);
12602   if (Cmp.getOpcode() != ARMISD::CMPZ)
12603     // Only looking at NE cases.
12604     return SDValue();
12605 
12606   EVT VT = N->getValueType(0);
12607   SDLoc dl(N);
12608   SDValue LHS = Cmp.getOperand(0);
12609   SDValue RHS = Cmp.getOperand(1);
12610   SDValue Chain = N->getOperand(0);
12611   SDValue BB = N->getOperand(1);
12612   SDValue ARMcc = N->getOperand(2);
12613   ARMCC::CondCodes CC =
12614     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
12615 
12616   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
12617   // -> (brcond Chain BB CC CPSR Cmp)
12618   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
12619       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
12620       LHS->getOperand(0)->hasOneUse()) {
12621     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
12622     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
12623     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
12624     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
12625     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
12626         (LHS01C && LHS01C->getZExtValue() == 1) &&
12627         (LHS1C && LHS1C->getZExtValue() == 1) &&
12628         (RHSC && RHSC->getZExtValue() == 0)) {
12629       return DAG.getNode(
12630           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
12631           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
12632     }
12633   }
12634 
12635   return SDValue();
12636 }
12637 
12638 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  // CMOV operands used here: 0 = false value, 1 = true value, 2 = condition
  // code, 3 = passed through unchanged below, 4 = the compare.
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  // The remaining combines only apply to integer selects.
  if (!VT.isInteger())
      return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        // FalseVal is the zero constant here, so Neg computes 0 - Sub.
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && LHS != RHS &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), Cmp);
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && LHS != RHS &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
      SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), Cmp);
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
  //       merge t3, t4
  // where t1 = (SUBCARRY (SUB x, y), z, 0)
  //       t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  //       t3 = if K != 0 then (SHL t2:0, K) else t2:0
  //       t4 = (SUB 1, t2:1)   [ we want a carry, not a borrow ]
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
      (FalseVal.getOperand(1) == RHS) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      // Work with z == 1 and shift the result back up by K at the end.
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
    // Make it a carry, not a borrow.
    SDValue Carry = DAG.getNode(
        ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
    Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known;
    DAG.computeKnownBits(SDValue(N,0), Known);
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
12806 
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  // Central dispatch for ARM-specific DAG combines.  Most opcodes are routed
  // to a dedicated Perform*Combine helper; the SMULW*/SMLAL* cases below are
  // handled inline by shrinking the demanded bits of their 16-bit operands.
  switch (N->getOpcode()) {
  default: break;
  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ARMISD::SMULWB: {
    // SMULWB only reads the low 16 bits of operand 1; try to simplify the
    // operand accordingly.  On success the node was updated in place, so
    // return an empty SDValue to signal "no new node".
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    // SMULWT only reads the high 16 bits of operand 1.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB: {
    // Both multiplicands of SMLALBB use only their low 16 bits.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    // SMLALBT uses the low half of operand 0 and the high half of operand 1.
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    // SMLALTB uses the high half of operand 0 and the low half of operand 1.
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    // Both multiplicands of SMLALTT use only their high 16 bits.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // All NEON vector load/store intrinsics share the common VLD/VST combine.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}
12936 
12937 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
12938                                                           EVT VT) const {
12939   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
12940 }
12941 
12942 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
12943                                                        unsigned,
12944                                                        unsigned,
12945                                                        bool *Fast) const {
12946   // Depends what it gets converted into if the type is weird.
12947   if (!VT.isSimple())
12948     return false;
12949 
12950   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
12951   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
12952 
12953   switch (VT.getSimpleVT().SimpleTy) {
12954   default:
12955     return false;
12956   case MVT::i8:
12957   case MVT::i16:
12958   case MVT::i32: {
12959     // Unaligned access can use (for example) LRDB, LRDH, LDR
12960     if (AllowsUnaligned) {
12961       if (Fast)
12962         *Fast = Subtarget->hasV7Ops();
12963       return true;
12964     }
12965     return false;
12966   }
12967   case MVT::f64:
12968   case MVT::v2f64: {
12969     // For any little-endian targets with neon, we can support unaligned ld/st
12970     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
12971     // A big-endian target may also explicitly support unaligned accesses
12972     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
12973       if (Fast)
12974         *Fast = true;
12975       return true;
12976     }
12977     return false;
12978   }
12979   }
12980 }
12981 
// Returns true when both alignments are either unknown (0) or a multiple of
// AlignCheck, i.e. the memory operation can be performed at that alignment.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  auto IsAligned = [AlignCheck](unsigned Align) {
    return Align == 0 || Align % AlignCheck == 0;
  };
  return IsAligned(SrcAlign) && IsAligned(DstAlign);
}
12987 
12988 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
12989                                            unsigned DstAlign, unsigned SrcAlign,
12990                                            bool IsMemset, bool ZeroMemset,
12991                                            bool MemcpyStrSrc,
12992                                            MachineFunction &MF) const {
12993   const Function &F = MF.getFunction();
12994 
12995   // See if we can use NEON instructions for this...
12996   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
12997       !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
12998     bool Fast;
12999     if (Size >= 16 &&
13000         (memOpAlign(SrcAlign, DstAlign, 16) ||
13001          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
13002       return MVT::v2f64;
13003     } else if (Size >= 8 &&
13004                (memOpAlign(SrcAlign, DstAlign, 8) ||
13005                 (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
13006                  Fast))) {
13007       return MVT::f64;
13008     }
13009   }
13010 
13011   // Let the target-independent logic figure it out.
13012   return MVT::Other;
13013 }
13014 
13015 // 64-bit integers are split into their high and low parts and held in two
13016 // different registers, so the trunc is free since the low register can just
13017 // be used.
13018 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
13019   if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
13020     return false;
13021   unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
13022   unsigned DestBits = DstTy->getPrimitiveSizeInBits();
13023   return (SrcBits == 64 && DestBits == 32);
13024 }
13025 
13026 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
13027   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
13028       !DstVT.isInteger())
13029     return false;
13030   unsigned SrcBits = SrcVT.getSizeInBits();
13031   unsigned DestBits = DstVT.getSizeInBits();
13032   return (SrcBits == 64 && DestBits == 32);
13033 }
13034 
13035 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13036   if (Val.getOpcode() != ISD::LOAD)
13037     return false;
13038 
13039   EVT VT1 = Val.getValueType();
13040   if (!VT1.isSimple() || !VT1.isInteger() ||
13041       !VT2.isSimple() || !VT2.isInteger())
13042     return false;
13043 
13044   switch (VT1.getSimpleVT().SimpleTy) {
13045   default: break;
13046   case MVT::i1:
13047   case MVT::i8:
13048   case MVT::i16:
13049     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
13050     return true;
13051   }
13052 
13053   return false;
13054 }
13055 
13056 bool ARMTargetLowering::isFNegFree(EVT VT) const {
13057   if (!VT.isSimple())
13058     return false;
13059 
13060   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
13061   // negate values directly (fneg is free). So, we don't want to let the DAG
13062   // combiner rewrite fneg into xors and some other instructions.  For f16 and
13063   // FullFP16 argument passing, some bitcast nodes may be introduced,
13064   // triggering this DAG combine rewrite, so we are avoiding that with this.
13065   switch (VT.getSimpleVT().SimpleTy) {
13066   default: break;
13067   case MVT::f16:
13068     return Subtarget->hasFullFP16();
13069   }
13070 
13071   return false;
13072 }
13073 
13074 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
13075   EVT VT = ExtVal.getValueType();
13076 
13077   if (!isTypeLegal(VT))
13078     return false;
13079 
13080   // Don't create a loadext if we can fold the extension into a wide/long
13081   // instruction.
13082   // If there's more than one user instruction, the loadext is desirable no
13083   // matter what.  There can be two uses by the same instruction.
13084   if (ExtVal->use_empty() ||
13085       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
13086     return true;
13087 
13088   SDNode *U = *ExtVal->use_begin();
13089   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
13090        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
13091     return false;
13092 
13093   return true;
13094 }
13095 
13096 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
13097   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13098     return false;
13099 
13100   if (!isTypeLegal(EVT::getEVT(Ty1)))
13101     return false;
13102 
13103   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
13104 
13105   // Assuming the caller doesn't have a zeroext or signext return parameter,
13106   // truncation all the way down to i1 is valid.
13107   return true;
13108 }
13109 
13110 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
13111                                                 const AddrMode &AM, Type *Ty,
13112                                                 unsigned AS) const {
13113   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
13114     if (Subtarget->hasFPAO())
13115       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
13116     return 0;
13117   }
13118   return -1;
13119 }
13120 
13121 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
13122   if (V < 0)
13123     return false;
13124 
13125   unsigned Scale = 1;
13126   switch (VT.getSimpleVT().SimpleTy) {
13127   default: return false;
13128   case MVT::i1:
13129   case MVT::i8:
13130     // Scale == 1;
13131     break;
13132   case MVT::i16:
13133     // Scale == 2;
13134     Scale = 2;
13135     break;
13136   case MVT::i32:
13137     // Scale == 4;
13138     Scale = 4;
13139     break;
13140   }
13141 
13142   if ((V & (Scale - 1)) != 0)
13143     return false;
13144   V /= Scale;
13145   return V == (V & ((1LL << 5) - 1));
13146 }
13147 
13148 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
13149                                       const ARMSubtarget *Subtarget) {
13150   bool isNeg = false;
13151   if (V < 0) {
13152     isNeg = true;
13153     V = - V;
13154   }
13155 
13156   switch (VT.getSimpleVT().SimpleTy) {
13157   default: return false;
13158   case MVT::i1:
13159   case MVT::i8:
13160   case MVT::i16:
13161   case MVT::i32:
13162     // + imm12 or - imm8
13163     if (isNeg)
13164       return V == (V & ((1LL << 8) - 1));
13165     return V == (V & ((1LL << 12) - 1));
13166   case MVT::f32:
13167   case MVT::f64:
13168     // Same as ARM mode. FIXME: NEON?
13169     if (!Subtarget->hasVFP2())
13170       return false;
13171     if ((V & 3) != 0)
13172       return false;
13173     V >>= 2;
13174     return V == (V & ((1LL << 8) - 1));
13175   }
13176 }
13177 
13178 /// isLegalAddressImmediate - Return true if the integer value can be used
13179 /// as the offset of the target addressing mode for load / store of the
13180 /// given type.
13181 static bool isLegalAddressImmediate(int64_t V, EVT VT,
13182                                     const ARMSubtarget *Subtarget) {
13183   if (V == 0)
13184     return true;
13185 
13186   if (!VT.isSimple())
13187     return false;
13188 
13189   if (Subtarget->isThumb1Only())
13190     return isLegalT1AddressImmediate(V, VT);
13191   else if (Subtarget->isThumb2())
13192     return isLegalT2AddressImmediate(V, VT, Subtarget);
13193 
13194   // ARM mode.
13195   if (V < 0)
13196     V = - V;
13197   switch (VT.getSimpleVT().SimpleTy) {
13198   default: return false;
13199   case MVT::i1:
13200   case MVT::i8:
13201   case MVT::i32:
13202     // +- imm12
13203     return V == (V & ((1LL << 12) - 1));
13204   case MVT::i16:
13205     // +- imm8
13206     return V == (V & ((1LL << 8) - 1));
13207   case MVT::f32:
13208   case MVT::f64:
13209     if (!Subtarget->hasVFP2()) // FIXME: NEON?
13210       return false;
13211     if ((V & 3) != 0)
13212       return false;
13213     V >>= 2;
13214     return V == (V & ((1LL << 8) - 1));
13215   }
13216 }
13217 
13218 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
13219                                                       EVT VT) const {
13220   int Scale = AM.Scale;
13221   if (Scale < 0)
13222     return false;
13223 
13224   switch (VT.getSimpleVT().SimpleTy) {
13225   default: return false;
13226   case MVT::i1:
13227   case MVT::i8:
13228   case MVT::i16:
13229   case MVT::i32:
13230     if (Scale == 1)
13231       return true;
13232     // r + r << imm
13233     Scale = Scale & ~1;
13234     return Scale == 2 || Scale == 4 || Scale == 8;
13235   case MVT::i64:
13236     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
13237     // version in Thumb mode.
13238     // r + r
13239     if (Scale == 1)
13240       return true;
13241     // r * 2 (this can be lowered to r + r).
13242     if (!AM.HasBaseReg && Scale == 2)
13243       return true;
13244     return false;
13245   case MVT::isVoid:
13246     // Note, we allow "void" uses (basically, uses that aren't loads or
13247     // stores), because arm allows folding a scale into many arithmetic
13248     // operations.  This should be made more precise and revisited later.
13249 
13250     // Allow r << imm, but the imm has to be a multiple of two.
13251     if (Scale & 1) return false;
13252     return isPowerOf2_32(Scale);
13253   }
13254 }
13255 
13256 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
13257                                                       EVT VT) const {
13258   const int Scale = AM.Scale;
13259 
13260   // Negative scales are not supported in Thumb1.
13261   if (Scale < 0)
13262     return false;
13263 
13264   // Thumb1 addressing modes do not support register scaling excepting the
13265   // following cases:
13266   // 1. Scale == 1 means no scaling.
13267   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
13268   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
13269 }
13270 
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  // First the immediate offset must be representable for this type/subtarget.
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    // Thumb modes have their own scaled-register rules.
    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    // ARM mode.
    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      // ARM-mode scales are sign-magnitude.
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations.  This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  // Scale == 0: legal as long as the immediate (checked above) fits.
  return true;
}
13334 
13335 /// isLegalICmpImmediate - Return true if the specified immediate is legal
13336 /// icmp immediate, that is the target has icmp instructions which can compare
13337 /// a register against the immediate without having to materialize the
13338 /// immediate into a register.
13339 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
13340   // Thumb2 and ARM modes can use cmn for negative immediates.
13341   if (!Subtarget->isThumb())
13342     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
13343            ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
13344   if (Subtarget->isThumb2())
13345     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
13346            ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
13347   // Thumb1 doesn't have cmn, and only 8-bit immediates.
13348   return Imm >= 0 && Imm <= 255;
13349 }
13350 
13351 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
13352 /// *or sub* immediate, that is the target has add or sub instructions which can
13353 /// add a register with the immediate without having to materialize the
13354 /// immediate into a register.
13355 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
13356   // Same encoding for add/sub, just flip the sign.
13357   int64_t AbsImm = std::abs(Imm);
13358   if (!Subtarget->isThumb())
13359     return ARM_AM::getSOImmVal(AbsImm) != -1;
13360   if (Subtarget->isThumb2())
13361     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
13362   // Thumb1 only has 8-bit unsigned immediate.
13363   return AbsImm >= 0 && AbsImm <= 255;
13364 }
13365 
/// getARMIndexedAddressParts - Try to split Ptr (an ADD or SUB computing the
/// address) into a Base and Offset usable by an ARM-mode indexed load/store
/// of type VT.  Returns true on success and reports increment vs decrement
/// through isInc.
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  // Only simple base+offset forms can become indexed accesses.
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3 (LDRH/LDRSH/LDRSB style): register or 8-bit immediate
    // offset.
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        // A negative constant should only appear under ADD here; the assert
        // documents that assumption.  Negate it and mark as a decrement.
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
    // Non-negative immediate or register offset: direction follows the
    // opcode.
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2 (LDR/LDRB style): register (optionally shifted) or
    // 12-bit immediate offset.
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // If the left operand is a shift, prefer it as the offset so the shift
      // can be folded into the addressing mode.
      ARM_AM::ShiftOpc ShOpcVal=
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    // SUB: plain base - offset decrement.
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}
13424 
13425 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
13426                                      bool isSEXTLoad, SDValue &Base,
13427                                      SDValue &Offset, bool &isInc,
13428                                      SelectionDAG &DAG) {
13429   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
13430     return false;
13431 
13432   Base = Ptr->getOperand(0);
13433   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
13434     int RHSC = (int)RHS->getZExtValue();
13435     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
13436       assert(Ptr->getOpcode() == ISD::ADD);
13437       isInc = false;
13438       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
13439       return true;
13440     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
13441       isInc = Ptr->getOpcode() == ISD::ADD;
13442       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
13443       return true;
13444     }
13445   }
13446 
13447   return false;
13448 }
13449 
13450 /// getPreIndexedAddressParts - returns true by value, base pointer and
13451 /// offset pointer and addressing mode by reference if the node's address
13452 /// can be legally represented as pre-indexed load / store address.
13453 bool
13454 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
13455                                              SDValue &Offset,
13456                                              ISD::MemIndexedMode &AM,
13457                                              SelectionDAG &DAG) const {
13458   if (Subtarget->isThumb1Only())
13459     return false;
13460 
13461   EVT VT;
13462   SDValue Ptr;
13463   bool isSEXTLoad = false;
13464   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
13465     Ptr = LD->getBasePtr();
13466     VT  = LD->getMemoryVT();
13467     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
13468   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
13469     Ptr = ST->getBasePtr();
13470     VT  = ST->getMemoryVT();
13471   } else
13472     return false;
13473 
13474   bool isInc;
13475   bool isLegal = false;
13476   if (Subtarget->isThumb2())
13477     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
13478                                        Offset, isInc, DAG);
13479   else
13480     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
13481                                         Offset, isInc, DAG);
13482   if (!isLegal)
13483     return false;
13484 
13485   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
13486   return true;
13487 }
13488 
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  // Extract the base pointer, memory type, and extension kind from the
  // load/store node N; Op is the ADD/SUB that updates the pointer.
  EVT VT;
  SDValue Ptr;
  bool isSEXTLoad = false, isNonExt;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT  = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT  = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
    isNonExt = !ST->isTruncatingStore();
  } else
    return false;

  if (Subtarget->isThumb1Only()) {
    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
    // must be non-extending/truncating, i32, with an offset of 4.
    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
    if (Op->getOpcode() != ISD::ADD || !isNonExt)
      return false;
    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!RHS || RHS->getZExtValue() != 4)
      return false;

    Offset = Op->getOperand(1);
    Base = Op->getOperand(0);
    AM = ISD::POST_INC;
    return true;
  }

  // ARM / Thumb2: decompose the pointer-update node with the mode-specific
  // helper.
  bool isInc;
  bool isLegal = false;
  if (Subtarget->isThumb2())
    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                       isInc, DAG);
  else
    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (!isLegal)
    return false;

  if (Ptr != Base) {
    // Swap base ptr and offset to catch more post-index load / store when
    // it's legal. In Thumb2 mode, offset must be an immediate.
    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
        !Subtarget->isThumb2())
      std::swap(Base, Offset);

    // Post-indexed load / store update the base pointer.
    if (Ptr != Base)
      return false;
  }

  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
13554 
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  // Report known-zero/known-one bits for ARM-specific nodes.  Known starts
  // fully unknown; cases that learn nothing simply break/return with it reset.
  unsigned BitWidth = Known.getBitWidth();
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::ADDC:
  case ARMISD::ADDE:
  case ARMISD::SUBC:
  case ARMISD::SUBE:
    // Special cases when we convert a carry to a boolean.
    if (Op.getResNo() == 0) {
      SDValue LHS = Op.getOperand(0);
      SDValue RHS = Op.getOperand(1);
      // (ADDE 0, 0, C) will give us a single bit.
      if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
          isNullConstant(RHS)) {
        Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
        return;
      }
    }
    break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
    if (Known.isUnknown())
      return;

    // Intersect with the other select arm: only bits known on both sides
    // survive.
    KnownBits KnownRHS;
    DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
    Known.Zero &= KnownRHS.Zero;
    Known.One  &= KnownRHS.One;
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::arm_ldaex:
    case Intrinsic::arm_ldrex: {
      // ldrex/ldaex of a narrow memory type zero-extends: all bits above the
      // loaded width are known zero.
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    // Note: every path in the inner switch returns, so control never falls
    // through into the BFI case below.
  }
  case ARMISD::BFI: {
    // Conservatively, we can recurse down the first operand
    // and just mask out all affected bits.
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);

    // The operand to BFI is already a mask suitable for removing the bits it
    // sets.
    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
    const APInt &Mask = CI->getAPIntValue();
    Known.Zero &= Mask;
    Known.One &= Mask;
    return;
  }
  }
}
13621 
13622 bool
13623 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
13624                                                 const APInt &DemandedAPInt,
13625                                                 TargetLoweringOpt &TLO) const {
13626   // Delay optimization, so we don't have to deal with illegal types, or block
13627   // optimizations.
13628   if (!TLO.LegalOps)
13629     return false;
13630 
13631   // Only optimize AND for now.
13632   if (Op.getOpcode() != ISD::AND)
13633     return false;
13634 
13635   EVT VT = Op.getValueType();
13636 
13637   // Ignore vectors.
13638   if (VT.isVector())
13639     return false;
13640 
13641   assert(VT == MVT::i32 && "Unexpected integer type");
13642 
13643   // Make sure the RHS really is a constant.
13644   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13645   if (!C)
13646     return false;
13647 
13648   unsigned Mask = C->getZExtValue();
13649 
13650   // If mask is zero, nothing to do.
13651   if (!Mask)
13652     return false;
13653 
13654   unsigned Demanded = DemandedAPInt.getZExtValue();
13655   unsigned ShrunkMask = Mask & Demanded;
13656   unsigned ExpandedMask = Mask | ~Demanded;
13657 
13658   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
13659     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
13660   };
13661   auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
13662     if (NewMask == Mask)
13663       return true;
13664     SDLoc DL(Op);
13665     SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
13666     SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
13667     return TLO.CombineTo(Op, NewOp);
13668   };
13669 
13670   // Prefer uxtb mask.
13671   if (IsLegalMask(0xFF))
13672     return UseMask(0xFF);
13673 
13674   // Prefer uxth mask.
13675   if (IsLegalMask(0xFFFF))
13676     return UseMask(0xFFFF);
13677 
13678   // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
13679   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
13680   if (ShrunkMask < 256)
13681     return UseMask(ShrunkMask);
13682 
13683   // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
13684   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
13685   if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
13686     return UseMask(ExpandedMask);
13687 
13688   // Potential improvements:
13689   //
13690   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
13691   // We could try to prefer Thumb1 immediates which can be lowered to a
13692   // two-instruction sequence.
13693   // We could try to recognize more legal ARM/Thumb2 immediates here.
13694 
13695   return false;
13696 }
13697 
13698 
13699 //===----------------------------------------------------------------------===//
13700 //                           ARM Inline Assembly Support
13701 //===----------------------------------------------------------------------===//
13702 
13703 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
13704   // Looking for "rev" which is V6+.
13705   if (!Subtarget->hasV6Ops())
13706     return false;
13707 
13708   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
13709   std::string AsmStr = IA->getAsmString();
13710   SmallVector<StringRef, 4> AsmPieces;
13711   SplitString(AsmStr, AsmPieces, ";\n");
13712 
13713   switch (AsmPieces.size()) {
13714   default: return false;
13715   case 1:
13716     AsmStr = AsmPieces[0];
13717     AsmPieces.clear();
13718     SplitString(AsmStr, AsmPieces, " \t,");
13719 
13720     // rev $0, $1
13721     if (AsmPieces.size() == 3 &&
13722         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
13723         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
13724       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
13725       if (Ty && Ty->getBitWidth() == 32)
13726         return IntrinsicLowering::LowerToByteSwap(CI);
13727     }
13728     break;
13729   }
13730 
13731   return false;
13732 }
13733 
13734 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13735   // At this point, we have to lower this constraint to something else, so we
13736   // lower it to an "r" or "w". However, by doing this we will force the result
13737   // to be in register, while the X constraint is much more permissive.
13738   //
13739   // Although we are correct (we are free to emit anything, without
13740   // constraints), we might break use cases that would expect us to be more
13741   // efficient and emit something else.
13742   if (!Subtarget->hasVFP2())
13743     return "r";
13744   if (ConstraintVT.isFloatingPoint())
13745     return "w";
13746   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
13747      (ConstraintVT.getSizeInBits() == 64 ||
13748       ConstraintVT.getSizeInBits() == 128))
13749     return "w";
13750 
13751   return "r";
13752 }
13753 
13754 /// getConstraintType - Given a constraint letter, return the type of
13755 /// constraint it is for this target.
13756 ARMTargetLowering::ConstraintType
13757 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
13758   if (Constraint.size() == 1) {
13759     switch (Constraint[0]) {
13760     default:  break;
13761     case 'l': return C_RegisterClass;
13762     case 'w': return C_RegisterClass;
13763     case 'h': return C_RegisterClass;
13764     case 'x': return C_RegisterClass;
13765     case 't': return C_RegisterClass;
13766     case 'j': return C_Other; // Constant for movw.
13767       // An address with a single base register. Due to the way we
13768       // currently handle addresses it is the same as an 'r' memory constraint.
13769     case 'Q': return C_Memory;
13770     }
13771   } else if (Constraint.size() == 2) {
13772     switch (Constraint[0]) {
13773     default: break;
13774     // All 'U+' constraints are addresses.
13775     case 'U': return C_Memory;
13776     }
13777   }
13778   return TargetLowering::getConstraintType(Constraint);
13779 }
13780 
13781 /// Examine constraint type and operand type and determine a weight value.
13782 /// This object must already have been set up with the operand type
13783 /// and the current alternative constraint selected.
13784 TargetLowering::ConstraintWeight
13785 ARMTargetLowering::getSingleConstraintMatchWeight(
13786     AsmOperandInfo &info, const char *constraint) const {
13787   ConstraintWeight weight = CW_Invalid;
13788   Value *CallOperandVal = info.CallOperandVal;
13789     // If we don't have a value, we can't do a match,
13790     // but allow it at the lowest weight.
13791   if (!CallOperandVal)
13792     return CW_Default;
13793   Type *type = CallOperandVal->getType();
13794   // Look at the constraint type.
13795   switch (*constraint) {
13796   default:
13797     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13798     break;
13799   case 'l':
13800     if (type->isIntegerTy()) {
13801       if (Subtarget->isThumb())
13802         weight = CW_SpecificReg;
13803       else
13804         weight = CW_Register;
13805     }
13806     break;
13807   case 'w':
13808     if (type->isFloatingPointTy())
13809       weight = CW_Register;
13810     break;
13811   }
13812   return weight;
13813 }
13814 
13815 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
13816 
13817 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
13818     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13819   if (Constraint.size() == 1) {
13820     // GCC ARM Constraint Letters
13821     switch (Constraint[0]) {
13822     case 'l': // Low regs or general regs.
13823       if (Subtarget->isThumb())
13824         return RCPair(0U, &ARM::tGPRRegClass);
13825       return RCPair(0U, &ARM::GPRRegClass);
13826     case 'h': // High regs or no regs.
13827       if (Subtarget->isThumb())
13828         return RCPair(0U, &ARM::hGPRRegClass);
13829       break;
13830     case 'r':
13831       if (Subtarget->isThumb1Only())
13832         return RCPair(0U, &ARM::tGPRRegClass);
13833       return RCPair(0U, &ARM::GPRRegClass);
13834     case 'w':
13835       if (VT == MVT::Other)
13836         break;
13837       if (VT == MVT::f32)
13838         return RCPair(0U, &ARM::SPRRegClass);
13839       if (VT.getSizeInBits() == 64)
13840         return RCPair(0U, &ARM::DPRRegClass);
13841       if (VT.getSizeInBits() == 128)
13842         return RCPair(0U, &ARM::QPRRegClass);
13843       break;
13844     case 'x':
13845       if (VT == MVT::Other)
13846         break;
13847       if (VT == MVT::f32)
13848         return RCPair(0U, &ARM::SPR_8RegClass);
13849       if (VT.getSizeInBits() == 64)
13850         return RCPair(0U, &ARM::DPR_8RegClass);
13851       if (VT.getSizeInBits() == 128)
13852         return RCPair(0U, &ARM::QPR_8RegClass);
13853       break;
13854     case 't':
13855       if (VT == MVT::Other)
13856         break;
13857       if (VT == MVT::f32 || VT == MVT::i32)
13858         return RCPair(0U, &ARM::SPRRegClass);
13859       if (VT.getSizeInBits() == 64)
13860         return RCPair(0U, &ARM::DPR_VFP2RegClass);
13861       if (VT.getSizeInBits() == 128)
13862         return RCPair(0U, &ARM::QPR_VFP2RegClass);
13863       break;
13864     }
13865   }
13866   if (StringRef("{cc}").equals_lower(Constraint))
13867     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
13868 
13869   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13870 }
13871 
13872 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13873 /// vector.  If it is invalid, don't add anything to Ops.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  // All of these constraint letters accept only immediate operands. In the
  // inner switch below, `break` means "CVal satisfies the constraint" and
  // falls through to materialize a target constant, while `return` rejects
  // the operand (nothing is appended to Ops, so the base-class fallback at
  // the bottom is also skipped).
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits.  Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops())
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.
          // NOTE(review): the check below accepts [-7, 6] (CVal < 7, not
          // <= 7), which disagrees with the comment above; this matches
          // longstanding upstream behavior — confirm before changing.
          if (CVal >= -7 && CVal < 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32.  This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          // ((CVal & (CVal - 1)) == 0) is the power-of-two test; it is also
          // true for 0, which the first clause already accepts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    // CVal satisfied its constraint: emit it as a target constant.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  // Unrecognized constraint letters fall back to the target-independent
  // handling.
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
14038 
14039 static RTLIB::Libcall getDivRemLibcall(
14040     const SDNode *N, MVT::SimpleValueType SVT) {
14041   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
14042           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
14043          "Unhandled Opcode in getDivRemLibcall");
14044   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
14045                   N->getOpcode() == ISD::SREM;
14046   RTLIB::Libcall LC;
14047   switch (SVT) {
14048   default: llvm_unreachable("Unexpected request for libcall!");
14049   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
14050   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
14051   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
14052   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
14053   }
14054   return LC;
14055 }
14056 
14057 static TargetLowering::ArgListTy getDivRemArgList(
14058     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
14059   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
14060           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
14061          "Unhandled Opcode in getDivRemArgList");
14062   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
14063                   N->getOpcode() == ISD::SREM;
14064   TargetLowering::ArgListTy Args;
14065   TargetLowering::ArgListEntry Entry;
14066   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
14067     EVT ArgVT = N->getOperand(i).getValueType();
14068     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
14069     Entry.Node = N->getOperand(i);
14070     Entry.Ty = ArgTy;
14071     Entry.IsSExt = isSigned;
14072     Entry.IsZExt = !isSigned;
14073     Args.push_back(Entry);
14074   }
14075   if (Subtarget->isTargetWindows() && Args.size() >= 2)
14076     std::swap(Args[0], Args[1]);
14077   return Args;
14078 }
14079 
14080 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
14081   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
14082           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
14083           Subtarget->isTargetWindows()) &&
14084          "Register-based DivRem lowering only");
14085   unsigned Opcode = Op->getOpcode();
14086   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
14087          "Invalid opcode for Div/Rem lowering");
14088   bool isSigned = (Opcode == ISD::SDIVREM);
14089   EVT VT = Op->getValueType(0);
14090   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
14091   SDLoc dl(Op);
14092 
14093   // If the target has hardware divide, use divide + multiply + subtract:
14094   //     div = a / b
14095   //     rem = a - b * div
14096   //     return {div, rem}
14097   // This should be lowered into UDIV/SDIV + MLS later on.
14098   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
14099                                         : Subtarget->hasDivideInARMMode();
14100   if (hasDivide && Op->getValueType(0).isSimple() &&
14101       Op->getSimpleValueType(0) == MVT::i32) {
14102     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
14103     const SDValue Dividend = Op->getOperand(0);
14104     const SDValue Divisor = Op->getOperand(1);
14105     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
14106     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
14107     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
14108 
14109     SDValue Values[2] = {Div, Rem};
14110     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
14111   }
14112 
14113   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
14114                                        VT.getSimpleVT().SimpleTy);
14115   SDValue InChain = DAG.getEntryNode();
14116 
14117   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
14118                                                     DAG.getContext(),
14119                                                     Subtarget);
14120 
14121   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
14122                                          getPointerTy(DAG.getDataLayout()));
14123 
14124   Type *RetTy = StructType::get(Ty, Ty);
14125 
14126   if (Subtarget->isTargetWindows())
14127     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
14128 
14129   TargetLowering::CallLoweringInfo CLI(DAG);
14130   CLI.setDebugLoc(dl).setChain(InChain)
14131     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
14132     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
14133 
14134   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
14135   return CallInfo.first;
14136 }
14137 
14138 // Lowers REM using divmod helpers
14139 // see RTABI section 4.2/4.3
14140 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
14141   // Build return types (div and rem)
14142   std::vector<Type*> RetTyParams;
14143   Type *RetTyElement;
14144 
14145   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
14146   default: llvm_unreachable("Unexpected request for libcall!");
14147   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
14148   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
14149   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
14150   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
14151   }
14152 
14153   RetTyParams.push_back(RetTyElement);
14154   RetTyParams.push_back(RetTyElement);
14155   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
14156   Type *RetTy = StructType::get(*DAG.getContext(), ret);
14157 
14158   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
14159                                                              SimpleTy);
14160   SDValue InChain = DAG.getEntryNode();
14161   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
14162                                                     Subtarget);
14163   bool isSigned = N->getOpcode() == ISD::SREM;
14164   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
14165                                          getPointerTy(DAG.getDataLayout()));
14166 
14167   if (Subtarget->isTargetWindows())
14168     InChain = WinDBZCheckDenominator(DAG, N, InChain);
14169 
14170   // Lower call
14171   CallLoweringInfo CLI(DAG);
14172   CLI.setChain(InChain)
14173      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
14174      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
14175   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
14176 
14177   // Return second (rem) result operand (first contains div)
14178   SDNode *ResNode = CallResult.first.getNode();
14179   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
14180   return ResNode->getOperand(1);
14181 }
14182 
14183 SDValue
14184 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
14185   assert(Subtarget->isTargetWindows() && "unsupported target platform");
14186   SDLoc DL(Op);
14187 
14188   // Get the inputs.
14189   SDValue Chain = Op.getOperand(0);
14190   SDValue Size  = Op.getOperand(1);
14191 
14192   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14193           "no-stack-arg-probe")) {
14194     unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
14195     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
14196     Chain = SP.getValue(1);
14197     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
14198     if (Align)
14199       SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
14200                        DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
14201     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
14202     SDValue Ops[2] = { SP, Chain };
14203     return DAG.getMergeValues(Ops, DL);
14204   }
14205 
14206   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
14207                               DAG.getConstant(2, DL, MVT::i32));
14208 
14209   SDValue Flag;
14210   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
14211   Flag = Chain.getValue(1);
14212 
14213   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14214   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
14215 
14216   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
14217   Chain = NewSP.getValue(1);
14218 
14219   SDValue Ops[2] = { NewSP, Chain };
14220   return DAG.getMergeValues(Ops, DL);
14221 }
14222 
14223 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
14224   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
14225          "Unexpected type for custom-lowering FP_EXTEND");
14226 
14227   RTLIB::Libcall LC;
14228   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
14229 
14230   SDValue SrcVal = Op.getOperand(0);
14231   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
14232                      SDLoc(Op)).first;
14233 }
14234 
14235 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
14236   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
14237          Subtarget->isFPOnlySP() &&
14238          "Unexpected type for custom-lowering FP_ROUND");
14239 
14240   RTLIB::Libcall LC;
14241   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
14242 
14243   SDValue SrcVal = Op.getOperand(0);
14244   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
14245                      SDLoc(Op)).first;
14246 }
14247 
14248 bool
14249 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
14250   // The ARM target isn't yet aware of offsets.
14251   return false;
14252 }
14253 
14254 bool ARM::isBitFieldInvertedMask(unsigned v) {
14255   if (v == 0xffffffff)
14256     return false;
14257 
14258   // there can be 1's on either or both "outsides", all the "inside"
14259   // bits must be 0's
14260   return isShiftedMask_32(~v);
14261 }
14262 
14263 /// isFPImmLegal - Returns true if the target can instruction select the
14264 /// specified FP immediate natively. If false, the legalizer will
14265 /// materialize the FP immediate as a load from a constant pool.
14266 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
14267   if (!Subtarget->hasVFP3())
14268     return false;
14269   if (VT == MVT::f16 && Subtarget->hasFullFP16())
14270     return ARM_AM::getFP16Imm(Imm) != -1;
14271   if (VT == MVT::f32)
14272     return ARM_AM::getFP32Imm(Imm) != -1;
14273   if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
14274     return ARM_AM::getFP64Imm(Imm) != -1;
14275   return false;
14276 }
14277 
14278 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14279 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
14280 /// specified in the intrinsic calls.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  // For each recognized memory intrinsic, fill in Info (SDAG opcode, memory
  // VT, pointer operand, offset, alignment, and load/store flags) and return
  // true; return false for anything not modeled as a memory intrinsic here.
  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // The alignment is carried as the intrinsic's trailing constant argument.
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    // Unlike the plain vld* forms above, the pointer is the LAST argument,
    // and there is no explicit alignment argument (align set to 0 —
    // presumably "unspecified/default"; confirm against IntrinsicInfo's
    // contract).
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    // Sum the sizes of the leading vector arguments; stop at the first
    // non-vector argument (the trailing operands are not stored data).
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // The alignment is carried as the intrinsic's trailing constant argument.
    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    // No explicit alignment argument for vst1x* (align 0 — see vld1x* note).
    Info.align = 0;
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    // Exclusive/acquire loads: width and alignment come from the pointee
    // type; marked volatile so they are never reordered or folded away.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    // Exclusive/release stores: operand 0 is the value, operand 1 the
    // pointer.
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    // 64-bit exclusive stores: operands 0/1 are the value halves, operand 2
    // is the pointer.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    // 64-bit exclusive loads.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}
14421 
14422 /// Returns true if it is beneficial to convert a load of a constant
14423 /// to just the constant itself.
14424 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
14425                                                           Type *Ty) const {
14426   assert(Ty->isIntegerTy());
14427 
14428   unsigned Bits = Ty->getPrimitiveSizeInBits();
14429   if (Bits == 0 || Bits > 32)
14430     return false;
14431   return true;
14432 }
14433 
14434 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
14435                                                 unsigned Index) const {
14436   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
14437     return false;
14438 
14439   return (Index == 0 || Index == ResVT.getVectorNumElements());
14440 }
14441 
/// Emit the IR for a data memory barrier in \p Domain, using the dmb
/// intrinsic when the subtarget has one and falling back to the legacy
/// CP15 mcr encoding on older ARMv6 cores.
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
      // Operands encode "mcr p15, 0, rX, c7, c10, 5", the CP15 operation
      // those v6 cores use as their barrier.
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}
14470 
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
/// Emit the barrier (if any) required *before* atomic instruction \p Inst
/// with ordering \p Ord under the fence-based atomic lowering scheme.
/// Returns the fence instruction, or nullptr when none is needed.
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  // Seq_cst only needs a leading barrier when the instruction has a store
  // component; pure seq_cst loads fall into the "nothing to do" cases above
  // and get their barrier from emitTrailingFence.
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
14496 
14497 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
14498                                                   Instruction *Inst,
14499                                                   AtomicOrdering Ord) const {
14500   switch (Ord) {
14501   case AtomicOrdering::NotAtomic:
14502   case AtomicOrdering::Unordered:
14503     llvm_unreachable("Invalid fence: unordered/not-atomic");
14504   case AtomicOrdering::Monotonic:
14505   case AtomicOrdering::Release:
14506     return nullptr; // Nothing to do
14507   case AtomicOrdering::Acquire:
14508   case AtomicOrdering::AcquireRelease:
14509   case AtomicOrdering::SequentiallyConsistent:
14510     return makeDMB(Builder, ARM_MB::ISH);
14511   }
14512   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
14513 }
14514 
14515 // Loads and stores less than 64-bits are already atomic; ones above that
14516 // are doomed anyway, so defer to the default libcall and blame the OS when
14517 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
14518 // anything for those.
14519 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
14520   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
14521   return (Size == 64) && !Subtarget->isMClass();
14522 }
14523 
14524 // Loads and stores less than 64-bits are already atomic; ones above that
14525 // are doomed anyway, so defer to the default libcall and blame the OS when
14526 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
14527 // anything for those.
14528 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
14529 // guarantee, see DDI0406C ARM architecture reference manual,
14530 // sections A8.8.72-74 LDRD)
14531 TargetLowering::AtomicExpansionKind
14532 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
14533   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
14534   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
14535                                                   : AtomicExpansionKind::None;
14536 }
14537 
14538 // For the real atomic operations, we have ldrex/strex up to 32 bits,
14539 // and up to 64 bits on the non-M profiles
14540 TargetLowering::AtomicExpansionKind
14541 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
14542   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
14543   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
14544   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
14545              ? AtomicExpansionKind::LLSC
14546              : AtomicExpansionKind::None;
14547 }
14548 
14549 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
14550     AtomicCmpXchgInst *AI) const {
14551   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
14552   // implement cmpxchg without spilling. If the address being exchanged is also
14553   // on the stack and close enough to the spill slot, this can lead to a
14554   // situation where the monitor always gets cleared and the atomic operation
14555   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
14556   bool hasAtomicCmpXchg =
14557       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
14558   return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
14559 }
14560 
/// Whether atomic operations should be lowered using explicit fences;
/// simply forwards the InsertFencesForAtomic flag held by this lowering.
bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}
14565 
14566 // This has so far only been implemented for MachO.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  // LOAD_STACK_GUARD is only implemented for MachO targets so far (see the
  // comment above).
  return Subtarget->isTargetMachO();
}
14570 
14571 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
14572                                                   unsigned &Cost) const {
14573   // If we do not have NEON, vector types are not natively supported.
14574   if (!Subtarget->hasNEON())
14575     return false;
14576 
14577   // Floating point values and vector values map to the same register file.
14578   // Therefore, although we could do a store extract of a vector type, this is
14579   // better to leave at float as we have more freedom in the addressing mode for
14580   // those.
14581   if (VectorTy->isFPOrFPVectorTy())
14582     return false;
14583 
14584   // If the index is unknown at compile time, this is very expensive to lower
14585   // and it is not possible to combine the store with the extract.
14586   if (!isa<ConstantInt>(Idx))
14587     return false;
14588 
14589   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
14590   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
14591   // We can do a store + vector extract on any vector that fits perfectly in a D
14592   // or Q register.
14593   if (BitWidth == 64 || BitWidth == 128) {
14594     Cost = 0;
14595     return true;
14596   }
14597   return false;
14598 }
14599 
/// cttz is only cheap to speculate from ARMv6T2 onwards.
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}
14603 
/// ctlz is only cheap to speculate from ARMv6T2 onwards.
bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}
14607 
/// Emit an ldrex/ldaex (load-linked) of \p Addr with ordering \p Ord and
/// return the loaded value, recombining the {i32, i32} result of the
/// 64-bit variants into a single i64.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  // Acquire (or stronger) orderings select the ldaex flavour.
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    // The d-form intrinsics take an i8* operand.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    // On big-endian subtargets the two halves come back swapped.
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    // val64 = (Hi << 32) | Lo.
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
  }

  // 32-bit-or-smaller case: ldrex/ldaex returns i32, truncate back to the
  // pointee type.
  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldrex, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}
14643 
14644 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
14645     IRBuilder<> &Builder) const {
14646   if (!Subtarget->hasV7Ops())
14647     return;
14648   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
14649   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
14650 }
14651 
/// Emit a strex/stlex (store-conditional) of \p Val to \p Addr with
/// ordering \p Ord, splitting 64-bit values into the {i32, i32} form the
/// d-variants require. Returns the i32 success/failure result of the
/// intrinsic call.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  // Release (or stronger) orderings select the stlex flavour.
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    // Split the i64 into low/high i32 halves; big-endian subtargets want
    // them in the opposite order.
    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    // The d-form intrinsics take an i8* operand.
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  // 32-bit-or-smaller case: widen the value to the intrinsic's i32 operand.
  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}
14684 
14685 /// A helper function for determining the number of interleaved accesses we
14686 /// will generate when lowering accesses of the given type.
14687 unsigned
14688 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
14689                                              const DataLayout &DL) const {
14690   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
14691 }
14692 
14693 bool ARMTargetLowering::isLegalInterleavedAccessType(
14694     VectorType *VecTy, const DataLayout &DL) const {
14695 
14696   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
14697   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
14698 
14699   // Ensure the vector doesn't have f16 elements. Even though we could do an
14700   // i16 vldN, we can't hold the f16 vectors and will end up converting via
14701   // f32.
14702   if (VecTy->getElementType()->isHalfTy())
14703     return false;
14704 
14705   // Ensure the number of vector elements is greater than 1.
14706   if (VecTy->getNumElements() < 2)
14707     return false;
14708 
14709   // Ensure the element type is legal.
14710   if (ElSize != 8 && ElSize != 16 && ElSize != 32)
14711     return false;
14712 
14713   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
14714   // 128 will be split into multiple interleaved accesses.
14715   return VecSize == 64 || VecSize % 128 == 0;
14716 }
14717 
14718 /// Lower an interleaved load into a vldN intrinsic.
14719 ///
14720 /// E.g. Lower an interleaved load (Factor = 2):
14721 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
14722 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
14723 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
14724 ///
14725 ///      Into:
14726 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
14727 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
14728 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // All extraction shuffles share one result type; use the first as the
  // representative sub-vector type.
  VectorType *VecTy = Shuffles[0]->getType();
  Type *EltTy = VecTy->getVectorElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
    return false;

  // Number of vldN operations needed to cover the whole (possibly wide) type.
  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy =
        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = VectorType::get(VecTy->getVectorElementType(),
                            VecTy->getVectorNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                      LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
  Type *Tys[] = {VecTy, Int8Ptr};
  // Factor is 2..4 (asserted above), so Factor - 2 indexes this table.
  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                            Intrinsic::arm_neon_vld3,
                                            Intrinsic::arm_neon_vld4};
  Function *VldnFunc =
      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(
          BaseAddr, VecTy->getVectorNumElements() * Factor);

    SmallVector<Value *, 2> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
    Ops.push_back(Builder.getInt32(LI->getAlignment()));

    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                    VecTy->getVectorNumElements()));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
14835 
14836 /// Lower an interleaved store into a vstN intrinsic.
14837 ///
14838 /// E.g. Lower an interleaved store (Factor = 3):
14839 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
14840 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
14841 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
14842 ///
14843 ///      Into:
14844 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
14845 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
14846 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
14847 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
14848 ///
14849 /// Note that the new shufflevectors will be removed and we'll only generate one
14850 /// vst3 instruction in CodeGen.
14851 ///
14852 /// Example for a more general valid mask (Factor 3). Lower:
14853 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
14854 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
14855 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
14856 ///
14857 ///      Into:
14858 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
14859 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
14860 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
14861 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
14862 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
14863                                               ShuffleVectorInst *SVI,
14864                                               unsigned Factor) const {
14865   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
14866          "Invalid interleave factor");
14867 
14868   VectorType *VecTy = SVI->getType();
14869   assert(VecTy->getVectorNumElements() % Factor == 0 &&
14870          "Invalid interleaved store");
14871 
14872   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
14873   Type *EltTy = VecTy->getVectorElementType();
14874   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
14875 
14876   const DataLayout &DL = SI->getModule()->getDataLayout();
14877 
14878   // Skip if we do not have NEON and skip illegal vector types. We can
14879   // "legalize" wide vector types into multiple interleaved accesses as long as
14880   // the vector types are divisible by 128.
14881   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
14882     return false;
14883 
14884   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
14885 
14886   Value *Op0 = SVI->getOperand(0);
14887   Value *Op1 = SVI->getOperand(1);
14888   IRBuilder<> Builder(SI);
14889 
14890   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
14891   // vectors to integer vectors.
14892   if (EltTy->isPointerTy()) {
14893     Type *IntTy = DL.getIntPtrType(EltTy);
14894 
14895     // Convert to the corresponding integer vector.
14896     Type *IntVecTy =
14897         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
14898     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
14899     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
14900 
14901     SubVecTy = VectorType::get(IntTy, LaneLen);
14902   }
14903 
14904   // The base address of the store.
14905   Value *BaseAddr = SI->getPointerOperand();
14906 
14907   if (NumStores > 1) {
14908     // If we're going to generate more than one store, reset the lane length
14909     // and sub-vector type to something legal.
14910     LaneLen /= NumStores;
14911     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
14912 
14913     // We will compute the pointer operand of each store from the original base
14914     // address using GEPs. Cast the base address to a pointer to the scalar
14915     // element type.
14916     BaseAddr = Builder.CreateBitCast(
14917         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
14918                       SI->getPointerAddressSpace()));
14919   }
14920 
14921   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
14922 
14923   auto Mask = SVI->getShuffleMask();
14924 
14925   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
14926   Type *Tys[] = {Int8Ptr, SubVecTy};
14927   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
14928                                              Intrinsic::arm_neon_vst3,
14929                                              Intrinsic::arm_neon_vst4};
14930 
14931   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
14932     // If we generating more than one store, we compute the base address of
14933     // subsequent stores as an offset from the previous.
14934     if (StoreCount > 0)
14935       BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
14936 
14937     SmallVector<Value *, 6> Ops;
14938     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
14939 
14940     Function *VstNFunc =
14941         Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
14942 
14943     // Split the shufflevector operands into sub vectors for the new vstN call.
14944     for (unsigned i = 0; i < Factor; i++) {
14945       unsigned IdxI = StoreCount * LaneLen * Factor + i;
14946       if (Mask[IdxI] >= 0) {
14947         Ops.push_back(Builder.CreateShuffleVector(
14948             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
14949       } else {
14950         unsigned StartMask = 0;
14951         for (unsigned j = 1; j < LaneLen; j++) {
14952           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
14953           if (Mask[IdxJ * Factor + IdxI] >= 0) {
14954             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
14955             break;
14956           }
14957         }
14958         // Note: If all elements in a chunk are undefs, StartMask=0!
14959         // Note: Filling undef gaps with random elements is ok, since
14960         // those elements were being written anyway (with undefs).
14961         // In the case of all undefs we're defaulting to using elems from 0
14962         // Note: StartMask cannot be negative, it's checked in
14963         // isReInterleaveMask
14964         Ops.push_back(Builder.CreateShuffleVector(
14965             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
14966       }
14967     }
14968 
14969     Ops.push_back(Builder.getInt32(SI->getAlignment()));
14970     Builder.CreateCall(VstNFunc, Ops);
14971   }
14972   return true;
14973 }
14974 
// Base element classification for AAPCS-VFP homogeneous aggregates, used by
// isHomogeneousAggregate() below.
enum HABaseType {
  HA_UNKNOWN = 0, // No base type established yet.
  HA_FLOAT,       // float members.
  HA_DOUBLE,      // double members.
  HA_VECT64,      // 64-bit vector members.
  HA_VECT128      // 128-bit vector members.
};
14982 
/// Recursively classify \p Ty as a homogeneous aggregate: a struct/array
/// whose leaves are all float, all double, all 64-bit vectors or all 128-bit
/// vectors, with at most 4 members in total. \p Base tracks the leaf type
/// seen so far (HA_UNKNOWN initially) and \p Members accumulates the leaf
/// count. Returns true if \p Ty qualifies.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    // Every field must itself be homogeneous with the same base type.
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    // An array contributes element-count copies of its element's members.
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    // A float leaf is only compatible with an all-float aggregate.
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    // A double leaf is only compatible with an all-double aggregate.
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    // Vector leaves must all share one width (64 or 128 bits).
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getBitWidth() == 64;
    case HA_VECT128:
      return VT->getBitWidth() == 128;
    case HA_UNKNOWN:
      switch (VT->getBitWidth()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  // Note: any other leaf type (e.g. integers) falls through with Members
  // unchanged, so a lone scalar of another type yields Members == 0 here.
  return (Members > 0 && Members <= 4);
}
15033 
15034 /// Return the correct alignment for the current calling convention.
15035 unsigned
15036 ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
15037                                                  DataLayout DL) const {
15038   if (!ArgTy->isVectorTy())
15039     return DL.getABITypeAlignment(ArgTy);
15040 
15041   // Avoid over-aligning vector parameters. It would require realigning the
15042   // stack and waste space for no real benefit.
15043   return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
15044 }
15045 
15046 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
15047 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
15048 /// passing according to AAPCS rules.
15049 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
15050     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
15051   if (getEffectiveCallingConv(CallConv, isVarArg) !=
15052       CallingConv::ARM_AAPCS_VFP)
15053     return false;
15054 
15055   HABaseType Base = HA_UNKNOWN;
15056   uint64_t Members = 0;
15057   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
15058   LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
15059 
15060   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
15061   return IsHA || IsIntArray;
15062 }
15063 
/// Register carrying the exception pointer: R0, or no fixed register when
/// SjLj exception handling is in use.
unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
}
15070 
/// Register carrying the exception selector: R1, or no fixed register when
/// SjLj exception handling is in use.
unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
}
15077 
/// Record in the function's ARMFunctionInfo that callee-saved registers are
/// being split between the entry block and the exit blocks (CXX_FAST_TLS).
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
15083 
15084 void ARMTargetLowering::insertCopiesSplitCSR(
15085     MachineBasicBlock *Entry,
15086     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
15087   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
15088   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
15089   if (!IStart)
15090     return;
15091 
15092   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
15093   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
15094   MachineBasicBlock::iterator MBBI = Entry->begin();
15095   for (const MCPhysReg *I = IStart; *I; ++I) {
15096     const TargetRegisterClass *RC = nullptr;
15097     if (ARM::GPRRegClass.contains(*I))
15098       RC = &ARM::GPRRegClass;
15099     else if (ARM::DPRRegClass.contains(*I))
15100       RC = &ARM::DPRRegClass;
15101     else
15102       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
15103 
15104     unsigned NewVR = MRI->createVirtualRegister(RC);
15105     // Create copy from CSR to a virtual register.
15106     // FIXME: this currently does not emit CFI pseudo-instructions, it works
15107     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
15108     // nounwind. If we want to generalize this later, we may need to emit
15109     // CFI pseudo-instructions.
15110     assert(Entry->getParent()->getFunction().hasFnAttribute(
15111                Attribute::NoUnwind) &&
15112            "Function should be nounwind in insertCopiesSplitCSR!");
15113     Entry->addLiveIn(*I);
15114     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
15115         .addReg(*I);
15116 
15117     // Insert the copy-back instructions right before the terminator.
15118     for (auto *Exit : Exits)
15119       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
15120               TII->get(TargetOpcode::COPY), *I)
15121           .addReg(NewVR);
15122   }
15123 }
15124 
15125 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
15126   MF.getFrameInfo().computeMaxCallFrameSize(MF);
15127   TargetLoweringBase::finalizeLowering(MF);
15128 }
15129