1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the interfaces that VE uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "VEISelLowering.h"
15 #include "MCTargetDesc/VEMCExpr.h"
16 #include "VECustomDAG.h"
17 #include "VEInstrBuilder.h"
18 #include "VEMachineFunctionInfo.h"
19 #include "VERegisterInfo.h"
20 #include "VETargetMachine.h"
21 #include "llvm/ADT/StringSwitch.h"
22 #include "llvm/CodeGen/CallingConvLower.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineJumpTableInfo.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/SelectionDAG.h"
30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/KnownBits.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "ve-lower"
40 
41 //===----------------------------------------------------------------------===//
42 // Calling Convention Implementation
43 //===----------------------------------------------------------------------===//
44 
45 #include "VEGenCallingConv.inc"
46 
47 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
48   switch (CallConv) {
49   default:
50     return RetCC_VE_C;
51   case CallingConv::Fast:
52     return RetCC_VE_Fast;
53   }
54 }
55 
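// Note: CC_VE2 (generated from the calling-convention TableGen file) is the
// variant used for varargs calls; LowerCall below also runs it as a second
// analysis pass to obtain stack locations for arguments that must be passed
// both in registers and in memory.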
56 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
57   if (IsVarArg)
58     return CC_VE2;
59   switch (CallConv) {
60   default:
61     return CC_VE_C;
62   case CallingConv::Fast:
63     return CC_VE_Fast;
64   }
65 }
66 
67 bool VETargetLowering::CanLowerReturn(
68     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
69     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
70   CCAssignFn *RetCC = getReturnCC(CallConv);
71   SmallVector<CCValAssign, 16> RVLocs;
72   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
73   return CCInfo.CheckReturn(Outs, RetCC);
74 }
75 
76 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
77                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
78 
79 static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1};
80 
81 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
82 
83 void VETargetLowering::initRegisterClasses() {
84   // Set up the register classes.
85   addRegisterClass(MVT::i32, &VE::I32RegClass);
86   addRegisterClass(MVT::i64, &VE::I64RegClass);
87   addRegisterClass(MVT::f32, &VE::F32RegClass);
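  // Note that f64 deliberately shares the 64-bit scalar register class with
  // i64; VE has no separate f64 register class.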
88   addRegisterClass(MVT::f64, &VE::I64RegClass);
89   addRegisterClass(MVT::f128, &VE::F128RegClass);
90 
91   if (Subtarget->enableVPU()) {
92     for (MVT VecVT : AllVectorVTs)
93       addRegisterClass(VecVT, &VE::V64RegClass);
94     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
95     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
96   }
97 }
98 
99 void VETargetLowering::initSPUActions() {
100   const auto &TM = getTargetMachine();
101   /// Load & Store {
102 
103   // VE doesn't have i1 sign extending load.
104   for (MVT VT : MVT::integer_valuetypes()) {
105     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
106     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
107     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
108     setTruncStoreAction(VT, MVT::i1, Expand);
109   }
110 
111   // VE doesn't have floating point extload/truncstore, so expand them.
112   for (MVT FPVT : MVT::fp_valuetypes()) {
113     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
114       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
115       setTruncStoreAction(FPVT, OtherFPVT, Expand);
116     }
117   }
118 
  // VE doesn't have fp128 load/store, so handle them with custom lowering.
120   setOperationAction(ISD::LOAD, MVT::f128, Custom);
121   setOperationAction(ISD::STORE, MVT::f128, Custom);
122 
123   /// } Load & Store
124 
125   // Custom legalize address nodes into LO/HI parts.
126   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
127   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
128   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
129   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
130   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
131   setOperationAction(ISD::JumpTable, PtrVT, Custom);
132 
133   /// VAARG handling {
134   setOperationAction(ISD::VASTART, MVT::Other, Custom);
  // VAARG needs to be lowered to an access with 8-byte alignment.
136   setOperationAction(ISD::VAARG, MVT::Other, Custom);
137   // Use the default implementation.
138   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
139   setOperationAction(ISD::VAEND, MVT::Other, Expand);
140   /// } VAARG handling
141 
142   /// Stack {
143   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
144   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
145 
146   // Use the default implementation.
147   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
148   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
149   /// } Stack
150 
151   /// Branch {
152 
153   // VE doesn't have BRCOND
154   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
155 
156   // BR_JT is not implemented yet.
157   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
158 
159   /// } Branch
160 
161   /// Int Ops {
162   for (MVT IntVT : {MVT::i32, MVT::i64}) {
163     // VE has no REM or DIVREM operations.
164     setOperationAction(ISD::UREM, IntVT, Expand);
165     setOperationAction(ISD::SREM, IntVT, Expand);
166     setOperationAction(ISD::SDIVREM, IntVT, Expand);
167     setOperationAction(ISD::UDIVREM, IntVT, Expand);
168 
169     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
170     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
171     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
172     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
173 
174     // VE has no MULHU/S or U/SMUL_LOHI operations.
175     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
176     setOperationAction(ISD::MULHU, IntVT, Expand);
177     setOperationAction(ISD::MULHS, IntVT, Expand);
178     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
179     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
180 
181     // VE has no CTTZ, ROTL, ROTR operations.
182     setOperationAction(ISD::CTTZ, IntVT, Expand);
183     setOperationAction(ISD::ROTL, IntVT, Expand);
184     setOperationAction(ISD::ROTR, IntVT, Expand);
185 
    // VE has a 64-bit instruction which works as an i64 BSWAP operation.  This
    // instruction also works as an i32 BSWAP operation with an additional
    // parameter.  Use isel patterns to lower BSWAP.
189     setOperationAction(ISD::BSWAP, IntVT, Legal);
190 
    // VE has only 64-bit instructions which work as i64 BITREVERSE/CTLZ/CTPOP
    // operations.  Use isel patterns for i64 and promote for i32.
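    // (For i32, Promote means the operand is extended to i64, the 64-bit
    // instruction is used, and the result is truncated back to i32.)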
193     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
194     setOperationAction(ISD::BITREVERSE, IntVT, Act);
195     setOperationAction(ISD::CTLZ, IntVT, Act);
196     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
197     setOperationAction(ISD::CTPOP, IntVT, Act);
198 
    // VE has only 64-bit instructions which work as i64 AND/OR/XOR operations.
    // Use isel patterns for i64 and promote for i32.
201     setOperationAction(ISD::AND, IntVT, Act);
202     setOperationAction(ISD::OR, IntVT, Act);
203     setOperationAction(ISD::XOR, IntVT, Act);
204   }
205   /// } Int Ops
206 
207   /// Conversion {
  // VE doesn't have instructions for fp<->uint, so let LLVM expand them.
209   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
210   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
211   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
212   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
213 
214   // fp16 not supported
215   for (MVT FPVT : MVT::fp_valuetypes()) {
216     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
217     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
218   }
219   /// } Conversion
220 
221   /// Floating-point Ops {
222   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
223   ///       and fcmp.
224 
  // VE doesn't have the following floating-point operations.
226   for (MVT VT : MVT::fp_valuetypes()) {
227     setOperationAction(ISD::FNEG, VT, Expand);
228     setOperationAction(ISD::FREM, VT, Expand);
229   }
230 
231   // VE doesn't have fdiv of f128.
232   setOperationAction(ISD::FDIV, MVT::f128, Expand);
233 
234   for (MVT FPVT : {MVT::f32, MVT::f64}) {
    // f32 and f64 use ConstantFP.  f128 uses ConstantPool.
236     setOperationAction(ISD::ConstantFP, FPVT, Legal);
237   }
238   /// } Floating-point Ops
239 
240   /// Floating-point math functions {
241 
  // VE doesn't have the following floating-point math functions.
243   for (MVT VT : MVT::fp_valuetypes()) {
244     setOperationAction(ISD::FABS, VT, Expand);
245     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
246     setOperationAction(ISD::FCOS, VT, Expand);
247     setOperationAction(ISD::FSIN, VT, Expand);
248     setOperationAction(ISD::FSQRT, VT, Expand);
249   }
250 
251   /// } Floating-point math functions
252 
253   /// Atomic instructions {
254 
255   setMaxAtomicSizeInBitsSupported(64);
256   setMinCmpXchgSizeInBits(32);
257   setSupportsUnalignedAtomics(false);
258 
259   // Use custom inserter for ATOMIC_FENCE.
260   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
261 
262   // Other atomic instructions.
263   for (MVT VT : MVT::integer_valuetypes()) {
264     // Support i8/i16 atomic swap.
265     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
266 
267     // FIXME: Support "atmam" instructions.
268     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
269     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
270     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
271     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
272 
    // VE doesn't have the following instructions.
274     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
275     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
276     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
277     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
278     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
279     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
280     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
281     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
282   }
283 
284   /// } Atomic instructions
285 
286   /// SJLJ instructions {
287   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
288   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
289   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
290   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
291     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
292   /// } SJLJ instructions
293 
294   // Intrinsic instructions
295   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
296 }
297 
298 void VETargetLowering::initVPUActions() {
299   for (MVT LegalMaskVT : AllMaskVTs)
300     setOperationAction(ISD::BUILD_VECTOR, LegalMaskVT, Custom);
301 
302   for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR})
303     setOperationAction(Opc, MVT::v512i1, Custom);
304 
305   for (MVT LegalVecVT : AllVectorVTs) {
306     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
307     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
308     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
309     // Translate all vector instructions with legal element types to VVP_*
310     // nodes.
    // TODO We will custom-widen into VVP_* nodes in the future. While we are
    // building the infrastructure for this, we only do this for legal vector
    // VTs.
314 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
315   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
316 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
317   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
318     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, LegalVecVT, Custom);
319     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_STORE, LegalVecVT, Custom);
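    // The include below expands every VVPNodes.def entry through the macros
    // above; for instance, a hypothetical entry ADD_VVP_OP(VVP_ADD, ADD) would
    // expand to:
    //   setOperationAction(ISD::ADD, LegalVecVT, Custom);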
320 #include "VVPNodes.def"
321   }
322 
323   for (MVT LegalPackedVT : AllPackedVTs) {
324     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
325     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
326   }
327 
328   // vNt32, vNt64 ops (legal element types)
329   for (MVT VT : MVT::vector_valuetypes()) {
330     MVT ElemVT = VT.getVectorElementType();
331     unsigned ElemBits = ElemVT.getScalarSizeInBits();
332     if (ElemBits != 32 && ElemBits != 64)
333       continue;
334 
335     for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
336       setOperationAction(MemOpc, VT, Custom);
337 
338     const ISD::NodeType IntReductionOCs[] = {
339         ISD::VECREDUCE_ADD,  ISD::VECREDUCE_MUL,  ISD::VECREDUCE_AND,
340         ISD::VECREDUCE_OR,   ISD::VECREDUCE_XOR,  ISD::VECREDUCE_SMIN,
341         ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
342 
343     for (unsigned IntRedOpc : IntReductionOCs)
344       setOperationAction(IntRedOpc, VT, Custom);
345   }
346 }
347 
348 SDValue
349 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
350                               bool IsVarArg,
351                               const SmallVectorImpl<ISD::OutputArg> &Outs,
352                               const SmallVectorImpl<SDValue> &OutVals,
353                               const SDLoc &DL, SelectionDAG &DAG) const {
354   // CCValAssign - represent the assignment of the return value to locations.
355   SmallVector<CCValAssign, 16> RVLocs;
356 
357   // CCState - Info about the registers and stack slot.
358   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
359                  *DAG.getContext());
360 
361   // Analyze return values.
362   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
363 
364   SDValue Flag;
365   SmallVector<SDValue, 4> RetOps(1, Chain);
366 
367   // Copy the result values into the output registers.
368   for (unsigned i = 0; i != RVLocs.size(); ++i) {
369     CCValAssign &VA = RVLocs[i];
370     assert(VA.isRegLoc() && "Can only return in registers!");
371     assert(!VA.needsCustom() && "Unexpected custom lowering");
372     SDValue OutVal = OutVals[i];
373 
374     // Integer return values must be sign or zero extended by the callee.
375     switch (VA.getLocInfo()) {
376     case CCValAssign::Full:
377       break;
378     case CCValAssign::SExt:
379       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
380       break;
381     case CCValAssign::ZExt:
382       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
383       break;
384     case CCValAssign::AExt:
385       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
386       break;
387     case CCValAssign::BCvt: {
388       // Convert a float return value to i64 with padding.
389       //     63     31   0
390       //    +------+------+
391       //    | float|   0  |
392       //    +------+------+
393       assert(VA.getLocVT() == MVT::i64);
394       assert(VA.getValVT() == MVT::f32);
395       SDValue Undef = SDValue(
396           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
397       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
398       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
399                                           MVT::i64, Undef, OutVal, Sub_f32),
400                        0);
401       break;
402     }
403     default:
404       llvm_unreachable("Unknown loc info!");
405     }
406 
407     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
408 
409     // Guarantee that all emitted copies are stuck together with flags.
410     Flag = Chain.getValue(1);
411     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
412   }
413 
414   RetOps[0] = Chain; // Update chain.
415 
416   // Add the flag if we have it.
417   if (Flag.getNode())
418     RetOps.push_back(Flag);
419 
420   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
421 }
422 
423 SDValue VETargetLowering::LowerFormalArguments(
424     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
425     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
426     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
427   MachineFunction &MF = DAG.getMachineFunction();
428 
429   // Get the base offset of the incoming arguments stack space.
430   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
  // Get the size of the preserved arguments area (8 registers x 8 bytes).
432   unsigned ArgsPreserved = 64;
433 
434   // Analyze arguments according to CC_VE.
435   SmallVector<CCValAssign, 16> ArgLocs;
436   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
437                  *DAG.getContext());
438   // Allocate the preserved area first.
439   CCInfo.AllocateStack(ArgsPreserved, Align(8));
440   // We already allocated the preserved area, so the stack offset computed
441   // by CC_VE would be correct now.
442   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
443 
444   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
445     CCValAssign &VA = ArgLocs[i];
446     assert(!VA.needsCustom() && "Unexpected custom lowering");
447     if (VA.isRegLoc()) {
448       // This argument is passed in a register.
449       // All integer register arguments are promoted by the caller to i64.
450 
451       // Create a virtual register for the promoted live-in value.
452       Register VReg =
453           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
454       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
455 
456       // The caller promoted the argument, so insert an Assert?ext SDNode so we
457       // won't promote the value again in this function.
458       switch (VA.getLocInfo()) {
459       case CCValAssign::SExt:
460         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
461                           DAG.getValueType(VA.getValVT()));
462         break;
463       case CCValAssign::ZExt:
464         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
465                           DAG.getValueType(VA.getValVT()));
466         break;
467       case CCValAssign::BCvt: {
468         // Extract a float argument from i64 with padding.
469         //     63     31   0
470         //    +------+------+
471         //    | float|   0  |
472         //    +------+------+
473         assert(VA.getLocVT() == MVT::i64);
474         assert(VA.getValVT() == MVT::f32);
475         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
476         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
477                                          MVT::f32, Arg, Sub_f32),
478                       0);
479         break;
480       }
481       default:
482         break;
483       }
484 
485       // Truncate the register down to the argument type.
486       if (VA.isExtInLoc())
487         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
488 
489       InVals.push_back(Arg);
490       continue;
491     }
492 
493     // The registers are exhausted. This argument was passed on the stack.
494     assert(VA.isMemLoc());
    // The CC_VE* functions compute stack offsets relative to the beginning of
    // the arguments area at %fp + the size of the reserved area.
497     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
498     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
499 
    // Adjust the offset for a float argument by adding 4, since the argument
    // is stored in an 8-byte slot with the layout below.  LLVM generates a
    // 4-byte load instruction, so the offset needs adjusting here.  This
    // adjustment is required only in LowerFormalArguments.  In LowerCall, a
    // float argument is converted to i64 first and stored as 8 bytes of data,
    // as required by the ABI, so no adjustment is needed.
506     //    0      4
507     //    +------+------+
508     //    | empty| float|
509     //    +------+------+
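    // For example, if an f32 argument's 8-byte slot starts at offset 176, the
    // 4-byte load created below reads it from offset 180.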
510     if (VA.getValVT() == MVT::f32)
511       Offset += 4;
512 
513     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
514     InVals.push_back(
515         DAG.getLoad(VA.getValVT(), DL, Chain,
516                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
517                     MachinePointerInfo::getFixedStack(MF, FI)));
518   }
519 
520   if (!IsVarArg)
521     return Chain;
522 
  // This function takes variable arguments, some of which may have been passed
  // in the argument registers %s0-%s7.
525   //
526   // The va_start intrinsic needs to know the offset to the first variable
527   // argument.
528   // TODO: need to calculate offset correctly once we support f128.
529   unsigned ArgOffset = ArgLocs.size() * 8;
530   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
531   // Skip the reserved area at the top of stack.
532   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
533 
534   return Chain;
535 }
536 
537 // FIXME? Maybe this could be a TableGen attribute on some registers and
538 // this table could be generated automatically from RegInfo.
539 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
540                                              const MachineFunction &MF) const {
541   Register Reg = StringSwitch<Register>(RegName)
542                      .Case("sp", VE::SX11)    // Stack pointer
543                      .Case("fp", VE::SX9)     // Frame pointer
544                      .Case("sl", VE::SX8)     // Stack limit
545                      .Case("lr", VE::SX10)    // Link register
546                      .Case("tp", VE::SX14)    // Thread pointer
                     .Case("outer", VE::SX12) // Outer register
548                      .Case("info", VE::SX17)  // Info area register
549                      .Case("got", VE::SX15)   // Global offset table register
550                      .Case("plt", VE::SX16) // Procedure linkage table register
551                      .Default(0);
552 
553   if (Reg)
554     return Reg;
555 
556   report_fatal_error("Invalid register name global variable");
557 }
558 
559 //===----------------------------------------------------------------------===//
560 // TargetLowering Implementation
561 //===----------------------------------------------------------------------===//
562 
563 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
564                                     SmallVectorImpl<SDValue> &InVals) const {
565   SelectionDAG &DAG = CLI.DAG;
566   SDLoc DL = CLI.DL;
567   SDValue Chain = CLI.Chain;
568   auto PtrVT = getPointerTy(DAG.getDataLayout());
569 
570   // VE target does not yet support tail call optimization.
571   CLI.IsTailCall = false;
572 
573   // Get the base offset of the outgoing arguments stack space.
574   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
575   // Get the size of the preserved arguments area
576   unsigned ArgsPreserved = 8 * 8u;
577 
578   // Analyze operands of the call, assigning locations to each operand.
579   SmallVector<CCValAssign, 16> ArgLocs;
580   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
581                  *DAG.getContext());
582   // Allocate the preserved area first.
583   CCInfo.AllocateStack(ArgsPreserved, Align(8));
584   // We already allocated the preserved area, so the stack offset computed
585   // by CC_VE would be correct now.
586   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
587 
  // VE requires using both registers and the stack for varargs or unprototyped
  // functions.
590   bool UseBoth = CLI.IsVarArg;
591 
592   // Analyze operands again if it is required to store BOTH.
593   SmallVector<CCValAssign, 16> ArgLocs2;
594   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
595                   ArgLocs2, *DAG.getContext());
596   if (UseBoth)
597     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
598 
599   // Get the size of the outgoing arguments stack space requirement.
600   unsigned ArgsSize = CCInfo.getNextStackOffset();
601 
602   // Keep stack frames 16-byte aligned.
603   ArgsSize = alignTo(ArgsSize, 16);
604 
605   // Adjust the stack pointer to make room for the arguments.
606   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
607   // with more than 6 arguments.
608   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
609 
610   // Collect the set of registers to pass to the function and their values.
611   // This will be emitted as a sequence of CopyToReg nodes glued to the call
612   // instruction.
613   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
614 
  // Collect chains from all the memory operations that copy arguments to the
616   // stack. They must follow the stack pointer adjustment above and precede the
617   // call instruction itself.
618   SmallVector<SDValue, 8> MemOpChains;
619 
  // VE needs the address of the callee function in a register, so prepare to
  // copy it to SX12 here.
622 
623   // If the callee is a GlobalAddress node (quite common, every direct call is)
624   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
625   // Likewise ExternalSymbol -> TargetExternalSymbol.
626   SDValue Callee = CLI.Callee;
627 
628   bool IsPICCall = isPositionIndependent();
629 
630   // PC-relative references to external symbols should go through $stub.
631   // If so, we need to prepare GlobalBaseReg first.
632   const TargetMachine &TM = DAG.getTarget();
633   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
634   const GlobalValue *GV = nullptr;
635   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
636   if (CalleeG)
637     GV = CalleeG->getGlobal();
638   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
639   bool UsePlt = !Local;
640   MachineFunction &MF = DAG.getMachineFunction();
641 
  // Turn a GlobalAddress/ExternalSymbol node into a value node containing
  // its address here.
644   if (CalleeG) {
645     if (IsPICCall) {
646       if (UsePlt)
647         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
648       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
649       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
650     } else {
651       Callee =
652           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
653     }
654   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
655     if (IsPICCall) {
656       if (UsePlt)
657         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
658       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
659       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
660     } else {
661       Callee =
662           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
663     }
664   }
665 
666   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
667 
668   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
669     CCValAssign &VA = ArgLocs[i];
670     SDValue Arg = CLI.OutVals[i];
671 
672     // Promote the value if needed.
673     switch (VA.getLocInfo()) {
674     default:
675       llvm_unreachable("Unknown location info!");
676     case CCValAssign::Full:
677       break;
678     case CCValAssign::SExt:
679       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
680       break;
681     case CCValAssign::ZExt:
682       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
683       break;
684     case CCValAssign::AExt:
685       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
686       break;
687     case CCValAssign::BCvt: {
688       // Convert a float argument to i64 with padding.
689       //     63     31   0
690       //    +------+------+
691       //    | float|   0  |
692       //    +------+------+
693       assert(VA.getLocVT() == MVT::i64);
694       assert(VA.getValVT() == MVT::f32);
695       SDValue Undef = SDValue(
696           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
697       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
698       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
699                                        MVT::i64, Undef, Arg, Sub_f32),
700                     0);
701       break;
702     }
703     }
704 
705     if (VA.isRegLoc()) {
706       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
707       if (!UseBoth)
708         continue;
709       VA = ArgLocs2[i];
710     }
711 
712     assert(VA.isMemLoc());
713 
714     // Create a store off the stack pointer for this argument.
715     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
716     // The argument area starts at %fp/%sp + the size of reserved area.
717     SDValue PtrOff =
718         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
719     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
720     MemOpChains.push_back(
721         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
722   }
723 
724   // Emit all stores, make sure they occur before the call.
725   if (!MemOpChains.empty())
726     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
727 
728   // Build a sequence of CopyToReg nodes glued together with token chain and
729   // glue operands which copy the outgoing args into registers. The InGlue is
730   // necessary since all emitted instructions must be stuck together in order
731   // to pass the live physical registers.
732   SDValue InGlue;
733   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
734     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
735                              RegsToPass[i].second, InGlue);
736     InGlue = Chain.getValue(1);
737   }
738 
739   // Build the operands for the call instruction itself.
740   SmallVector<SDValue, 8> Ops;
741   Ops.push_back(Chain);
742   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
743     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
744                                   RegsToPass[i].second.getValueType()));
745 
746   // Add a register mask operand representing the call-preserved registers.
747   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
748   const uint32_t *Mask =
749       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
750   assert(Mask && "Missing call preserved mask for calling convention");
751   Ops.push_back(DAG.getRegisterMask(Mask));
752 
753   // Make sure the CopyToReg nodes are glued to the call instruction which
754   // consumes the registers.
755   if (InGlue.getNode())
756     Ops.push_back(InGlue);
757 
758   // Now the call itself.
759   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
760   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
761   InGlue = Chain.getValue(1);
762 
763   // Revert the stack pointer immediately after the call.
764   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
765                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
766   InGlue = Chain.getValue(1);
767 
768   // Now extract the return values. This is more or less the same as
769   // LowerFormalArguments.
770 
771   // Assign locations to each value returned by this call.
772   SmallVector<CCValAssign, 16> RVLocs;
773   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
774                  *DAG.getContext());
775 
776   // Set inreg flag manually for codegen generated library calls that
777   // return float.
778   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
779     CLI.Ins[0].Flags.setInReg();
780 
781   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
782 
783   // Copy all of the result registers out of their specified physreg.
784   for (unsigned i = 0; i != RVLocs.size(); ++i) {
785     CCValAssign &VA = RVLocs[i];
786     assert(!VA.needsCustom() && "Unexpected custom lowering");
787     Register Reg = VA.getLocReg();
788 
789     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
790     // reside in the same register in the high and low bits. Reuse the
791     // CopyFromReg previous node to avoid duplicate copies.
792     SDValue RV;
793     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
794       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
795         RV = Chain.getValue(0);
796 
797     // But usually we'll create a new CopyFromReg for a different register.
798     if (!RV.getNode()) {
799       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
800       Chain = RV.getValue(1);
801       InGlue = Chain.getValue(2);
802     }
803 
804     // The callee promoted the return value, so insert an Assert?ext SDNode so
805     // we won't promote the value again in this function.
806     switch (VA.getLocInfo()) {
807     case CCValAssign::SExt:
808       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
809                        DAG.getValueType(VA.getValVT()));
810       break;
811     case CCValAssign::ZExt:
812       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
813                        DAG.getValueType(VA.getValVT()));
814       break;
815     case CCValAssign::BCvt: {
816       // Extract a float return value from i64 with padding.
817       //     63     31   0
818       //    +------+------+
819       //    | float|   0  |
820       //    +------+------+
821       assert(VA.getLocVT() == MVT::i64);
822       assert(VA.getValVT() == MVT::f32);
823       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
824       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
825                                       MVT::f32, RV, Sub_f32),
826                    0);
827       break;
828     }
829     default:
830       break;
831     }
832 
833     // Truncate the register down to the return value type.
834     if (VA.isExtInLoc())
835       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
836 
837     InVals.push_back(RV);
838   }
839 
840   return Chain;
841 }
842 
843 bool VETargetLowering::isOffsetFoldingLegal(
844     const GlobalAddressSDNode *GA) const {
  // VE uses 64-bit addressing, so we need multiple instructions to generate
  // an address.  Folding an address with an offset increases the number of
  // instructions, so we disable it here.  Offsets will be folded in the DAG
  // combiner later if it is worth doing so.
849   return false;
850 }
851 
852 /// isFPImmLegal - Returns true if the target can instruction select the
853 /// specified FP immediate natively. If false, the legalizer will
854 /// materialize the FP immediate as a load from a constant pool.
855 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
856                                     bool ForCodeSize) const {
857   return VT == MVT::f32 || VT == MVT::f64;
858 }
859 
860 /// Determine if the target supports unaligned memory accesses.
861 ///
862 /// This function returns true if the target allows unaligned memory accesses
863 /// of the specified type in the given address space. If true, it also returns
864 /// whether the unaligned memory access is "fast" in the last argument by
865 /// reference. This is used, for example, in situations where an array
866 /// copy/move/set is converted to a sequence of store operations. Its use
867 /// helps to ensure that such replacements don't generate code that causes an
868 /// alignment error (trap) on the target machine.
869 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
870                                                       unsigned AddrSpace,
871                                                       Align A,
872                                                       MachineMemOperand::Flags,
873                                                       bool *Fast) const {
874   if (Fast) {
    // It's always fast on VE.
876     *Fast = true;
877   }
878   return true;
879 }
880 
881 VETargetLowering::VETargetLowering(const TargetMachine &TM,
882                                    const VESubtarget &STI)
883     : TargetLowering(TM), Subtarget(&STI) {
884   // Instructions which use registers as conditionals examine all the
885   // bits (as does the pseudo SELECT_CC expansion). I don't think it
886   // matters much whether it's ZeroOrOneBooleanContent, or
  // ZeroOrNegativeOneBooleanContent, so arbitrarily choose the former.
889   setBooleanContents(ZeroOrOneBooleanContent);
890   setBooleanVectorContents(ZeroOrOneBooleanContent);
891 
892   initRegisterClasses();
893   initSPUActions();
894   initVPUActions();
895 
896   setStackPointerRegisterToSaveRestore(VE::SX11);
897 
898   // We have target-specific dag combine patterns for the following nodes:
899   setTargetDAGCombine(ISD::TRUNCATE);
900 
901   // Set function alignment to 16 bytes
902   setMinFunctionAlignment(Align(16));
903 
  // VE stores all arguments with 8-byte alignment.
905   setMinStackArgumentAlignment(Align(8));
906 
907   computeRegisterProperties(Subtarget->getRegisterInfo());
908 }
909 
910 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
911 #define TARGET_NODE_CASE(NAME)                                                 \
912   case VEISD::NAME:                                                            \
913     return "VEISD::" #NAME;
914   switch ((VEISD::NodeType)Opcode) {
915   case VEISD::FIRST_NUMBER:
916     break;
917     TARGET_NODE_CASE(CALL)
918     TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
919     TARGET_NODE_CASE(EH_SJLJ_SETJMP)
920     TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
921     TARGET_NODE_CASE(GETFUNPLT)
922     TARGET_NODE_CASE(GETSTACKTOP)
923     TARGET_NODE_CASE(GETTLSADDR)
924     TARGET_NODE_CASE(GLOBAL_BASE_REG)
925     TARGET_NODE_CASE(Hi)
926     TARGET_NODE_CASE(Lo)
927     TARGET_NODE_CASE(MEMBARRIER)
928     TARGET_NODE_CASE(RET_FLAG)
929     TARGET_NODE_CASE(TS1AM)
930     TARGET_NODE_CASE(VEC_UNPACK_LO)
931     TARGET_NODE_CASE(VEC_UNPACK_HI)
932     TARGET_NODE_CASE(VEC_PACK)
933     TARGET_NODE_CASE(VEC_BROADCAST)
934     TARGET_NODE_CASE(REPL_I32)
935     TARGET_NODE_CASE(REPL_F32)
936 
937     TARGET_NODE_CASE(LEGALAVL)
938 
939     // Register the VVP_* SDNodes.
940 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
941 #include "VVPNodes.def"
942   }
943 #undef TARGET_NODE_CASE
944   return nullptr;
945 }
946 
947 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
948                                          EVT VT) const {
949   return MVT::i32;
950 }
951 
952 // Convert to a target node and set target flags.
953 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
954                                           SelectionDAG &DAG) const {
955   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
956     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
957                                       GA->getValueType(0), GA->getOffset(), TF);
958 
959   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
960     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
961                                      0, TF);
962 
963   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
964     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
965                                      CP->getAlign(), CP->getOffset(), TF);
966 
967   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
968     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
969                                        TF);
970 
971   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
972     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
973 
974   llvm_unreachable("Unhandled address SDNode");
975 }
976 
977 // Split Op into high and low parts according to HiTF and LoTF.
978 // Return an ADD node combining the parts.
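// For example, with VK_VE_HI32/VK_VE_LO32 this builds
// (add (VEISD::Hi sym@hi), (VEISD::Lo sym@lo)), which instruction selection is
// expected to match with a lea/and/lea.sl sequence (see makeAddress below).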
979 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
980                                        SelectionDAG &DAG) const {
981   SDLoc DL(Op);
982   EVT VT = Op.getValueType();
983   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
984   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
985   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
986 }
987 
988 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
989 // or ExternalSymbol SDNode.
990 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
991   SDLoc DL(Op);
992   EVT PtrVT = Op.getValueType();
993 
  // Handle PIC mode first. VE needs a GOT load for every variable!
995   if (isPositionIndependent()) {
996     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
997 
998     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
999         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
      // Create the following instructions for local-linkage PIC code.
1001       //     lea %reg, label@gotoff_lo
1002       //     and %reg, %reg, (32)0
1003       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
1004       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
1005                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1006       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1007       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1008     }
    // Create the following instructions for non-local-linkage PIC code.
1010     //     lea %reg, label@got_lo
1011     //     and %reg, %reg, (32)0
1012     //     lea.sl %reg, label@got_hi(%reg)
1013     //     ld %reg, (%reg, %got)
1014     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
1015                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
1016     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1017     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1018     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
1019                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
1020   }
1021 
1022   // This is one of the absolute code models.
1023   switch (getTargetMachine().getCodeModel()) {
1024   default:
1025     llvm_unreachable("Unsupported absolute code model");
1026   case CodeModel::Small:
1027   case CodeModel::Medium:
1028   case CodeModel::Large:
1029     // abs64.
1030     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1031   }
1032 }
1033 
1034 /// Custom Lower {
1035 
// The mappings for emitLeading/TrailingFence for VE are designed following
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
1038 Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
1039                                                 Instruction *Inst,
1040                                                 AtomicOrdering Ord) const {
1041   switch (Ord) {
1042   case AtomicOrdering::NotAtomic:
1043   case AtomicOrdering::Unordered:
1044     llvm_unreachable("Invalid fence: unordered/non-atomic");
1045   case AtomicOrdering::Monotonic:
1046   case AtomicOrdering::Acquire:
1047     return nullptr; // Nothing to do
1048   case AtomicOrdering::Release:
1049   case AtomicOrdering::AcquireRelease:
1050     return Builder.CreateFence(AtomicOrdering::Release);
1051   case AtomicOrdering::SequentiallyConsistent:
1052     if (!Inst->hasAtomicStore())
1053       return nullptr; // Nothing to do
1054     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1055   }
1056   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
1057 }
1058 
1059 Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
1060                                                  Instruction *Inst,
1061                                                  AtomicOrdering Ord) const {
1062   switch (Ord) {
1063   case AtomicOrdering::NotAtomic:
1064   case AtomicOrdering::Unordered:
1065     llvm_unreachable("Invalid fence: unordered/not-atomic");
1066   case AtomicOrdering::Monotonic:
1067   case AtomicOrdering::Release:
1068     return nullptr; // Nothing to do
1069   case AtomicOrdering::Acquire:
1070   case AtomicOrdering::AcquireRelease:
1071     return Builder.CreateFence(AtomicOrdering::Acquire);
1072   case AtomicOrdering::SequentiallyConsistent:
1073     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1074   }
1075   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
1076 }
1077 
1078 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
1079                                             SelectionDAG &DAG) const {
1080   SDLoc DL(Op);
1081   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
1082       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
1083   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
1084       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
1085 
  // VE uses release consistency, so we need a fence instruction if it is a
  // cross-thread fence.
1088   if (FenceSSID == SyncScope::System) {
1089     switch (FenceOrdering) {
1090     case AtomicOrdering::NotAtomic:
1091     case AtomicOrdering::Unordered:
1092     case AtomicOrdering::Monotonic:
1093       // No need to generate fencem instruction here.
1094       break;
1095     case AtomicOrdering::Acquire:
1096       // Generate "fencem 2" as acquire fence.
1097       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1098                                         DAG.getTargetConstant(2, DL, MVT::i32),
1099                                         Op.getOperand(0)),
1100                      0);
1101     case AtomicOrdering::Release:
1102       // Generate "fencem 1" as release fence.
1103       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1104                                         DAG.getTargetConstant(1, DL, MVT::i32),
1105                                         Op.getOperand(0)),
1106                      0);
1107     case AtomicOrdering::AcquireRelease:
1108     case AtomicOrdering::SequentiallyConsistent:
1109       // Generate "fencem 3" as acq_rel and seq_cst fence.
      // FIXME: "fencem 3" doesn't wait for PCIe device accesses, so seq_cst
      //        may require more instructions for them.
1112       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1113                                         DAG.getTargetConstant(3, DL, MVT::i32),
1114                                         Op.getOperand(0)),
1115                      0);
1116     }
1117   }
1118 
1119   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
1120   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
1121 }
1122 
1123 TargetLowering::AtomicExpansionKind
1124 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // We have a TS1AM implementation for i8/i16/i32/i64, so use it.
1126   if (AI->getOperation() == AtomicRMWInst::Xchg) {
1127     return AtomicExpansionKind::None;
1128   }
1129   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
1130 
  // Otherwise, expand it using a compare-and-exchange instruction so that we
  // don't call the __sync_fetch_and_* functions.
1133   return AtomicExpansionKind::CmpXChg;
1134 }
1135 
1136 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
1137                             SDValue &Bits) {
1138   SDLoc DL(Op);
1139   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1140   SDValue Ptr = N->getOperand(1);
1141   SDValue Val = N->getOperand(2);
1142   EVT PtrVT = Ptr.getValueType();
1143   bool Byte = N->getMemoryVT() == MVT::i8;
1144   //   Remainder = AND Ptr, 3
1145   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
1146   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
1147   //   Bits = Remainder << 3
1148   //   NewVal = Val << Bits
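  // For example, for a 1-byte swap at an address with Ptr & 3 == 2:
  //   Remainder = 2, Flag = 1 << 2 = 4, Bits = 16, so NewVal carries Val in
  //   bits 23-16 of the 32-bit word that TS1AM operates on.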
1149   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
1150   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
1151   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
1152                       : DAG.getConstant(3, DL, MVT::i32);
1153   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
1154   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
1155   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
1156 }
1157 
1158 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
1159                              SDValue Bits) {
1160   SDLoc DL(Op);
1161   EVT VT = Data.getValueType();
1162   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
1163   //   NewData = Data >> Bits
1164   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
1165   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
1166 
1167   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
1168   return DAG.getNode(ISD::AND, DL, VT,
1169                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
1170 }
1171 
1172 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
1173                                            SelectionDAG &DAG) const {
1174   SDLoc DL(Op);
1175   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1176 
1177   if (N->getMemoryVT() == MVT::i8) {
1178     // For i8, use "ts1am"
1179     //   Input:
1180     //     ATOMIC_SWAP Ptr, Val, Order
1181     //
1182     //   Output:
1183     //     Remainder = AND Ptr, 3
1184     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
1185     //     Bits = Remainder << 3
1186     //     NewVal = Val << Bits
1187     //
1188     //     Aligned = AND Ptr, -4
1189     //     Data = TS1AM Aligned, Flag, NewVal
1190     //
1191     //     NewData = Data >> Bits
1192     //     Result = NewData & 0xff ; 1 byte result
1193     SDValue Flag;
1194     SDValue Bits;
1195     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1196 
1197     SDValue Ptr = N->getOperand(1);
1198     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1199                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1200     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1201                                   DAG.getVTList(Op.getNode()->getValueType(0),
1202                                                 Op.getNode()->getValueType(1)),
1203                                   {N->getChain(), Aligned, Flag, NewVal},
1204                                   N->getMemOperand());
1205 
1206     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1207     SDValue Chain = TS1AM.getValue(1);
1208     return DAG.getMergeValues({Result, Chain}, DL);
1209   }
1210   if (N->getMemoryVT() == MVT::i16) {
1211     // For i16, use "ts1am"
1212     SDValue Flag;
1213     SDValue Bits;
1214     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1215 
1216     SDValue Ptr = N->getOperand(1);
1217     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1218                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1219     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1220                                   DAG.getVTList(Op.getNode()->getValueType(0),
1221                                                 Op.getNode()->getValueType(1)),
1222                                   {N->getChain(), Aligned, Flag, NewVal},
1223                                   N->getMemOperand());
1224 
1225     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1226     SDValue Chain = TS1AM.getValue(1);
1227     return DAG.getMergeValues({Result, Chain}, DL);
1228   }
1229   // Otherwise, let llvm legalize it.
1230   return Op;
1231 }
1232 
1233 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
1234                                              SelectionDAG &DAG) const {
1235   return makeAddress(Op, DAG);
1236 }
1237 
1238 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
1239                                             SelectionDAG &DAG) const {
1240   return makeAddress(Op, DAG);
1241 }
1242 
1243 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
1244                                             SelectionDAG &DAG) const {
1245   return makeAddress(Op, DAG);
1246 }
1247 
1248 SDValue
1249 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
1250                                                 SelectionDAG &DAG) const {
1251   SDLoc DL(Op);
1252 
1253   // Generate the following code:
1254   //   t1: ch,glue = callseq_start t0, 0, 0
1255   //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
1256   //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
1257   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
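  // GETTLSADDR is expected to be lowered to a call to __tls_get_addr with the
  // result returned in %s0; hence the call sequence below and the CopyFromReg
  // from VE::SX0.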
1258   SDValue Label = withTargetFlags(Op, 0, DAG);
1259   EVT PtrVT = Op.getValueType();
1260 
1261   // Lowering the machine isd will make sure everything is in the right
1262   // location.
1263   SDValue Chain = DAG.getEntryNode();
1264   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1265   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
1266       DAG.getMachineFunction(), CallingConv::C);
1267   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
1268   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
1269   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
1270   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
1271                              DAG.getIntPtrConstant(0, DL, true),
1272                              Chain.getValue(1), DL);
1273   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
1274 
  // GETTLSADDR will be codegen'ed as a call. Inform MFI that the function has
  // calls.
1276   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1277   MFI.setHasCalls(true);
1278 
1279   // Also generate code to prepare a GOT register if it is PIC.
1280   if (isPositionIndependent()) {
1281     MachineFunction &MF = DAG.getMachineFunction();
1282     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
1283   }
1284 
1285   return Chain;
1286 }
1287 
1288 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
1289                                                 SelectionDAG &DAG) const {
1290   // The current implementation of nld (2.26) doesn't allow local exec model
1291   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
1292   // generate the general dynamic model code sequence.
1293   //
1294   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
1295   return lowerToTLSGeneralDynamicModel(Op, DAG);
1296 }
1297 
1298 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
1299   return makeAddress(Op, DAG);
1300 }
1301 
// Lower an f128 load into two f64 loads.
1303 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
1304   SDLoc DL(Op);
1305   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1306   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1307   unsigned Alignment = LdNode->getAlign().value();
1308   if (Alignment > 8)
1309     Alignment = 8;
1310 
1311   SDValue Lo64 =
1312       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
1313                   LdNode->getPointerInfo(), Alignment,
1314                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1315                                        : MachineMemOperand::MONone);
1316   EVT AddrVT = LdNode->getBasePtr().getValueType();
1317   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
1318                               DAG.getConstant(8, DL, AddrVT));
1319   SDValue Hi64 =
1320       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
1321                   LdNode->getPointerInfo(), Alignment,
1322                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1323                                        : MachineMemOperand::MONone);
1324 
1325   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1326   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1327 
1328   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1329   SDNode *InFP128 =
1330       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
1331   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1332                                SDValue(InFP128, 0), Hi64, SubRegEven);
1333   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1334                                SDValue(InFP128, 0), Lo64, SubRegOdd);
1335   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
1336                           SDValue(Hi64.getNode(), 1)};
1337   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1338   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
1339   return DAG.getMergeValues(Ops, DL);
1340 }
1341 
1342 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1343   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
1344 
1345   EVT MemVT = LdNode->getMemoryVT();
1346 
1347   // Dispatch to vector isel.
1348   if (MemVT.isVector() && !isMaskType(MemVT))
1349     return lowerToVVP(Op, DAG);
1350 
1351   SDValue BasePtr = LdNode->getBasePtr();
1352   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1353     // Do not expand a load instruction with a frame index here because of
1354     // dependency problems.  We expand it later in eliminateFrameIndex().
1355     return Op;
1356   }
1357 
1358   if (MemVT == MVT::f128)
1359     return lowerLoadF128(Op, DAG);
1360 
1361   return Op;
1362 }
1363 
1364 // Lower a f128 store into two f64 stores.
1365 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
1366   SDLoc DL(Op);
1367   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1368   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1369 
1370   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1371   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1372 
1373   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1374                                     StNode->getValue(), SubRegEven);
1375   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1376                                     StNode->getValue(), SubRegOdd);
1377 
1378   unsigned Alignment = StNode->getAlign().value();
1379   if (Alignment > 8)
1380     Alignment = 8;
1381 
1382   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1383   SDValue OutChains[2];
1384   OutChains[0] =
1385       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
1386                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
1387                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1388                                         : MachineMemOperand::MONone);
1389   EVT AddrVT = StNode->getBasePtr().getValueType();
1390   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
1391                               DAG.getConstant(8, DL, AddrVT));
1392   OutChains[1] =
1393       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
1394                    MachinePointerInfo(), Alignment,
1395                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1396                                         : MachineMemOperand::MONone);
1397   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1398 }
1399 
1400 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1401   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
1402   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1403 
1404   // Always expand non-mask vector stores to VVP.
1405   EVT MemVT = StNode->getMemoryVT();
1406   if (MemVT.isVector() && !isMaskType(MemVT))
1407     return lowerToVVP(Op, DAG);
1408 
1409   SDValue BasePtr = StNode->getBasePtr();
1410   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1411     // Do not expand store instruction with frame index here because of
1412     // dependency problems.  We expand it later in eliminateFrameIndex().
1413     return Op;
1414   }
1415 
1416   if (MemVT == MVT::f128)
1417     return lowerStoreF128(Op, DAG);
1418 
1419   // Otherwise, ask llvm to expand it.
1420   return SDValue();
1421 }
1422 
1423 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
1424   MachineFunction &MF = DAG.getMachineFunction();
1425   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
1426   auto PtrVT = getPointerTy(DAG.getDataLayout());
1427 
1428   // Need frame address to find the address of VarArgsFrameIndex.
1429   MF.getFrameInfo().setFrameAddressIsTaken(true);
1430 
1431   // vastart just stores the address of the VarArgsFrameIndex slot into the
1432   // memory location argument.
1433   SDLoc DL(Op);
1434   SDValue Offset =
1435       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
1436                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
1437   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
1438   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
1439                       MachinePointerInfo(SV));
1440 }
1441 
1442 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
1443   SDNode *Node = Op.getNode();
1444   EVT VT = Node->getValueType(0);
1445   SDValue InChain = Node->getOperand(0);
1446   SDValue VAListPtr = Node->getOperand(1);
1447   EVT PtrVT = VAListPtr.getValueType();
1448   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
1449   SDLoc DL(Node);
1450   SDValue VAList =
1451       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
1452   SDValue Chain = VAList.getValue(1);
1453   SDValue NextPtr;
1454 
1455   if (VT == MVT::f128) {
1456     // VE f128 values must be stored with 16-byte alignment.  We don't
1457     // know the actual alignment of VAList, so we enforce the alignment
1458     // dynamically.
1459     int Align = 16;
1460     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1461                          DAG.getConstant(Align - 1, DL, PtrVT));
1462     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
1463                          DAG.getConstant(-Align, DL, PtrVT));
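    // For example (purely illustrative), a VAList of 0x...08 becomes 0x...17
    // after adding 15 and rounds down to 0x...10 after masking with -16.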
1464     // Increment the pointer, VAList, by 16 to the next vaarg.
1465     NextPtr =
1466         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
1467   } else if (VT == MVT::f32) {
1468     // float --> need special handling like below.
1469     //    0      4
1470     //    +------+------+
1471     //    | empty| float|
1472     //    +------+------+
1473     // Increment the pointer, VAList, by 8 to the next vaarg.
1474     NextPtr =
1475         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1476     // Then, adjust VAList.
1477     unsigned InternalOffset = 4;
1478     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1479                          DAG.getConstant(InternalOffset, DL, PtrVT));
1480   } else {
1481     // Increment the pointer, VAList, by 8 to the next vaarg.
1482     NextPtr =
1483         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1484   }
1485 
1486   // Store the incremented VAList to the legalized pointer.
1487   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
1488 
1489   // Load the actual argument out of the pointer VAList.
1490   // We can't count on greater alignment than the word size.
1491   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
1492                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
1493 }
1494 
1495 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
1496                                                   SelectionDAG &DAG) const {
1497   // Generate the following code:
1498   //   (void)__ve_grow_stack(size);  // or __ve_grow_stack_align(size, -align)
1499   //   ret = GETSTACKTOP;            // pseudo instruction
1500   SDLoc DL(Op);
1501 
1502   // Get the inputs.
1503   SDNode *Node = Op.getNode();
1504   SDValue Chain = Op.getOperand(0);
1505   SDValue Size = Op.getOperand(1);
1506   MaybeAlign Alignment(Op.getConstantOperandVal(2));
1507   EVT VT = Node->getValueType(0);
1508 
1509   // Chain the dynamic stack allocation so that it doesn't modify the stack
1510   // pointer when other instructions are using the stack.
1511   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
1512 
1513   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
1514   Align StackAlign = TFI.getStackAlign();
1515   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
1516 
1517   // Prepare arguments
1518   TargetLowering::ArgListTy Args;
1519   TargetLowering::ArgListEntry Entry;
1520   Entry.Node = Size;
1521   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1522   Args.push_back(Entry);
1523   if (NeedsAlign) {
1524     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
1525     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1526     Args.push_back(Entry);
1527   }
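  // Note that the aligned variant receives the negated alignment mask, i.e.
  // ~(align - 1), as its second argument (prepared above).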
1528   Type *RetTy = Type::getVoidTy(*DAG.getContext());
1529 
1530   EVT PtrVT = Op.getValueType();
1531   SDValue Callee;
1532   if (NeedsAlign) {
1533     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
1534   } else {
1535     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
1536   }
1537 
1538   TargetLowering::CallLoweringInfo CLI(DAG);
1539   CLI.setDebugLoc(DL)
1540       .setChain(Chain)
1541       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
1542       .setDiscardResult(true);
1543   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
1544   Chain = pair.second;
1545   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
1546   if (NeedsAlign) {
1547     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
1548                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
1549     Result = DAG.getNode(ISD::AND, DL, VT, Result,
1550                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
1551   }
1552   //  Chain = Result.getValue(1);
1553   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
1554                              DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
1555 
1556   SDValue Ops[2] = {Result, Chain};
1557   return DAG.getMergeValues(Ops, DL);
1558 }
1559 
1560 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
1561                                                SelectionDAG &DAG) const {
1562   SDLoc DL(Op);
1563   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
1564                      Op.getOperand(1));
1565 }
1566 
1567 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
1568                                               SelectionDAG &DAG) const {
1569   SDLoc DL(Op);
1570   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
1571                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
1572                      Op.getOperand(1));
1573 }
1574 
1575 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
1576                                                       SelectionDAG &DAG) const {
1577   SDLoc DL(Op);
1578   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
1579                      Op.getOperand(0));
1580 }
1581 
1582 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
1583                               const VETargetLowering &TLI,
1584                               const VESubtarget *Subtarget) {
1585   SDLoc DL(Op);
1586   MachineFunction &MF = DAG.getMachineFunction();
1587   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
1588 
1589   MachineFrameInfo &MFI = MF.getFrameInfo();
1590   MFI.setFrameAddressIsTaken(true);
1591 
1592   unsigned Depth = Op.getConstantOperandVal(0);
1593   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
1594   Register FrameReg = RegInfo->getFrameRegister(MF);
1595   SDValue FrameAddr =
1596       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
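  // For a non-zero depth, walk up the frame chain: each saved frame pointer is
  // loaded from offset 0 of the current frame address.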
1597   while (Depth--)
1598     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
1599                             FrameAddr, MachinePointerInfo());
1600   return FrameAddr;
1601 }
1602 
1603 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
1604                                const VETargetLowering &TLI,
1605                                const VESubtarget *Subtarget) {
1606   MachineFunction &MF = DAG.getMachineFunction();
1607   MachineFrameInfo &MFI = MF.getFrameInfo();
1608   MFI.setReturnAddressIsTaken(true);
1609 
1610   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
1611     return SDValue();
1612 
1613   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
1614 
1615   SDLoc DL(Op);
1616   EVT VT = Op.getValueType();
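  // The return address is loaded from offset 8 of the frame address computed
  // above.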
1617   SDValue Offset = DAG.getConstant(8, DL, VT);
1618   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1619                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
1620                      MachinePointerInfo());
1621 }
1622 
1623 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
1624                                                   SelectionDAG &DAG) const {
1625   SDLoc DL(Op);
1626   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1627   switch (IntNo) {
1628   default: // Don't custom lower most intrinsics.
1629     return SDValue();
1630   case Intrinsic::eh_sjlj_lsda: {
1631     MachineFunction &MF = DAG.getMachineFunction();
1632     MVT VT = Op.getSimpleValueType();
1633     const VETargetMachine *TM =
1634         static_cast<const VETargetMachine *>(&DAG.getTarget());
1635 
1636     // Create GCC_except_tableXX string.  The real symbol for that will be
1637     // generated in EHStreamer::emitExceptionTable() later.  So, we just
1638     // borrow its name here.
1639     TM->getStrList()->push_back(std::string(
1640         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
1641     SDValue Addr =
1642         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
1643     if (isPositionIndependent()) {
1644       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
1645                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1646       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
1647       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
1648     }
1649     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1650   }
1651   }
1652 }
1653 
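/// Return true if \p N is a BUILD_VECTOR node with exactly one non-undef
/// operand, and store that operand's index in \p UniqueIdx.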
1654 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
1655   if (!isa<BuildVectorSDNode>(N))
1656     return false;
1657   const auto *BVN = cast<BuildVectorSDNode>(N);
1658 
1659   // Find first non-undef insertion.
1660   unsigned Idx;
1661   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
1662     auto ElemV = BVN->getOperand(Idx);
1663     if (!ElemV->isUndef())
1664       break;
1665   }
1666   // Catch the (hypothetical) all-undef case.
1667   if (Idx == BVN->getNumOperands())
1668     return false;
1669   // Remember insertion.
1670   UniqueIdx = Idx++;
1671   // Verify that all other insertions are undef.
1672   for (; Idx < BVN->getNumOperands(); ++Idx) {
1673     auto ElemV = BVN->getOperand(Idx);
1674     if (!ElemV->isUndef())
1675       return false;
1676   }
1677   return true;
1678 }
1679 
1680 static SDValue getSplatValue(SDNode *N) {
1681   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
1682     return BuildVec->getSplatValue();
1683   }
1684   return SDValue();
1685 }
1686 
1687 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
1688                                             SelectionDAG &DAG) const {
1689   VECustomDAG CDAG(DAG, Op);
1690   MVT ResultVT = Op.getSimpleValueType();
1691 
1692   // If there is just one element, expand to INSERT_VECTOR_ELT.
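  // For example, (build_vector undef, %x, undef, undef) becomes
  // (insert_vector_elt undef, %x, 1).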
1693   unsigned UniqueIdx;
1694   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
1695     SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
1696     auto ElemV = Op->getOperand(UniqueIdx);
1697     SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
1698     return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
1699   }
1700 
1701   // Else emit a broadcast.
1702   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
1703     unsigned NumEls = ResultVT.getVectorNumElements();
1704     auto AVL = CDAG.getConstant(NumEls, MVT::i32);
1705     return CDAG.getBroadcast(ResultVT, ScalarV, AVL);
1706   }
1707 
1708   // Expand
1709   return SDValue();
1710 }
1711 
1712 TargetLowering::LegalizeAction
1713 VETargetLowering::getCustomOperationAction(SDNode &Op) const {
1714   // Custom legalization on VVP_* and VEC_* opcodes is required to pack-legalize
1715   // these operations (transform nodes such that their AVL parameter refers to
1716   // packs of 64 bits instead of the number of elements).
1717 
1718   // Packing opcodes are created with a pack-legal AVL (LEGALAVL). No need to
1719   // re-visit them.
1720   if (isPackingSupportOpcode(Op.getOpcode()))
1721     return Legal;
1722 
1723   // Custom lower to legalize AVL for packed mode.
1724   if (isVVPOrVEC(Op.getOpcode()))
1725     return Custom;
1726   return Legal;
1727 }
1728 
1729 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1730   LLVM_DEBUG(dbgs() << "::LowerOperation"; Op->print(dbgs()););
1731   unsigned Opcode = Op.getOpcode();
1732 
1733   /// Scalar isel.
1734   switch (Opcode) {
1735   case ISD::ATOMIC_FENCE:
1736     return lowerATOMIC_FENCE(Op, DAG);
1737   case ISD::ATOMIC_SWAP:
1738     return lowerATOMIC_SWAP(Op, DAG);
1739   case ISD::BlockAddress:
1740     return lowerBlockAddress(Op, DAG);
1741   case ISD::ConstantPool:
1742     return lowerConstantPool(Op, DAG);
1743   case ISD::DYNAMIC_STACKALLOC:
1744     return lowerDYNAMIC_STACKALLOC(Op, DAG);
1745   case ISD::EH_SJLJ_LONGJMP:
1746     return lowerEH_SJLJ_LONGJMP(Op, DAG);
1747   case ISD::EH_SJLJ_SETJMP:
1748     return lowerEH_SJLJ_SETJMP(Op, DAG);
1749   case ISD::EH_SJLJ_SETUP_DISPATCH:
1750     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
1751   case ISD::FRAMEADDR:
1752     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
1753   case ISD::GlobalAddress:
1754     return lowerGlobalAddress(Op, DAG);
1755   case ISD::GlobalTLSAddress:
1756     return lowerGlobalTLSAddress(Op, DAG);
1757   case ISD::INTRINSIC_WO_CHAIN:
1758     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
1759   case ISD::JumpTable:
1760     return lowerJumpTable(Op, DAG);
1761   case ISD::LOAD:
1762     return lowerLOAD(Op, DAG);
1763   case ISD::RETURNADDR:
1764     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
1765   case ISD::BUILD_VECTOR:
1766     return lowerBUILD_VECTOR(Op, DAG);
1767   case ISD::STORE:
1768     return lowerSTORE(Op, DAG);
1769   case ISD::VASTART:
1770     return lowerVASTART(Op, DAG);
1771   case ISD::VAARG:
1772     return lowerVAARG(Op, DAG);
1773 
1774   case ISD::INSERT_VECTOR_ELT:
1775     return lowerINSERT_VECTOR_ELT(Op, DAG);
1776   case ISD::EXTRACT_VECTOR_ELT:
1777     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
1778   }
1779 
1780   /// Vector isel.
1781   LLVM_DEBUG(dbgs() << "::LowerOperation_VVP"; Op->print(dbgs()););
1782   if (ISD::isVPOpcode(Opcode))
1783     return lowerToVVP(Op, DAG);
1784 
1785   switch (Opcode) {
1786   default:
1787     llvm_unreachable("Should not custom lower this!");
1788 
1789   // Legalize the AVL of this internal node.
1790   case VEISD::VEC_BROADCAST:
1791 #define ADD_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME:
1792 #include "VVPNodes.def"
1793     // AVL already legalized.
1794     if (getAnnotatedNodeAVL(Op).second)
1795       return Op;
1796     return legalizeInternalVectorOp(Op, DAG);
1797 
1798     // Translate into a VEC_*/VVP_* layer operation.
1799   case ISD::MLOAD:
1800   case ISD::MSTORE:
1801 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
1802 #include "VVPNodes.def"
1803     if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType()))
1804       return splitMaskArithmetic(Op, DAG);
1805     return lowerToVVP(Op, DAG);
1806   }
1807 }
1808 /// } Custom Lower
1809 
1810 void VETargetLowering::ReplaceNodeResults(SDNode *N,
1811                                           SmallVectorImpl<SDValue> &Results,
1812                                           SelectionDAG &DAG) const {
1813   switch (N->getOpcode()) {
1814   case ISD::ATOMIC_SWAP:
1815     // Let LLVM expand atomic swap instruction through LowerOperation.
1816     return;
1817   default:
1818     LLVM_DEBUG(N->dumpr(&DAG));
1819     llvm_unreachable("Do not know how to custom type legalize this operation!");
1820   }
1821 }
1822 
1823 /// JumpTable for VE.
1824 ///
1825 ///   VE cannot generate relocatable symbols in a jump table.  That is, it
1826 ///   cannot generate expressions that use symbols from both the text segment
1827 ///   and the data segment, like the following:
1828 ///             .4byte  .LBB0_2-.LJTI0_0
1829 ///   So, we instead generate the offset from the top of the function as a
1830 ///   custom label, like this:
1831 ///             .4byte  .LBB0_2-<function name>
1832 
1833 unsigned VETargetLowering::getJumpTableEncoding() const {
1834   // Use custom label for PIC.
1835   if (isPositionIndependent())
1836     return MachineJumpTableInfo::EK_Custom32;
1837 
1838   // Otherwise, use the normal jump table encoding heuristics.
1839   return TargetLowering::getJumpTableEncoding();
1840 }
1841 
1842 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
1843     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
1844     unsigned Uid, MCContext &Ctx) const {
1845   assert(isPositionIndependent());
1846 
1847   // Generate a custom label for PIC like below.
1848   //    .4byte  .LBB0_2-<function name>
1849   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
1850   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
1851   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
1852   return MCBinaryExpr::createSub(Value, Base, Ctx);
1853 }
1854 
1855 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
1856                                                    SelectionDAG &DAG) const {
1857   assert(isPositionIndependent());
1858   SDLoc DL(Table);
1859   Function *Function = &DAG.getMachineFunction().getFunction();
1860   assert(Function != nullptr);
1861   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
1862 
1863   // In the jump table, we have the following values in PIC mode:
1864   //    .4byte  .LBB0_2-<function name>
1865   // We need to add this value to the address of this function to resolve the
1866   // .LBB0_2 label correctly under PIC mode.  So, we want to generate the
1867   // following instructions:
1868   //     lea %reg, fun@gotoff_lo
1869   //     and %reg, %reg, (32)0
1870   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
1871   // In order to do so, we need to generate a correctly marked DAG node using
1872   // makeHiLoPair.
1873   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
1874   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
1875                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1876   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
1877   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
1878 }
1879 
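/// Materialize the address of \p TargetBB into a fresh I64 virtual register,
/// emitting the appropriate PIC or non-PIC lea/and/lea.sl sequence before
/// \p I, and return that register.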
1880 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
1881                                       MachineBasicBlock::iterator I,
1882                                       MachineBasicBlock *TargetBB,
1883                                       const DebugLoc &DL) const {
1884   MachineFunction *MF = MBB.getParent();
1885   MachineRegisterInfo &MRI = MF->getRegInfo();
1886   const VEInstrInfo *TII = Subtarget->getInstrInfo();
1887 
1888   const TargetRegisterClass *RC = &VE::I64RegClass;
1889   Register Tmp1 = MRI.createVirtualRegister(RC);
1890   Register Tmp2 = MRI.createVirtualRegister(RC);
1891   Register Result = MRI.createVirtualRegister(RC);
1892 
1893   if (isPositionIndependent()) {
1894     // Create the following instructions for local linkage PIC code.
1895     //     lea %Tmp1, TargetBB@gotoff_lo
1896     //     and %Tmp2, %Tmp1, (32)0
1897     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
1898     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1899         .addImm(0)
1900         .addImm(0)
1901         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
1902     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1903         .addReg(Tmp1, getKillRegState(true))
1904         .addImm(M0(32));
1905     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
1906         .addReg(VE::SX15)
1907         .addReg(Tmp2, getKillRegState(true))
1908         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
1909   } else {
1910     // Create the following instructions for non-PIC code.
1911     //     lea     %Tmp1, TargetBB@lo
1912     //     and     %Tmp2, %Tmp1, (32)0
1913     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
1914     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1915         .addImm(0)
1916         .addImm(0)
1917         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
1918     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1919         .addReg(Tmp1, getKillRegState(true))
1920         .addImm(M0(32));
1921     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
1922         .addReg(Tmp2, getKillRegState(true))
1923         .addImm(0)
1924         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
1925   }
1926   return Result;
1927 }
1928 
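/// Materialize the address of the external symbol \p Symbol into a fresh I64
/// virtual register before \p I and return that register.  The emitted
/// sequence depends on whether this is PIC code and whether the symbol is
/// local or a call target.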
1929 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
1930                                          MachineBasicBlock::iterator I,
1931                                          StringRef Symbol, const DebugLoc &DL,
1932                                          bool IsLocal = false,
1933                                          bool IsCall = false) const {
1934   MachineFunction *MF = MBB.getParent();
1935   MachineRegisterInfo &MRI = MF->getRegInfo();
1936   const VEInstrInfo *TII = Subtarget->getInstrInfo();
1937 
1938   const TargetRegisterClass *RC = &VE::I64RegClass;
1939   Register Result = MRI.createVirtualRegister(RC);
1940 
1941   if (isPositionIndependent()) {
1942     if (IsCall && !IsLocal) {
1943       // Create the following instructions for non-local linkage PIC function
1944       // calls.  These instructions use the IC and the magic number -24, so we
1945       // expand them in VEAsmPrinter.cpp from the GETFUNPLT pseudo instruction.
1946       //     lea %Reg, Symbol@plt_lo(-24)
1947       //     and %Reg, %Reg, (32)0
1948       //     sic %s16
1949       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
1950       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
1951           .addExternalSymbol("abort");
1952     } else if (IsLocal) {
1953       Register Tmp1 = MRI.createVirtualRegister(RC);
1954       Register Tmp2 = MRI.createVirtualRegister(RC);
1955       // Create the following instructions for local linkage PIC code.
1956       //     lea %Tmp1, Symbol@gotoff_lo
1957       //     and %Tmp2, %Tmp1, (32)0
1958       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
1959       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1960           .addImm(0)
1961           .addImm(0)
1962           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
1963       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1964           .addReg(Tmp1, getKillRegState(true))
1965           .addImm(M0(32));
1966       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
1967           .addReg(VE::SX15)
1968           .addReg(Tmp2, getKillRegState(true))
1969           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
1970     } else {
1971       Register Tmp1 = MRI.createVirtualRegister(RC);
1972       Register Tmp2 = MRI.createVirtualRegister(RC);
1973       // Create the following instructions for non-local linkage PIC code.
1974       //     lea %Tmp1, Symbol@got_lo
1975       //     and %Tmp2, %Tmp1, (32)0
1976       //     lea.sl %Tmp3, Symbol@got_hi(%Tmp2, %s15) ; %s15 is GOT
1977       //     ld %Result, 0(%Tmp3)
1978       Register Tmp3 = MRI.createVirtualRegister(RC);
1979       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1980           .addImm(0)
1981           .addImm(0)
1982           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
1983       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1984           .addReg(Tmp1, getKillRegState(true))
1985           .addImm(M0(32));
1986       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
1987           .addReg(VE::SX15)
1988           .addReg(Tmp2, getKillRegState(true))
1989           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
1990       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
1991           .addReg(Tmp3, getKillRegState(true))
1992           .addImm(0)
1993           .addImm(0);
1994     }
1995   } else {
1996     Register Tmp1 = MRI.createVirtualRegister(RC);
1997     Register Tmp2 = MRI.createVirtualRegister(RC);
1998     // Create the following instructions for non-PIC code.
1999     //     lea     %Tmp1, Symbol@lo
2000     //     and     %Tmp2, %Tmp1, (32)0
2001     //     lea.sl  %Result, Symbol@hi(%Tmp2)
2002     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2003         .addImm(0)
2004         .addImm(0)
2005         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
2006     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2007         .addReg(Tmp1, getKillRegState(true))
2008         .addImm(M0(32));
2009     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
2010         .addReg(Tmp2, getKillRegState(true))
2011         .addImm(0)
2012         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
2013   }
2014   return Result;
2015 }
2016 
2017 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
2018                                               MachineBasicBlock *MBB,
2019                                               MachineBasicBlock *DispatchBB,
2020                                               int FI, int Offset) const {
2021   DebugLoc DL = MI.getDebugLoc();
2022   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2023 
2024   Register LabelReg =
2025       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
2026 
2027   // Store the address of DispatchBB into the given jmpbuf[1], which holds the
2028   // next IC that longjmp (throw) will branch to later.
2029   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2030   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
2031   MIB.addReg(LabelReg, getKillRegState(true));
2032 }
2033 
2034 MachineBasicBlock *
2035 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
2036                                    MachineBasicBlock *MBB) const {
2037   DebugLoc DL = MI.getDebugLoc();
2038   MachineFunction *MF = MBB->getParent();
2039   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2040   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
2041   MachineRegisterInfo &MRI = MF->getRegInfo();
2042 
2043   const BasicBlock *BB = MBB->getBasicBlock();
2044   MachineFunction::iterator I = ++MBB->getIterator();
2045 
2046   // Memory Reference.
2047   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2048                                            MI.memoperands_end());
2049   Register BufReg = MI.getOperand(1).getReg();
2050 
2051   Register DstReg;
2052 
2053   DstReg = MI.getOperand(0).getReg();
2054   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
2055   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
2056   (void)TRI;
2057   Register MainDestReg = MRI.createVirtualRegister(RC);
2058   Register RestoreDestReg = MRI.createVirtualRegister(RC);
2059 
2060   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate the following
2061   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
2062   //
2063   // ThisMBB:
2064   //   buf[3] = %s17 iff %s17 is used as BP
2065   //   buf[1] = RestoreMBB as IC after longjmp
2066   //   # SjLjSetup RestoreMBB
2067   //
2068   // MainMBB:
2069   //   v_main = 0
2070   //
2071   // SinkMBB:
2072   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
2073   //   ...
2074   //
2075   // RestoreMBB:
2076   //   %s17 = buf[3] iff %s17 is used as BP
2077   //   v_restore = 1
2078   //   goto SinkMBB
2079 
2080   MachineBasicBlock *ThisMBB = MBB;
2081   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
2082   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
2083   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
2084   MF->insert(I, MainMBB);
2085   MF->insert(I, SinkMBB);
2086   MF->push_back(RestoreMBB);
2087   RestoreMBB->setHasAddressTaken();
2088 
2089   // Transfer the remainder of BB and its successor edges to SinkMBB.
2090   SinkMBB->splice(SinkMBB->begin(), MBB,
2091                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2092   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
2093 
2094   // ThisMBB:
2095   Register LabelReg =
2096       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
2097 
2098   // Store BP in buf[3] iff this function is using BP.
2099   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
2100   if (TFI->hasBP(*MF)) {
2101     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2102     MIB.addReg(BufReg);
2103     MIB.addImm(0);
2104     MIB.addImm(24);
2105     MIB.addReg(VE::SX17);
2106     MIB.setMemRefs(MMOs);
2107   }
2108 
2109   // Store IP in buf[1].
2110   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2111   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
2112   MIB.addImm(0);
2113   MIB.addImm(8);
2114   MIB.addReg(LabelReg, getKillRegState(true));
2115   MIB.setMemRefs(MMOs);
2116 
2117   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
2118 
2119   // Insert setup.
2120   MIB =
2121       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
2122 
2123   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2124   MIB.addRegMask(RegInfo->getNoPreservedMask());
2125   ThisMBB->addSuccessor(MainMBB);
2126   ThisMBB->addSuccessor(RestoreMBB);
2127 
2128   // MainMBB:
2129   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
2130       .addImm(0)
2131       .addImm(0)
2132       .addImm(0);
2133   MainMBB->addSuccessor(SinkMBB);
2134 
2135   // SinkMBB:
2136   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
2137       .addReg(MainDestReg)
2138       .addMBB(MainMBB)
2139       .addReg(RestoreDestReg)
2140       .addMBB(RestoreMBB);
2141 
2142   // RestoreMBB:
2143   // Restore BP from buf[3] iff this function is using BP.  The address of
2144   // buf is in SX10.
2145   // FIXME: Better to not use SX10 here
2146   if (TFI->hasBP(*MF)) {
2147     MachineInstrBuilder MIB =
2148         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
2149     MIB.addReg(VE::SX10);
2150     MIB.addImm(0);
2151     MIB.addImm(24);
2152     MIB.setMemRefs(MMOs);
2153   }
2154   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
2155       .addImm(0)
2156       .addImm(0)
2157       .addImm(1);
2158   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
2159   RestoreMBB->addSuccessor(SinkMBB);
2160 
2161   MI.eraseFromParent();
2162   return SinkMBB;
2163 }
2164 
2165 MachineBasicBlock *
2166 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
2167                                     MachineBasicBlock *MBB) const {
2168   DebugLoc DL = MI.getDebugLoc();
2169   MachineFunction *MF = MBB->getParent();
2170   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2171   MachineRegisterInfo &MRI = MF->getRegInfo();
2172 
2173   // Memory Reference.
2174   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2175                                            MI.memoperands_end());
2176   Register BufReg = MI.getOperand(0).getReg();
2177 
2178   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
2179   // Since FP is only updated here but NOT referenced, it's treated as a GPR.
2180   Register FP = VE::SX9;
2181   Register SP = VE::SX11;
2182 
2183   MachineInstrBuilder MIB;
2184 
2185   MachineBasicBlock *ThisMBB = MBB;
2186 
2187   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate the following instructions.
2188   //
2189   // ThisMBB:
2190   //   %fp = load buf[0]
2191   //   %jmp = load buf[1]
2192   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
2193   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
2194   //   jmp %jmp
2195 
2196   // Reload FP.
2197   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
2198   MIB.addReg(BufReg);
2199   MIB.addImm(0);
2200   MIB.addImm(0);
2201   MIB.setMemRefs(MMOs);
2202 
2203   // Reload IP.
2204   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
2205   MIB.addReg(BufReg);
2206   MIB.addImm(0);
2207   MIB.addImm(8);
2208   MIB.setMemRefs(MMOs);
2209 
2210   // Copy BufReg to SX10 for later use by setjmp's RestoreMBB.
2211   // FIXME: Better to not use SX10 here
2212   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
2213       .addReg(BufReg)
2214       .addImm(0);
2215 
2216   // Reload SP.
2217   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
2218   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
2219   MIB.addImm(0);
2220   MIB.addImm(16);
2221   MIB.setMemRefs(MMOs);
2222 
2223   // Jump.
2224   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
2225       .addReg(Tmp, getKillRegState(true))
2226       .addImm(0);
2227 
2228   MI.eraseFromParent();
2229   return ThisMBB;
2230 }
2231 
2232 MachineBasicBlock *
2233 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
2234                                         MachineBasicBlock *BB) const {
2235   DebugLoc DL = MI.getDebugLoc();
2236   MachineFunction *MF = BB->getParent();
2237   MachineFrameInfo &MFI = MF->getFrameInfo();
2238   MachineRegisterInfo &MRI = MF->getRegInfo();
2239   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2240   int FI = MFI.getFunctionContextIndex();
2241 
2242   // Get a mapping of the call site numbers to all of the landing pads they're
2243   // associated with.
2244   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
2245   unsigned MaxCSNum = 0;
2246   for (auto &MBB : *MF) {
2247     if (!MBB.isEHPad())
2248       continue;
2249 
2250     MCSymbol *Sym = nullptr;
2251     for (const auto &MI : MBB) {
2252       if (MI.isDebugInstr())
2253         continue;
2254 
2255       assert(MI.isEHLabel() && "expected EH_LABEL");
2256       Sym = MI.getOperand(0).getMCSymbol();
2257       break;
2258     }
2259 
2260     if (!MF->hasCallSiteLandingPad(Sym))
2261       continue;
2262 
2263     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
2264       CallSiteNumToLPad[CSI].push_back(&MBB);
2265       MaxCSNum = std::max(MaxCSNum, CSI);
2266     }
2267   }
2268 
2269   // Get an ordered list of the machine basic blocks for the jump table.
2270   std::vector<MachineBasicBlock *> LPadList;
2271   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
2272   LPadList.reserve(CallSiteNumToLPad.size());
2273 
2274   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
2275     for (auto &LP : CallSiteNumToLPad[CSI]) {
2276       LPadList.push_back(LP);
2277       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
2278     }
2279   }
2280 
2281   assert(!LPadList.empty() &&
2282          "No landing pad destinations for the dispatch jump table!");
2283 
2284   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
2285   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
2286   //
2287   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
2288   // First `i64` is callsite, so callsite is FI+8.
2289   static const int OffsetIC = 72;
2290   static const int OffsetCS = 8;
2291 
2292   // Create the MBBs for the dispatch code like following:
2293   //
2294   // ThisMBB:
2295   //   Prepare DispatchBB address and store it to buf[1].
2296   //   ...
2297   //
2298   // DispatchBB:
2299   //   %s15 = GETGOT iff isPositionIndependent
2300   //   %callsite = load callsite
2301   //   brgt.l.t #size of callsites, %callsite, DispContBB
2302   //
2303   // TrapBB:
2304   //   Call abort.
2305   //
2306   // DispContBB:
2307   //   %breg = address of jump table
2308   //   %pc = load and calculate next pc from %breg and %callsite
2309   //   jmp %pc
2310 
2311   // Shove the dispatch's address into the return slot in the function context.
2312   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
2313   DispatchBB->setIsEHPad(true);
2314 
2315   // TrapBB will cause a trap like `assert(0)`.
2316   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
2317   DispatchBB->addSuccessor(TrapBB);
2318 
2319   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
2320   DispatchBB->addSuccessor(DispContBB);
2321 
2322   // Insert MBBs.
2323   MF->push_back(DispatchBB);
2324   MF->push_back(DispContBB);
2325   MF->push_back(TrapBB);
2326 
2327   // Insert code to call abort in the TrapBB.
2328   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
2329                                  /* Local */ false, /* Call */ true);
2330   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
2331       .addReg(Abort, getKillRegState(true))
2332       .addImm(0)
2333       .addImm(0);
2334 
2335   // Insert code into the entry block that creates and registers the function
2336   // context.
2337   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
2338 
2339   // Create the jump table and associated information.
2340   unsigned JTE = getJumpTableEncoding();
2341   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
2342   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
2343 
2344   const VERegisterInfo &RI = TII->getRegisterInfo();
2345   // Add a register mask with no preserved registers.  This results in all
2346   // registers being marked as clobbered.
2347   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
2348       .addRegMask(RI.getNoPreservedMask());
2349 
2350   if (isPositionIndependent()) {
2351     // Force generating GETGOT, since the current implementation doesn't store
2352     // the GOT register.
2353     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
2354   }
2355 
2356   // IReg is used as an index in a memory operand and therefore can't be SP.
2357   const TargetRegisterClass *RC = &VE::I64RegClass;
2358   Register IReg = MRI.createVirtualRegister(RC);
2359   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
2360                     OffsetCS);
2361   if (LPadList.size() < 64) {
2362     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
2363         .addImm(VECC::CC_ILE)
2364         .addImm(LPadList.size())
2365         .addReg(IReg)
2366         .addMBB(TrapBB);
2367   } else {
2368     assert(LPadList.size() <= 0x7FFFFFFF && "Too many landing pads!");
2369     Register TmpReg = MRI.createVirtualRegister(RC);
2370     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
2371         .addImm(0)
2372         .addImm(0)
2373         .addImm(LPadList.size());
2374     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
2375         .addImm(VECC::CC_ILE)
2376         .addReg(TmpReg, getKillRegState(true))
2377         .addReg(IReg)
2378         .addMBB(TrapBB);
2379   }
2380 
2381   Register BReg = MRI.createVirtualRegister(RC);
2382   Register Tmp1 = MRI.createVirtualRegister(RC);
2383   Register Tmp2 = MRI.createVirtualRegister(RC);
2384 
2385   if (isPositionIndependent()) {
2386     // Create the following instructions for local linkage PIC code.
2387     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
2388     //     and    %Tmp2, %Tmp1, (32)0
2389     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2390     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2391         .addImm(0)
2392         .addImm(0)
2393         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
2394     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2395         .addReg(Tmp1, getKillRegState(true))
2396         .addImm(M0(32));
2397     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
2398         .addReg(VE::SX15)
2399         .addReg(Tmp2, getKillRegState(true))
2400         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
2401   } else {
2402     // Create the following instructions for non-PIC code.
2403     //     lea     %Tmp1, .LJTI0_0@lo
2404     //     and     %Tmp2, %Tmp1, (32)0
2405     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
2406     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2407         .addImm(0)
2408         .addImm(0)
2409         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
2410     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2411         .addReg(Tmp1, getKillRegState(true))
2412         .addImm(M0(32));
2413     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
2414         .addReg(Tmp2, getKillRegState(true))
2415         .addImm(0)
2416         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
2417   }
2418 
2419   switch (JTE) {
2420   case MachineJumpTableInfo::EK_BlockAddress: {
2421     // Generate simple block address code for no-PIC model.
2422     //     sll %Tmp1, %IReg, 3
2423     //     lds %TReg, 0(%Tmp1, %BReg)
2424     //     bcfla %TReg
2425 
2426     Register TReg = MRI.createVirtualRegister(RC);
2427     Register Tmp1 = MRI.createVirtualRegister(RC);
2428 
2429     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2430         .addReg(IReg, getKillRegState(true))
2431         .addImm(3);
2432     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
2433         .addReg(BReg, getKillRegState(true))
2434         .addReg(Tmp1, getKillRegState(true))
2435         .addImm(0);
2436     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2437         .addReg(TReg, getKillRegState(true))
2438         .addImm(0);
2439     break;
2440   }
2441   case MachineJumpTableInfo::EK_Custom32: {
2442     // Generate block address code using differences from the function pointer
2443     // for PIC model.
2444     //     sll %Tmp1, %IReg, 2
2445     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
2446     //     Prepare function address in BReg2.
2447     //     adds.l %TReg, %BReg2, %OReg
2448     //     bcfla %TReg
2449 
2450     assert(isPositionIndependent());
2451     Register OReg = MRI.createVirtualRegister(RC);
2452     Register TReg = MRI.createVirtualRegister(RC);
2453     Register Tmp1 = MRI.createVirtualRegister(RC);
2454 
2455     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2456         .addReg(IReg, getKillRegState(true))
2457         .addImm(2);
2458     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
2459         .addReg(BReg, getKillRegState(true))
2460         .addReg(Tmp1, getKillRegState(true))
2461         .addImm(0);
2462     Register BReg2 =
2463         prepareSymbol(*DispContBB, DispContBB->end(),
2464                       DispContBB->getParent()->getName(), DL, /* Local */ true);
2465     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
2466         .addReg(OReg, getKillRegState(true))
2467         .addReg(BReg2, getKillRegState(true));
2468     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2469         .addReg(TReg, getKillRegState(true))
2470         .addImm(0);
2471     break;
2472   }
2473   default:
2474     llvm_unreachable("Unexpected jump table encoding");
2475   }
2476 
2477   // Add the jump table entries as successors to the MBB.
2478   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
2479   for (auto &LP : LPadList)
2480     if (SeenMBBs.insert(LP).second)
2481       DispContBB->addSuccessor(LP);
2482 
2483   // N.B. the order the invoke BBs are processed in doesn't matter here.
2484   SmallVector<MachineBasicBlock *, 64> MBBLPads;
2485   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
2486   for (MachineBasicBlock *MBB : InvokeBBs) {
2487     // Remove the landing pad successor from the invoke block and replace it
2488     // with the new dispatch block.
2489     // Keep a copy of Successors since it's modified inside the loop.
2490     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
2491                                                    MBB->succ_rend());
2492     // FIXME: Avoid quadratic complexity.
2493     for (auto MBBS : Successors) {
2494       if (MBBS->isEHPad()) {
2495         MBB->removeSuccessor(MBBS);
2496         MBBLPads.push_back(MBBS);
2497       }
2498     }
2499 
2500     MBB->addSuccessor(DispatchBB);
2501 
2502     // Find the invoke call and mark all of the callee-saved registers as
2503     // 'implicit defined' so that they're spilled.  This prevents code from
2504     // moving instructions to before the EH block, where they will never be
2505     // executed.
2506     for (auto &II : reverse(*MBB)) {
2507       if (!II.isCall())
2508         continue;
2509 
2510       DenseMap<Register, bool> DefRegs;
2511       for (auto &MOp : II.operands())
2512         if (MOp.isReg())
2513           DefRegs[MOp.getReg()] = true;
2514 
2515       MachineInstrBuilder MIB(*MF, &II);
2516       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
2517         Register Reg = SavedRegs[RI];
2518         if (!DefRegs[Reg])
2519           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
2520       }
2521 
2522       break;
2523     }
2524   }
2525 
2526   // Mark all former landing pads as non-landing pads.  The dispatch is the only
2527   // landing pad now.
2528   for (auto &LP : MBBLPads)
2529     LP->setIsEHPad(false);
2530 
2531   // The instruction is gone now.
2532   MI.eraseFromParent();
2533   return BB;
2534 }
2535 
2536 MachineBasicBlock *
2537 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
2538                                               MachineBasicBlock *BB) const {
2539   switch (MI.getOpcode()) {
2540   default:
2541     llvm_unreachable("Unknown Custom Instruction!");
2542   case VE::EH_SjLj_LongJmp:
2543     return emitEHSjLjLongJmp(MI, BB);
2544   case VE::EH_SjLj_SetJmp:
2545     return emitEHSjLjSetJmp(MI, BB);
2546   case VE::EH_SjLj_Setup_Dispatch:
2547     return emitSjLjDispatchBlock(MI, BB);
2548   }
2549 }
2550 
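// Return true if \p User, a user of the truncate node \p N, is known to look
// only at the low 32 bits of its operand, so the explicit truncate can later
// be replaced by an EXTRACT_SUBREG (see combineTRUNCATE below).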
2551 static bool isI32Insn(const SDNode *User, const SDNode *N) {
2552   switch (User->getOpcode()) {
2553   default:
2554     return false;
2555   case ISD::ADD:
2556   case ISD::SUB:
2557   case ISD::MUL:
2558   case ISD::SDIV:
2559   case ISD::UDIV:
2560   case ISD::SETCC:
2561   case ISD::SMIN:
2562   case ISD::SMAX:
2563   case ISD::SHL:
2564   case ISD::SRA:
2565   case ISD::BSWAP:
2566   case ISD::SINT_TO_FP:
2567   case ISD::UINT_TO_FP:
2568   case ISD::BR_CC:
2569   case ISD::BITCAST:
2570   case ISD::ATOMIC_CMP_SWAP:
2571   case ISD::ATOMIC_SWAP:
2572     return true;
2573   case ISD::SRL:
2574     if (N->getOperand(0).getOpcode() != ISD::SRL)
2575       return true;
2576     // (srl (trunc (srl ...))) may be optimized by combining the srls, so
2577     // don't optimize the trunc now.
2578     return false;
2579   case ISD::SELECT_CC:
2580     if (User->getOperand(2).getNode() != N &&
2581         User->getOperand(3).getNode() != N)
2582       return true;
2583     LLVM_FALLTHROUGH;
2584   case ISD::AND:
2585   case ISD::OR:
2586   case ISD::XOR:
2587   case ISD::SELECT:
2588   case ISD::CopyToReg:
2589     // Check all uses of selections, bit operations, and copies.  If all of
2590     // them are safe, optimize the truncate to an extract_subreg.
2591     for (const SDNode *U : User->uses()) {
2592       switch (U->getOpcode()) {
2593       default:
2594         // If the use is an instruction which treats the source operand as i32,
2595         // it is safe to avoid truncate here.
2596         if (isI32Insn(U, N))
2597           continue;
2598         break;
2599       case ISD::ANY_EXTEND:
2600       case ISD::SIGN_EXTEND:
2601       case ISD::ZERO_EXTEND: {
2602         // Special optimization for the combination of ext and trunc.
2603         // For (ext ... (select ... (trunc ...))) it is safe to avoid the
2604         // truncate here, since the truncate only clears the upper 32 bits,
2605         // which are filled by one of the ext instructions later anyway.
2606         assert(N->getValueType(0) == MVT::i32 &&
2607                "found a truncate to a non-i32 integer");
2608         if (User->getOpcode() == ISD::SELECT_CC ||
2609             User->getOpcode() == ISD::SELECT)
2610           continue;
2611         break;
2612       }
2613       }
2614       return false;
2615     }
2616     return true;
2617   }
2618 }
2619 
2620 // Optimize TRUNCATE in DAG combining.  Optimizing it in custom lowering is
2621 // sometimes too early, and optimizing it in DAG pattern matching in
2622 // VEInstrInfo.td is sometimes too late.  So, we do it here.
2623 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
2624                                           DAGCombinerInfo &DCI) const {
2625   assert(N->getOpcode() == ISD::TRUNCATE &&
2626          "Should be called with a TRUNCATE node");
2627 
2628   SelectionDAG &DAG = DCI.DAG;
2629   SDLoc DL(N);
2630   EVT VT = N->getValueType(0);
2631 
2632   // We prefer to do this when all types are legal.
2633   if (!DCI.isAfterLegalizeDAG())
2634     return SDValue();
2635 
2636   // For now, skip combining TRUNCATE if its operand might fold to a constant.
2637   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
2638       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
2639       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
2640     return SDValue();
2641 
2642   // Check all uses of this TRUNCATE.
2643   for (const SDNode *User : N->uses()) {
2644     // Make sure that we're not going to replace TRUNCATE for non-i32
2645     // instructions.
2646     //
2647     // FIXME: Although we could sometimes handle this, and it does occur in
2648     // practice that one of the condition inputs to the select is also one of
2649     // the outputs, we currently can't deal with this.
2650     if (isI32Insn(User, N))
2651       continue;
2652 
2653     return SDValue();
2654   }
2655 
2656   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2657   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
2658                                     N->getOperand(0), SubI32),
2659                  0);
2660 }

SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::TRUNCATE:
    return combineTRUNCATE(N, DCI);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
// VE Inline Assembly Support
//===----------------------------------------------------------------------===//

VETargetLowering::ConstraintType
VETargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'v': // vector registers
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 'r':
      RC = &VE::I64RegClass;
      break;
    case 'v':
      RC = &VE::V64RegClass;
      break;
    }
    return std::make_pair(0U, RC);
  }

  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

//===----------------------------------------------------------------------===//
// VE Target Optimization Support
//===----------------------------------------------------------------------===//

unsigned VETargetLowering::getMinimumJumpTableEntries() const {
  // Require 8 entries for the PIC model to lessen the impact of the PIC
  // address-load instructions.
  if (isJumpTableRelative())
    return 8;

  return TargetLowering::getMinimumJumpTableEntries();
}

bool VETargetLowering::hasAndNot(SDValue Y) const {
  EVT VT = Y.getValueType();

  // VE doesn't have a vector and-not instruction.
  if (VT.isVector())
    return false;

  // VE allows different immediate encodings for X and Y in ~X & Y: only simm7
  // works for X, and only mimm works for Y.  However, this hook is asked
  // whether a single immediate value is OK as both X and Y.  Generating an
  // additional instruction just to materialize an immediate is not worthwhile,
  // since the purpose of this hook is to convert a series of 3 instructions
  // into another series of 3 instructions with better parallelism.  Therefore,
  // we return false for all immediate values for now.
  // FIXME: Change hasAndNot function to have two operands to make it work
  //        correctly with Aurora VE.
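  //
  // Illustrative sketch of the trade-off (this assumes the generic masked
  // merge combine that consults this hook; it is not a VE-specific rewrite):
  //   ((X ^ Y) & M) ^ Y        ; xor -> and -> xor, fully serial
  //   (X & M) | (Y & ~M)       ; the two ANDs can execute in parallel
  // Returning true keeps the second (and-not) form when Y is a register.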
  if (isa<ConstantSDNode>(Y))
    return false;

  // It's ok for generic registers.
  return true;
}

SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  MVT VT = Op.getOperand(0).getSimpleValueType();

  // Special treatment for packed V64 types.
  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
  (void)VT;
  // Example of the lowering (pseudocode):
  //   %packed_v = extractelt %vr, %idx / 2
  //   %v = %packed_v >> (((%idx & 1) ^ 1) * 32)
  //   %res = %v & 0xffffffff

  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  SDLoc DL(Op);
  SDValue Result = Op;
  if (false /* Idx->isConstant() */) {
    // TODO: optimized implementation using constant values
  } else {
    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
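    // HalfIdx = Idx / 2: the 64-bit lane of the vector register that holds
    // the requested 32-bit element.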
    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
    SDValue PackedElt =
        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
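    // Shift = ((Idx & 1) ^ 1) * 32: even indices live in the upper 32 bits of
    // the 64-bit lane, odd indices in the lower 32 bits.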
    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
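    // Move the selected half down to bits 31..0 and mask off everything else.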
    PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
    SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
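    // Return the low 32 bits as an i32 through the sub_i32 subregister.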
    SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
    Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                        MVT::i32, PackedElt, SubI32),
                     0);

    if (Op.getSimpleValueType() == MVT::f32) {
      Result = DAG.getBitcast(MVT::f32, Result);
    } else {
      assert(Op.getSimpleValueType() == MVT::i32);
    }
  }
  return Result;
}

SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
  MVT VT = Op.getOperand(0).getSimpleValueType();

  // Special treatment for packed V64 types.
  assert(VT == MVT::v512i32 || VT == MVT::v512f32);
  (void)VT;
  // Elements of v512i32 and v512f32 start from the upper bits (0..31).  These
  // "upper bits" require `val << 32` from a C implementation's point of view.
  //
  // Example of the lowering (pseudocode):
  //   %packed_elt = extractelt %vr, (%idx >> 1)
  //   %shift = ((%idx & 1) ^ 1) << 5
  //   %packed_elt &= 0xffffffff00000000 >> shift
  //   %packed_elt |= (zext %val) << shift
  //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)

  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Val = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  if (Idx.getSimpleValueType() == MVT::i32)
    Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
  if (Val.getSimpleValueType() == MVT::f32)
    Val = DAG.getBitcast(MVT::i32, Val);
  assert(Val.getSimpleValueType() == MVT::i32);
  Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);

  SDValue Result = Op;
  if (false /* Idx->isConstant() */) {
    // TODO: optimized implementation using constant values
  } else {
    SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
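    // HalfIdx = Idx / 2: the 64-bit lane of the vector register to update.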
    SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
    SDValue PackedElt =
        SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
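    // Shift = ((Idx & 1) ^ 1) * 32: even indices occupy the upper 32 bits of
    // the 64-bit lane, odd indices the lower 32 bits.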
    SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
    SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
    SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
    Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
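    // Clear the destination 32-bit half of the lane, then OR in the new value
    // shifted into position.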
    SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
    Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
    PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
    Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
    PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
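    // Write the merged 64-bit lane back into the vector register at HalfIdx.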
    Result =
        SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
                                   {HalfIdx, PackedElt, Vec}),
                0);
  }
  return Result;
}